Detect Microdroid hangup during boot

A hangup in Microdroid is defined as a state where the payload hasn't
started for a long time. In that case, AVF kills the VM and the death is
reported via the onDied callback.

In addition, the client-facing Java and Rust libraries are modified to
expose death reasons that were added earlier but hadn't been surfaced yet.

Bug: 222228861
Test: I couldn't write an automated test for this because a hang cannot
be triggered intentionally from a test. Instead, I confirmed that `onDied`
is called and the VM is eventually killed when the timeout value is
edited to a very small number (e.g. 100ms).

Change-Id: I53f232d0b609e6e8a429d996c7d6fdd0b37e7b4c
This commit is contained in:
Jiyong Park 2022-06-22 00:13:00 +09:00
parent 71215c64cc
commit e6ed0f92f4
7 changed files with 151 additions and 13 deletions

View File

@ -439,6 +439,7 @@ public class VirtualMachine {
}
@Override
public void onDied(int cid, int reason) {
// TODO(b/236811123) translate `reason` into stable reason numbers
service.asBinder().unlinkToDeath(deathRecipient, 0);
if (onDiedCalled.compareAndSet(false, true)) {
executeCallback((cb) -> cb.onDied(VirtualMachine.this, reason));

View File

@ -66,7 +66,8 @@ public interface VirtualMachineCallback {
DEATH_REASON_SHUTDOWN,
DEATH_REASON_ERROR,
DEATH_REASON_REBOOT,
DEATH_REASON_CRASH
DEATH_REASON_CRASH,
DEATH_REASON_HANGUP,
})
@interface DeathReason {}
@ -97,6 +98,36 @@ public interface VirtualMachineCallback {
/** The VM or crosvm crashed. */
int DEATH_REASON_CRASH = 6;
/** The pVM firmware failed to verify the VM because the public key doesn't match. */
int DEATH_REASON_PVM_FIRMWARE_PUBLIC_KEY_MISMATCH = 7;
/** The pVM firmware failed to verify the VM because the instance image changed. */
int DEATH_REASON_PVM_FIRMWARE_INSTANCE_IMAGE_CHANGED = 8;
/** The bootloader failed to verify the VM because the public key doesn't match. */
int DEATH_REASON_BOOTLOADER_PUBLIC_KEY_MISMATCH = 9;
/** The bootloader failed to verify the VM because the instance image changed. */
int DEATH_REASON_BOOTLOADER_INSTANCE_IMAGE_CHANGED = 10;
/** The microdroid failed to connect to VirtualizationService's RPC server. */
int DEATH_REASON_MICRODROID_FAILED_TO_CONNECT_TO_VIRTUALIZATION_SERVICE = 11;
/** The payload for microdroid is changed. */
int DEATH_REASON_MICRODROID_PAYLOAD_HAS_CHANGED = 12;
/** The microdroid failed to verify given payload APK. */
int DEATH_REASON_MICRODROID_PAYLOAD_VERIFICATION_FAILED = 13;
/** The VM config for microdroid is invalid (e.g. missing tasks). */
int DEATH_REASON_MICRODROID_INVALID_PAYLOAD_CONFIG = 14;
/** There was a runtime error while running microdroid manager. */
int DEATH_REASON_MICRODROID_UNKNOWN_RUNTIME_ERROR = 15;
/** The VM was killed due to a hangup. */
int DEATH_REASON_HANGUP = 16;
/** Called when the payload starts in the VM. */
void onPayloadStarted(@NonNull VirtualMachine vm, @Nullable ParcelFileDescriptor stream);

View File

@ -31,6 +31,7 @@ rust_defaults {
"libcommand_fds",
"libdisk",
"libidsig",
"liblazy_static",
"liblog_rust",
"libmicrodroid_metadata",
"libmicrodroid_payload_config",

View File

@ -52,4 +52,6 @@ enum DeathReason {
MICRODROID_INVALID_PAYLOAD_CONFIG = 14,
/** There was a runtime error while running microdroid manager. */
MICRODROID_UNKNOWN_RUNTIME_ERROR = 15,
/** The VM was killed due to a hangup. */
HANGUP = 16,
}

View File

@ -479,6 +479,7 @@ impl VirtualizationService {
log_fd,
indirect_files,
platform_version: parse_platform_version_req(&config.platformVersion)?,
detect_hangup: is_app_config,
};
let instance = Arc::new(
VmInstance::new(

View File

@ -18,10 +18,13 @@ use crate::aidl::VirtualMachineCallbacks;
use crate::Cid;
use anyhow::{bail, Error};
use command_fds::CommandFdExt;
use lazy_static::lazy_static;
use log::{debug, error, info};
use semver::{Version, VersionReq};
use nix::{fcntl::OFlag, unistd::pipe2};
use rustutils::system_properties;
use shared_child::SharedChild;
use std::borrow::Cow;
use std::fs::{remove_dir_all, File};
use std::io::{self, Read};
use std::mem;
@ -29,7 +32,8 @@ use std::num::NonZeroU32;
use std::os::unix::io::{AsRawFd, RawFd, FromRawFd};
use std::path::PathBuf;
use std::process::{Command, ExitStatus};
use std::sync::{Arc, Mutex};
use std::sync::{Arc, Condvar, Mutex};
use std::time::Duration;
use std::thread;
use vsock::VsockStream;
use android_system_virtualizationservice::aidl::android::system::virtualizationservice::DeathReason::DeathReason;
@ -51,6 +55,25 @@ const CROSVM_REBOOT_STATUS: i32 = 32;
/// The exit status which crosvm returns when it crashes due to an error.
const CROSVM_CRASH_STATUS: i32 = 33;
fn is_nested_virtualization() -> bool {
// Check if we are running on vsoc as a proxy for this.
matches!(
system_properties::read("ro.build.product").unwrap().as_deref(),
Some("vsoc_x86_64") | Some("vsoc_x86")
)
}
lazy_static! {
/// If the VM doesn't move to the Started state within this amount of time, a hang-up error is
/// triggered.
///
/// The value is computed once, on first access, based on `is_nested_virtualization()`.
static ref BOOT_HANGUP_TIMEOUT: Duration = if is_nested_virtualization() {
// Nested virtualization is slow, so we need a longer timeout.
Duration::from_secs(100)
} else {
// Timeout used when not running under nested virtualization.
Duration::from_secs(10)
};
}
/// Configuration for a VM to run with crosvm.
#[derive(Debug)]
pub struct CrosvmConfig {
@ -69,6 +92,7 @@ pub struct CrosvmConfig {
pub log_fd: Option<File>,
pub indirect_files: Vec<File>,
pub platform_version: VersionReq,
pub detect_hangup: bool,
}
/// A disk image to pass to crosvm for a VM.
@ -116,6 +140,7 @@ impl VmState {
fn start(&mut self, instance: Arc<VmInstance>) -> Result<(), Error> {
let state = mem::replace(self, VmState::Failed);
if let VmState::NotStarted { config } = state {
let detect_hangup = config.detect_hangup;
let (failure_pipe_read, failure_pipe_write) = create_pipe()?;
// If this fails and returns an error, `self` will be left in the `Failed` state.
@ -123,7 +148,7 @@ impl VmState {
let child_clone = child.clone();
thread::spawn(move || {
instance.monitor(child_clone, failure_pipe_read);
instance.monitor(child_clone, failure_pipe_read, detect_hangup);
});
// If it started correctly, update the state.
@ -162,6 +187,8 @@ pub struct VmInstance {
pub vm_service: Mutex<Option<Strong<dyn IVirtualMachineService>>>,
/// The latest lifecycle state which the payload reported itself to be in.
payload_state: Mutex<PayloadState>,
/// Represents the condition that payload_state becomes Started
payload_started: Condvar,
}
impl VmInstance {
@ -188,6 +215,7 @@ impl VmInstance {
stream: Mutex::new(None),
vm_service: Mutex::new(None),
payload_state: Mutex::new(PayloadState::Starting),
payload_started: Condvar::new(),
})
}
@ -198,11 +226,38 @@ impl VmInstance {
}
/// Waits for the crosvm child process to finish, then marks the VM as no longer running and
/// calls any callbacks.
/// calls any callbacks. If `detect_hangup` is true, first waits for the payload to start in the
/// VM. If that doesn't happen within BOOT_HANGUP_TIMEOUT, declares it a hangup and forcibly
/// kills the crosvm process.
///
/// This takes a separate reference to the `SharedChild` rather than using the one in
/// `self.vm_state` to avoid holding the lock on `vm_state` while it is running.
fn monitor(&self, child: Arc<SharedChild>, mut failure_pipe_read: File) {
fn monitor(&self, child: Arc<SharedChild>, mut failure_pipe_read: File, detect_hangup: bool) {
let hungup = if detect_hangup {
    // Wait until the payload is started or the crosvm process terminates. The checking of the
    // child process is needed because otherwise we will be waiting for a condition that
    // will never be satisfied (because crosvm is the one who can make the condition true).
    let state = self.payload_state.lock().unwrap();
    let (_, result) = self
        .payload_started
        .wait_timeout_while(state, *BOOT_HANGUP_TIMEOUT, |state| {
            // `try_wait()` returns `Ok(None)` only while the child is still running. Both
            // `Ok(Some(_))` (already exited) and `Err(_)` mean there is nothing left to wait
            // for, so `is_ok()` would wrongly keep waiting after crosvm has died.
            // NOTE(review): nothing notifies the condvar when crosvm exits, so the exit is
            // only observed when the predicate is re-evaluated (wakeup or timeout).
            *state < PayloadState::Started && matches!(child.try_wait(), Ok(None))
        })
        .unwrap();
    // Only treat the timeout as a hangup if crosvm is still alive; if it has already exited,
    // the real death reason should be derived from its exit status and failure pipe instead.
    if result.timed_out() && matches!(child.try_wait(), Ok(None)) {
        error!(
            "Microdroid failed to start payload within {} secs timeout. Shutting down",
            BOOT_HANGUP_TIMEOUT.as_secs()
        );
        self.kill();
        true
    } else {
        false
    }
} else {
    false
};
let result = child.wait();
match &result {
Err(e) => error!("Error waiting for crosvm({}) instance to die: {}", child.id(), e),
@ -214,14 +269,17 @@ impl VmInstance {
// Ensure that the mutex is released before calling the callbacks.
drop(vm_state);
let mut failure_string = String::new();
let failure_read_result = failure_pipe_read.read_to_string(&mut failure_string);
if let Err(e) = &failure_read_result {
error!("Error reading VM failure reason from pipe: {}", e);
}
if !failure_string.is_empty() {
info!("VM returned failure reason '{}'", failure_string);
}
let failure_string = if hungup {
Cow::from("HANGUP")
} else {
let mut s = String::new();
match failure_pipe_read.read_to_string(&mut s) {
Err(e) => error!("Error reading VM failure reason from pipe: {}", e),
Ok(len) if len > 0 => info!("VM returned failure reason '{}'", &s),
_ => (),
};
Cow::from(s)
};
self.callbacks.callback_on_died(self.cid, death_reason(&result, &failure_string));
@ -243,6 +301,9 @@ impl VmInstance {
// the other direction.
if new_state > *state_locked {
*state_locked = new_state;
if new_state >= PayloadState::Started {
self.payload_started.notify_all();
}
Ok(())
} else {
bail!("Invalid payload state transition from {:?} to {:?}", *state_locked, new_state)
@ -289,6 +350,7 @@ fn death_reason(result: &Result<ExitStatus, io::Error>, failure_reason: &str) ->
"MICRODROID_UNKNOWN_RUNTIME_ERROR" => {
return DeathReason::MICRODROID_UNKNOWN_RUNTIME_ERROR
}
"HANGUP" => return DeathReason::HANGUP,
_ => {}
}
match status.code() {

View File

@ -44,6 +44,18 @@ pub enum DeathReason {
BootloaderPublicKeyMismatch,
/// The bootloader failed to verify the VM because the instance image changed.
BootloaderInstanceImageChanged,
/// The microdroid failed to connect to VirtualizationService's RPC server.
MicrodroidFailedToConnectToVirtualizationService,
/// The payload for microdroid is changed.
MicrodroidPayloadHasChanged,
/// The microdroid failed to verify given payload APK.
MicrodroidPayloadVerificationFailed,
/// The VM config for microdroid is invalid (e.g. missing tasks).
MicrodroidInvalidPayloadConfig,
/// There was a runtime error while running microdroid manager.
MicrodroidUnknownRuntimeError,
/// The VM was killed due to hangup.
Hangup,
/// VirtualizationService sent a death reason which was not recognised by the client library.
Unrecognised(AidlDeathReason),
}
@ -66,6 +78,20 @@ impl From<AidlDeathReason> for DeathReason {
AidlDeathReason::BOOTLOADER_INSTANCE_IMAGE_CHANGED => {
Self::BootloaderInstanceImageChanged
}
AidlDeathReason::MICRODROID_FAILED_TO_CONNECT_TO_VIRTUALIZATION_SERVICE => {
Self::MicrodroidFailedToConnectToVirtualizationService
}
AidlDeathReason::MICRODROID_PAYLOAD_HAS_CHANGED => Self::MicrodroidPayloadHasChanged,
AidlDeathReason::MICRODROID_PAYLOAD_VERIFICATION_FAILED => {
Self::MicrodroidPayloadVerificationFailed
}
AidlDeathReason::MICRODROID_INVALID_PAYLOAD_CONFIG => {
Self::MicrodroidInvalidPayloadConfig
}
AidlDeathReason::MICRODROID_UNKNOWN_RUNTIME_ERROR => {
Self::MicrodroidUnknownRuntimeError
}
AidlDeathReason::HANGUP => Self::Hangup,
_ => Self::Unrecognised(reason),
}
}
@ -94,6 +120,20 @@ impl Display for DeathReason {
Self::BootloaderInstanceImageChanged => {
"Bootloader failed to verify the VM because the instance image changed."
}
Self::MicrodroidFailedToConnectToVirtualizationService => {
"The microdroid failed to connect to VirtualizationService's RPC server."
}
Self::MicrodroidPayloadHasChanged => "The payload for microdroid is changed.",
Self::MicrodroidPayloadVerificationFailed => {
"The microdroid failed to verify given payload APK."
}
Self::MicrodroidInvalidPayloadConfig => {
"The VM config for microdroid is invalid (e.g. missing tasks)."
}
Self::MicrodroidUnknownRuntimeError => {
"There was a runtime error while running microdroid manager."
}
Self::Hangup => "VM hangup.",
Self::Unrecognised(reason) => {
return write!(f, "Unrecognised death reason {:?}.", reason);
}