Detect Microdroid hangup during boot
A hangup in Microdroid is defined as a state where the payload hasn't been started for a long time. In that case AVF kills the VM and the death is reported via the onDied callback. In addition, this change modifies the client-facing Java and Rust libraries to expose death reasons that were added earlier but had not been surfaced yet. Bug: 222228861 Test: I couldn't write a test for this because it was impossible to intentionally trigger the hang from a test. Instead, I confirmed that `onDied` is called and the VM is eventually killed after I changed the timeout value to a very small number (e.g. 100ms). Change-Id: I53f232d0b609e6e8a429d996c7d6fdd0b37e7b4c
This commit is contained in:
parent
71215c64cc
commit
e6ed0f92f4
|
@ -439,6 +439,7 @@ public class VirtualMachine {
|
|||
}
|
||||
@Override
|
||||
public void onDied(int cid, int reason) {
|
||||
// TODO(b/236811123) translate `reason` into stable reason numbers
|
||||
service.asBinder().unlinkToDeath(deathRecipient, 0);
|
||||
if (onDiedCalled.compareAndSet(false, true)) {
|
||||
executeCallback((cb) -> cb.onDied(VirtualMachine.this, reason));
|
||||
|
|
|
@ -66,7 +66,8 @@ public interface VirtualMachineCallback {
|
|||
DEATH_REASON_SHUTDOWN,
|
||||
DEATH_REASON_ERROR,
|
||||
DEATH_REASON_REBOOT,
|
||||
DEATH_REASON_CRASH
|
||||
DEATH_REASON_CRASH,
|
||||
DEATH_REASON_HANGUP,
|
||||
})
|
||||
@interface DeathReason {}
|
||||
|
||||
|
@ -97,6 +98,36 @@ public interface VirtualMachineCallback {
|
|||
/** The VM or crosvm crashed. */
|
||||
int DEATH_REASON_CRASH = 6;
|
||||
|
||||
/** The pVM firmware failed to verify the VM because the public key doesn't match. */
|
||||
int DEATH_REASON_PVM_FIRMWARE_PUBLIC_KEY_MISMATCH = 7;
|
||||
|
||||
/** The pVM firmware failed to verify the VM because the instance image changed. */
|
||||
int DEATH_REASON_PVM_FIRMWARE_INSTANCE_IMAGE_CHANGED = 8;
|
||||
|
||||
/** The bootloader failed to verify the VM because the public key doesn't match. */
|
||||
int DEATH_REASON_BOOTLOADER_PUBLIC_KEY_MISMATCH = 9;
|
||||
|
||||
/** The bootloader failed to verify the VM because the instance image changed. */
|
||||
int DEATH_REASON_BOOTLOADER_INSTANCE_IMAGE_CHANGED = 10;
|
||||
|
||||
/** The microdroid failed to connect to VirtualizationService's RPC server. */
|
||||
int DEATH_REASON_MICRODROID_FAILED_TO_CONNECT_TO_VIRTUALIZATION_SERVICE = 11;
|
||||
|
||||
/** The payload for microdroid is changed. */
|
||||
int DEATH_REASON_MICRODROID_PAYLOAD_HAS_CHANGED = 12;
|
||||
|
||||
/** The microdroid failed to verify given payload APK. */
|
||||
int DEATH_REASON_MICRODROID_PAYLOAD_VERIFICATION_FAILED = 13;
|
||||
|
||||
/** The VM config for microdroid is invalid (e.g. missing tasks). */
|
||||
int DEATH_REASON_MICRODROID_INVALID_PAYLOAD_CONFIG = 14;
|
||||
|
||||
/** There was a runtime error while running microdroid manager. */
|
||||
int DEATH_REASON_MICRODROID_UNKNOWN_RUNTIME_ERROR = 15;
|
||||
|
||||
/** The VM was killed due to hangup. */
|
||||
int DEATH_REASON_HANGUP = 16;
|
||||
|
||||
/** Called when the payload starts in the VM. */
|
||||
void onPayloadStarted(@NonNull VirtualMachine vm, @Nullable ParcelFileDescriptor stream);
|
||||
|
||||
|
|
|
@ -31,6 +31,7 @@ rust_defaults {
|
|||
"libcommand_fds",
|
||||
"libdisk",
|
||||
"libidsig",
|
||||
"liblazy_static",
|
||||
"liblog_rust",
|
||||
"libmicrodroid_metadata",
|
||||
"libmicrodroid_payload_config",
|
||||
|
|
|
@ -52,4 +52,6 @@ enum DeathReason {
|
|||
MICRODROID_INVALID_PAYLOAD_CONFIG = 14,
|
||||
/** There was a runtime error while running microdroid manager. */
|
||||
MICRODROID_UNKNOWN_RUNTIME_ERROR = 15,
|
||||
/** The VM was killed due to hangup. */
|
||||
HANGUP = 16,
|
||||
}
|
||||
|
|
|
@ -479,6 +479,7 @@ impl VirtualizationService {
|
|||
log_fd,
|
||||
indirect_files,
|
||||
platform_version: parse_platform_version_req(&config.platformVersion)?,
|
||||
detect_hangup: is_app_config,
|
||||
};
|
||||
let instance = Arc::new(
|
||||
VmInstance::new(
|
||||
|
|
|
@ -18,10 +18,13 @@ use crate::aidl::VirtualMachineCallbacks;
|
|||
use crate::Cid;
|
||||
use anyhow::{bail, Error};
|
||||
use command_fds::CommandFdExt;
|
||||
use lazy_static::lazy_static;
|
||||
use log::{debug, error, info};
|
||||
use semver::{Version, VersionReq};
|
||||
use nix::{fcntl::OFlag, unistd::pipe2};
|
||||
use rustutils::system_properties;
|
||||
use shared_child::SharedChild;
|
||||
use std::borrow::Cow;
|
||||
use std::fs::{remove_dir_all, File};
|
||||
use std::io::{self, Read};
|
||||
use std::mem;
|
||||
|
@ -29,7 +32,8 @@ use std::num::NonZeroU32;
|
|||
use std::os::unix::io::{AsRawFd, RawFd, FromRawFd};
|
||||
use std::path::PathBuf;
|
||||
use std::process::{Command, ExitStatus};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::sync::{Arc, Condvar, Mutex};
|
||||
use std::time::Duration;
|
||||
use std::thread;
|
||||
use vsock::VsockStream;
|
||||
use android_system_virtualizationservice::aidl::android::system::virtualizationservice::DeathReason::DeathReason;
|
||||
|
@ -51,6 +55,25 @@ const CROSVM_REBOOT_STATUS: i32 = 32;
|
|||
/// The exit status which crosvm returns when it crashes due to an error.
|
||||
const CROSVM_CRASH_STATUS: i32 = 33;
|
||||
|
||||
fn is_nested_virtualization() -> bool {
|
||||
// Check if we are running on vsoc as a proxy for this.
|
||||
matches!(
|
||||
system_properties::read("ro.build.product").unwrap().as_deref(),
|
||||
Some("vsoc_x86_64") | Some("vsoc_x86")
|
||||
)
|
||||
}
|
||||
|
||||
lazy_static! {
    /// Maximum time the VM is given to reach the Started state. If it does not get there
    /// within this amount of time, a hang-up error is triggered.
    static ref BOOT_HANGUP_TIMEOUT: Duration = {
        // Nested virtualization is slow, so it gets a longer timeout than the regular case.
        let timeout_secs = if is_nested_virtualization() { 100 } else { 10 };
        Duration::from_secs(timeout_secs)
    };
}
|
||||
|
||||
/// Configuration for a VM to run with crosvm.
|
||||
#[derive(Debug)]
|
||||
pub struct CrosvmConfig {
|
||||
|
@ -69,6 +92,7 @@ pub struct CrosvmConfig {
|
|||
pub log_fd: Option<File>,
|
||||
pub indirect_files: Vec<File>,
|
||||
pub platform_version: VersionReq,
|
||||
pub detect_hangup: bool,
|
||||
}
|
||||
|
||||
/// A disk image to pass to crosvm for a VM.
|
||||
|
@ -116,6 +140,7 @@ impl VmState {
|
|||
fn start(&mut self, instance: Arc<VmInstance>) -> Result<(), Error> {
|
||||
let state = mem::replace(self, VmState::Failed);
|
||||
if let VmState::NotStarted { config } = state {
|
||||
let detect_hangup = config.detect_hangup;
|
||||
let (failure_pipe_read, failure_pipe_write) = create_pipe()?;
|
||||
|
||||
// If this fails and returns an error, `self` will be left in the `Failed` state.
|
||||
|
@ -123,7 +148,7 @@ impl VmState {
|
|||
|
||||
let child_clone = child.clone();
|
||||
thread::spawn(move || {
|
||||
instance.monitor(child_clone, failure_pipe_read);
|
||||
instance.monitor(child_clone, failure_pipe_read, detect_hangup);
|
||||
});
|
||||
|
||||
// If it started correctly, update the state.
|
||||
|
@ -162,6 +187,8 @@ pub struct VmInstance {
|
|||
pub vm_service: Mutex<Option<Strong<dyn IVirtualMachineService>>>,
|
||||
/// The latest lifecycle state which the payload reported itself to be in.
|
||||
payload_state: Mutex<PayloadState>,
|
||||
/// Represents the condition that payload_state becomes Started
|
||||
payload_started: Condvar,
|
||||
}
|
||||
|
||||
impl VmInstance {
|
||||
|
@ -188,6 +215,7 @@ impl VmInstance {
|
|||
stream: Mutex::new(None),
|
||||
vm_service: Mutex::new(None),
|
||||
payload_state: Mutex::new(PayloadState::Starting),
|
||||
payload_started: Condvar::new(),
|
||||
})
|
||||
}
|
||||
|
||||
|
@ -198,11 +226,38 @@ impl VmInstance {
|
|||
}
|
||||
|
||||
/// Waits for the crosvm child process to finish, then marks the VM as no longer running and
|
||||
/// calls any callbacks.
|
||||
/// calls any callbacks. If `detect_hangup` is optionally set to true, waits for the start of
|
||||
/// payload in the crosvm process. If that doesn't occur within a BOOT_HANGUP_TIMEOUT, declare
|
||||
/// it as a hangup and forcibly kill the process.
|
||||
///
|
||||
/// This takes a separate reference to the `SharedChild` rather than using the one in
|
||||
/// `self.vm_state` to avoid holding the lock on `vm_state` while it is running.
|
||||
fn monitor(&self, child: Arc<SharedChild>, mut failure_pipe_read: File) {
|
||||
fn monitor(&self, child: Arc<SharedChild>, mut failure_pipe_read: File, detect_hangup: bool) {
|
||||
let hungup = if detect_hangup {
|
||||
// Wait until payload is started or the crosvm process terminates. The checking of the
|
||||
// child process is needed because otherwise we will be waiting for a condition that
|
||||
// will never be satisfied (because crosvm is the one who can make the condition true).
|
||||
let state = self.payload_state.lock().unwrap();
|
||||
let (_, result) = self
|
||||
.payload_started
|
||||
.wait_timeout_while(state, *BOOT_HANGUP_TIMEOUT, |state| {
|
||||
*state < PayloadState::Started && child.try_wait().is_ok()
|
||||
})
|
||||
.unwrap();
|
||||
if result.timed_out() {
|
||||
error!(
|
||||
"Microdroid failed to start payload within {} secs timeout. Shutting down",
|
||||
BOOT_HANGUP_TIMEOUT.as_secs()
|
||||
);
|
||||
self.kill();
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
let result = child.wait();
|
||||
match &result {
|
||||
Err(e) => error!("Error waiting for crosvm({}) instance to die: {}", child.id(), e),
|
||||
|
@ -214,14 +269,17 @@ impl VmInstance {
|
|||
// Ensure that the mutex is released before calling the callbacks.
|
||||
drop(vm_state);
|
||||
|
||||
let mut failure_string = String::new();
|
||||
let failure_read_result = failure_pipe_read.read_to_string(&mut failure_string);
|
||||
if let Err(e) = &failure_read_result {
|
||||
error!("Error reading VM failure reason from pipe: {}", e);
|
||||
}
|
||||
if !failure_string.is_empty() {
|
||||
info!("VM returned failure reason '{}'", failure_string);
|
||||
}
|
||||
let failure_string = if hungup {
|
||||
Cow::from("HANGUP")
|
||||
} else {
|
||||
let mut s = String::new();
|
||||
match failure_pipe_read.read_to_string(&mut s) {
|
||||
Err(e) => error!("Error reading VM failure reason from pipe: {}", e),
|
||||
Ok(len) if len > 0 => info!("VM returned failure reason '{}'", &s),
|
||||
_ => (),
|
||||
};
|
||||
Cow::from(s)
|
||||
};
|
||||
|
||||
self.callbacks.callback_on_died(self.cid, death_reason(&result, &failure_string));
|
||||
|
||||
|
@ -243,6 +301,9 @@ impl VmInstance {
|
|||
// the other direction.
|
||||
if new_state > *state_locked {
|
||||
*state_locked = new_state;
|
||||
if new_state >= PayloadState::Started {
|
||||
self.payload_started.notify_all();
|
||||
}
|
||||
Ok(())
|
||||
} else {
|
||||
bail!("Invalid payload state transition from {:?} to {:?}", *state_locked, new_state)
|
||||
|
@ -289,6 +350,7 @@ fn death_reason(result: &Result<ExitStatus, io::Error>, failure_reason: &str) ->
|
|||
"MICRODROID_UNKNOWN_RUNTIME_ERROR" => {
|
||||
return DeathReason::MICRODROID_UNKNOWN_RUNTIME_ERROR
|
||||
}
|
||||
"HANGUP" => return DeathReason::HANGUP,
|
||||
_ => {}
|
||||
}
|
||||
match status.code() {
|
||||
|
|
|
@ -44,6 +44,18 @@ pub enum DeathReason {
|
|||
BootloaderPublicKeyMismatch,
|
||||
/// The bootloader failed to verify the VM because the instance image changed.
|
||||
BootloaderInstanceImageChanged,
|
||||
/// The microdroid failed to connect to VirtualizationService's RPC server.
|
||||
MicrodroidFailedToConnectToVirtualizationService,
|
||||
/// The payload for microdroid is changed.
|
||||
MicrodroidPayloadHasChanged,
|
||||
/// The microdroid failed to verify given payload APK.
|
||||
MicrodroidPayloadVerificationFailed,
|
||||
/// The VM config for microdroid is invalid (e.g. missing tasks).
|
||||
MicrodroidInvalidPayloadConfig,
|
||||
/// There was a runtime error while running microdroid manager.
|
||||
MicrodroidUnknownRuntimeError,
|
||||
/// The VM was killed due to hangup.
|
||||
Hangup,
|
||||
/// VirtualizationService sent a death reason which was not recognised by the client library.
|
||||
Unrecognised(AidlDeathReason),
|
||||
}
|
||||
|
@ -66,6 +78,20 @@ impl From<AidlDeathReason> for DeathReason {
|
|||
AidlDeathReason::BOOTLOADER_INSTANCE_IMAGE_CHANGED => {
|
||||
Self::BootloaderInstanceImageChanged
|
||||
}
|
||||
AidlDeathReason::MICRODROID_FAILED_TO_CONNECT_TO_VIRTUALIZATION_SERVICE => {
|
||||
Self::MicrodroidFailedToConnectToVirtualizationService
|
||||
}
|
||||
AidlDeathReason::MICRODROID_PAYLOAD_HAS_CHANGED => Self::MicrodroidPayloadHasChanged,
|
||||
AidlDeathReason::MICRODROID_PAYLOAD_VERIFICATION_FAILED => {
|
||||
Self::MicrodroidPayloadVerificationFailed
|
||||
}
|
||||
AidlDeathReason::MICRODROID_INVALID_PAYLOAD_CONFIG => {
|
||||
Self::MicrodroidInvalidPayloadConfig
|
||||
}
|
||||
AidlDeathReason::MICRODROID_UNKNOWN_RUNTIME_ERROR => {
|
||||
Self::MicrodroidUnknownRuntimeError
|
||||
}
|
||||
AidlDeathReason::HANGUP => Self::Hangup,
|
||||
_ => Self::Unrecognised(reason),
|
||||
}
|
||||
}
|
||||
|
@ -94,6 +120,20 @@ impl Display for DeathReason {
|
|||
Self::BootloaderInstanceImageChanged => {
|
||||
"Bootloader failed to verify the VM because the instance image changed."
|
||||
}
|
||||
Self::MicrodroidFailedToConnectToVirtualizationService => {
|
||||
"The microdroid failed to connect to VirtualizationService's RPC server."
|
||||
}
|
||||
Self::MicrodroidPayloadHasChanged => "The payload for microdroid is changed.",
|
||||
Self::MicrodroidPayloadVerificationFailed => {
|
||||
"The microdroid failed to verify given payload APK."
|
||||
}
|
||||
Self::MicrodroidInvalidPayloadConfig => {
|
||||
"The VM config for microdroid is invalid (e.g. missing tasks)."
|
||||
}
|
||||
Self::MicrodroidUnknownRuntimeError => {
|
||||
"There was a runtime error while running microdroid manager."
|
||||
}
|
||||
Self::Hangup => "VM hangup.",
|
||||
Self::Unrecognised(reason) => {
|
||||
return write!(f, "Unrecognised death reason {:?}.", reason);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue