Wait for crosvm in a separate thread, and keep track of when it dies.

This lets us tell whether a VM is still running or has finished. Also
add a callback so clients can be notified when crosvm dies.

Bug: 171277638
Test: Ran on VIM3L
Test: atest VirtualizationTestCases
Change-Id: I52c1625af45cfcfe7aa0be465ea08f427ec5bc43
This commit is contained in:
Andrew Walbran 2021-03-12 17:05:20 +00:00
parent 8a13d2555b
commit dae07167f6
7 changed files with 204 additions and 50 deletions

View File

@ -14,6 +14,7 @@ rust_binary {
"liblog_rust",
"libserde_json",
"libserde",
"libshared_child",
"libanyhow",
],
apex_available: ["com.android.virt"],

View File

@ -15,7 +15,18 @@
*/
package android.system.virtmanager;
import android.system.virtmanager.IVirtualMachineCallback;
/**
* A handle to a single VM. Clients use it to query the VM's CID and running state, and to register
* for notifications about changes to that state.
*/
interface IVirtualMachine {
/** Get the CID allocated to the VM. */
int getCid();
/** Returns true if the VM is still running, or false if it has exited for any reason. */
boolean isRunning();
/**
* Register a Binder object to get callbacks when the state of the VM changes, such as if it
* dies.
*
* @param callback the object whose methods are invoked when the VM's state changes.
*/
void registerCallback(IVirtualMachineCallback callback);
}

View File

@ -0,0 +1,32 @@
/*
* Copyright 2021 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package android.system.virtmanager;
import android.system.virtmanager.IVirtualMachine;
/**
* An object which a client may register with the Virt Manager to get callbacks about the state of
* a particular VM.
*
* The interface is declared `oneway`, so every method returns void.
*/
oneway interface IVirtualMachineCallback {
/**
* Called when the VM dies.
*
* Note that this will not be called if the Virt Manager itself dies, so you should also use
* `link_to_death` to handle that.
*
* @param cid the CID of the VM which died.
*/
void onDied(int cid);
}

View File

@ -31,4 +31,7 @@ parcelable VirtualMachineDebugInfo {
* the PID may have been reused for a different process, so this should not be trusted.
*/
int requesterPid;
/** Whether the VM is still running. */
boolean running;
}

View File

@ -21,11 +21,12 @@ use android_system_virtmanager::aidl::android::system::virtmanager::IVirtManager
use android_system_virtmanager::aidl::android::system::virtmanager::IVirtualMachine::{
BnVirtualMachine, IVirtualMachine,
};
use android_system_virtmanager::aidl::android::system::virtmanager::IVirtualMachineCallback::IVirtualMachineCallback;
use android_system_virtmanager::aidl::android::system::virtmanager::VirtualMachineDebugInfo::VirtualMachineDebugInfo;
use android_system_virtmanager::binder::{
self, Interface, ParcelFileDescriptor, StatusCode, Strong, ThreadState,
};
use log::error;
use log::{debug, error};
use std::ffi::CStr;
use std::fs::File;
use std::sync::{Arc, Mutex, Weak};
@ -54,7 +55,6 @@ impl IVirtManager for VirtManager {
log_fd: Option<&ParcelFileDescriptor>,
) -> binder::Result<Strong<dyn IVirtualMachine>> {
let state = &mut *self.state.lock().unwrap();
let cid = state.next_cid;
let log_fd = log_fd
.map(|fd| fd.as_ref().try_clone().map_err(|_| StatusCode::UNKNOWN_ERROR))
.transpose()?;
@ -69,16 +69,9 @@ impl IVirtManager for VirtManager {
})
});
let requester_pid = ThreadState::get_calling_pid();
let instance = Arc::new(start_vm(
config_fd.as_ref(),
cid,
log_fd,
requester_uid,
requester_sid,
requester_pid,
)?);
// TODO(qwandor): keep track of which CIDs are currently in use so that we can reuse them.
state.next_cid = state.next_cid.checked_add(1).ok_or(StatusCode::UNKNOWN_ERROR)?;
let cid = state.allocate_cid()?;
let instance =
start_vm(config_fd.as_ref(), cid, log_fd, requester_uid, requester_sid, requester_pid)?;
state.add_vm(Arc::downgrade(&instance));
Ok(VirtualMachine::create(instance))
}
@ -99,6 +92,7 @@ impl IVirtManager for VirtManager {
requesterUid: vm.requester_uid as i32,
requesterSid: vm.requester_sid.clone(),
requesterPid: vm.requester_pid,
running: vm.running(),
})
.collect();
Ok(cids)
@ -155,6 +149,48 @@ impl IVirtualMachine for VirtualMachine {
fn getCid(&self) -> binder::Result<i32> {
Ok(self.instance.cid as i32)
}
/// Reports whether the crosvm instance backing this VM is still running.
fn isRunning(&self) -> binder::Result<bool> {
    let still_running = self.instance.running();
    Ok(still_running)
}
/// Adds `callback` to the set of callbacks held by this VM's instance.
///
/// The callback is stored unconditionally; no check is made that the VM is still running.
fn registerCallback(
    &self,
    callback: &Strong<dyn IVirtualMachineCallback>,
) -> binder::Result<()> {
    // TODO: Should this give an error if the VM is already dead?
    let registered = callback.clone();
    self.instance.callbacks.add(registered);
    Ok(())
}
}
/// Kills the VM instance when the `VirtualMachine` object is dropped.
impl Drop for VirtualMachine {
fn drop(&mut self) {
debug!("Dropping {:?}", self);
// Once this object goes away the VM it refers to is killed along with it.
self.instance.kill();
}
}
/// The collection of client Binders to notify about events on a VM, such as its death.
///
/// The list lives behind a `Mutex` so that it can be extended and iterated through a shared
/// reference.
#[derive(Debug, Default)]
pub struct VirtualMachineCallbacks(Mutex<Vec<Strong<dyn IVirtualMachineCallback>>>);

impl VirtualMachineCallbacks {
    /// Tell every registered callback that the VM with the given `cid` has died.
    ///
    /// A failure to invoke one callback is logged and does not prevent the remaining callbacks
    /// from being invoked. The list stays locked for the duration of the calls.
    pub fn callback_on_died(&self, cid: Cid) {
        let registered = self.0.lock().unwrap();
        registered.iter().for_each(|callback| {
            if let Err(e) = callback.onDied(cid as i32) {
                error!("Error calling callback: {}", e);
            }
        });
    }

    /// Append `callback` to the list of callbacks to notify.
    fn add(&self, callback: Strong<dyn IVirtualMachineCallback>) {
        let mut registered = self.0.lock().unwrap();
        registered.push(callback);
    }
}
/// The mutable state of the Virt Manager. There should only be one instance of this struct.
@ -175,7 +211,7 @@ struct State {
}
impl State {
/// Get a list of VMs which are currently running.
/// Get a list of VMs which still have Binder references to them.
fn vms(&self) -> Vec<Arc<VmInstance>> {
// Attempt to upgrade the weak pointers to strong pointers.
self.vms.iter().filter_map(Weak::upgrade).collect()
@ -200,6 +236,14 @@ impl State {
let pos = self.debug_held_vms.iter().position(|vm| vm.getCid() == Ok(cid))?;
Some(self.debug_held_vms.swap_remove(pos))
}
/// Hand out the next unused CID and advance the counter.
///
/// # Errors
///
/// Returns `StatusCode::UNKNOWN_ERROR` if advancing the counter would overflow; in that case
/// the counter is left unchanged and no CID is handed out.
fn allocate_cid(&mut self) -> binder::Result<Cid> {
    // TODO(qwandor): keep track of which CIDs are currently in use so that we can reuse them.
    let allocated = self.next_cid;
    let following = allocated.checked_add(1).ok_or(StatusCode::UNKNOWN_ERROR)?;
    self.next_cid = following;
    Ok(allocated)
}
}
impl Default for State {
@ -217,7 +261,7 @@ fn start_vm(
requester_uid: u32,
requester_sid: Option<String>,
requester_pid: i32,
) -> binder::Result<VmInstance> {
) -> binder::Result<Arc<VmInstance>> {
let config = VmConfig::load(config_file).map_err(|e| {
error!("Failed to load VM config from {:?}: {:?}", config_file, e);
StatusCode::BAD_VALUE

View File

@ -14,12 +14,17 @@
//! Functions for running instances of `crosvm`.
use crate::aidl::VirtualMachineCallbacks;
use crate::config::VmConfig;
use crate::Cid;
use anyhow::Error;
use log::{debug, error, info};
use log::{error, info};
use shared_child::SharedChild;
use std::fs::File;
use std::process::{Child, Command};
use std::process::Command;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::thread;
const CROSVM_PATH: &str = "/apex/com.android.virt/bin/crosvm";
@ -27,7 +32,7 @@ const CROSVM_PATH: &str = "/apex/com.android.virt/bin/crosvm";
#[derive(Debug)]
pub struct VmInstance {
/// The crosvm child process.
child: Child,
child: SharedChild,
/// The CID assigned to the VM for vsock communication.
pub cid: Cid,
/// The UID of the process which requested the VM.
@ -37,18 +42,30 @@ pub struct VmInstance {
/// The PID of the process which requested the VM. Note that this process may no longer exist
/// and the PID may have been reused for a different process, so this should not be trusted.
pub requester_pid: i32,
/// Whether the VM is still running.
running: AtomicBool,
/// Callbacks to clients of the VM.
pub callbacks: VirtualMachineCallbacks,
}
impl VmInstance {
/// Create a new `VmInstance` for the given process.
fn new(
child: Child,
child: SharedChild,
cid: Cid,
requester_uid: u32,
requester_sid: Option<String>,
requester_pid: i32,
) -> VmInstance {
VmInstance { child, cid, requester_uid, requester_sid, requester_pid }
VmInstance {
child,
cid,
requester_uid,
requester_sid,
requester_pid,
running: AtomicBool::new(true),
callbacks: Default::default(),
}
}
/// Start an instance of `crosvm` to manage a new VM. The `crosvm` instance will be killed when
@ -60,29 +77,46 @@ impl VmInstance {
requester_uid: u32,
requester_sid: Option<String>,
requester_pid: i32,
) -> Result<VmInstance, Error> {
) -> Result<Arc<VmInstance>, Error> {
let child = run_vm(config, cid, log_fd)?;
Ok(VmInstance::new(child, cid, requester_uid, requester_sid, requester_pid))
}
}
let instance =
Arc::new(VmInstance::new(child, cid, requester_uid, requester_sid, requester_pid));
impl Drop for VmInstance {
fn drop(&mut self) {
debug!("Dropping {:?}", self);
let instance_clone = instance.clone();
thread::spawn(move || {
instance_clone.monitor();
});
Ok(instance)
}
/// Block until the crosvm child process exits, then record that the VM is no longer running
/// and notify every registered callback.
///
/// The outcome of the wait is only logged; the VM is marked as stopped and the callbacks are
/// invoked whether the wait succeeded or failed.
fn monitor(&self) {
    let outcome = self.child.wait();
    match outcome {
        Ok(status) => info!("crosvm exited with status {}", status),
        Err(e) => error!("Error waiting for crosvm instance to die: {}", e),
    }

    // Clear the flag before notifying, so a client which asks about the VM from inside its
    // callback sees it as stopped.
    self.running.store(false, Ordering::Release);
    self.callbacks.callback_on_died(self.cid);
}
/// Return whether `crosvm` is still running the VM.
///
/// This reads the flag which `monitor` clears once its wait on the crosvm child has returned.
pub fn running(&self) -> bool {
// Acquire here pairs with the Release store performed in `monitor`.
self.running.load(Ordering::Acquire)
}
/// Kill the crosvm instance.
pub fn kill(&self) {
// TODO: Talk to crosvm to shutdown cleanly.
if let Err(e) = self.child.kill() {
error!("Error killing crosvm instance: {}", e);
}
// We need to wait on the process after killing it to avoid zombies.
match self.child.wait() {
Err(e) => error!("Error waiting for crosvm instance to die: {}", e),
Ok(status) => info!("Crosvm exited with status {}", status),
}
}
}
/// Start an instance of `crosvm` to manage a new VM.
fn run_vm(config: &VmConfig, cid: Cid, log_fd: Option<File>) -> Result<Child, Error> {
fn run_vm(config: &VmConfig, cid: Cid, log_fd: Option<File>) -> Result<SharedChild, Error> {
config.validate()?;
let mut command = Command::new(CROSVM_PATH);
@ -110,6 +144,5 @@ fn run_vm(config: &VmConfig, cid: Cid, log_fd: Option<File>) -> Result<Child, Er
command.arg(kernel);
}
info!("Running {:?}", command);
// TODO: Monitor child process, and remove from VM map if it dies.
Ok(command.spawn()?)
Ok(SharedChild::spawn(&mut command)?)
}

View File

@ -17,9 +17,14 @@
mod sync;
use android_system_virtmanager::aidl::android::system::virtmanager::IVirtManager::IVirtManager;
use android_system_virtmanager::aidl::android::system::virtmanager::IVirtualMachine::IVirtualMachine;
use android_system_virtmanager::aidl::android::system::virtmanager::IVirtualMachineCallback::{
BnVirtualMachineCallback, IVirtualMachineCallback,
};
use android_system_virtmanager::binder::{
get_interface, DeathRecipient, IBinder, ParcelFileDescriptor, ProcessState, Strong,
};
use android_system_virtmanager::binder::{Interface, Result as BinderResult};
use anyhow::{Context, Error};
use std::fs::File;
use std::io;
@ -91,14 +96,26 @@ fn command_run(
// Pass the VM reference back to Virt Manager and have it hold it in the background.
virt_manager.debugHoldVmRef(&vm).context("Failed to pass VM to Virt Manager")
} else {
// Wait until the VM dies. If we just returned immediately then the IVirtualMachine Binder
// object would be dropped and the VM would be killed.
wait_for_death(&mut vm.as_binder())?;
println!("VM died");
Ok(())
// Wait until the VM or VirtManager dies. If we just returned immediately then the
// IVirtualMachine Binder object would be dropped and the VM would be killed.
wait_for_vm(vm)
}
}
/// Wait until the given VM or the VirtManager itself dies.
///
/// A callback is registered on the VM and a death recipient is linked to its Binder; both raise
/// the same flag, and this function blocks until that flag is raised.
///
/// # Errors
///
/// Returns an error if registering the callback, linking the death recipient, or querying the
/// VM's running state fails.
fn wait_for_vm(vm: Strong<dyn IVirtualMachine>) -> Result<(), Error> {
    let dead = AtomicFlag::default();
    let callback =
        BnVirtualMachineCallback::new_binder(VirtualMachineCallback { dead: dead.clone() });
    vm.registerCallback(&callback)?;
    let death_recipient = wait_for_death(&mut vm.as_binder(), dead.clone())?;

    // The VM may already have exited before the callback above was registered, in which case
    // `onDied` will never be delivered and we would block until VirtManager itself died. The
    // check is made after registering: if the VM is still running at this point then its death
    // will be reported through the callback.
    if !vm.isRunning().context("Failed to check whether the VM is still running")? {
        dead.raise();
    }

    dead.wait();
    // Ensure that death_recipient isn't dropped before we wait on the flag, as it is removed
    // from the Binder when it's dropped.
    drop(death_recipient);
    Ok(())
}
/// Retrieve reference to a previously daemonized VM and stop it.
fn command_stop(virt_manager: Strong<dyn IVirtManager>, cid: u32) -> Result<(), Error> {
virt_manager
@ -115,18 +132,31 @@ fn command_list(virt_manager: Strong<dyn IVirtManager>) -> Result<(), Error> {
Ok(())
}
/// Block until the given Binder object dies.
fn wait_for_death(binder: &mut impl IBinder) -> Result<(), Error> {
let dead = AtomicFlag::default();
let mut death_recipient = {
let dead = dead.clone();
DeathRecipient::new(move || {
dead.raise();
})
};
/// Raise the given flag when the given Binder object dies.
///
/// If the returned DeathRecipient is dropped then this will no longer do anything.
fn wait_for_death(binder: &mut impl IBinder, dead: AtomicFlag) -> Result<DeathRecipient, Error> {
let mut death_recipient = DeathRecipient::new(move || {
println!("VirtManager died");
dead.raise();
});
binder.link_to_death(&mut death_recipient)?;
dead.wait();
Ok(())
Ok(death_recipient)
}
/// Callback object registered with a VM so that this process is told when the VM dies.
#[derive(Debug)]
struct VirtualMachineCallback {
/// Flag raised once `onDied` has been called.
dead: AtomicFlag,
}
// No `Interface` methods are overridden.
impl Interface for VirtualMachineCallback {}
impl IVirtualMachineCallback for VirtualMachineCallback {
/// Reports the death of the VM on standard output and raises the `dead` flag.
///
/// The CID is ignored. Always returns `Ok(())`.
fn onDied(&self, _cid: i32) -> BinderResult<()> {
println!("VM died");
self.dead.raise();
Ok(())
}
}
/// Safely duplicate the standard output file descriptor.