Wait for crosvm in a separate thread, and keep track of when it dies.

This lets us tell whether a VM is still running or has finished. Also add a callback so clients can get notified when crosvm dies.

Bug: 171277638
Test: Ran on VIM3L
Test: atest VirtualizationTestCases
Change-Id: I52c1625af45cfcfe7aa0be465ea08f427ec5bc43
2021-03-12 17:05:20 +00:00 · 2021-03-12 17:05:20 +00:00 · dae07167f6
parent 8a13d2555b
commit dae07167f6
7 changed files with 204 additions and 50 deletions
--- a/virtmanager/Android.bp
+++ b/virtmanager/Android.bp
@ -14,6 +14,7 @@ rust_binary {
        "liblog_rust",
        "libserde_json",
        "libserde",
+        "libshared_child",
        "libanyhow",
    ],
    apex_available: ["com.android.virt"],
--- a/virtmanager/aidl/android/system/virtmanager/IVirtualMachine.aidl
+++ b/virtmanager/aidl/android/system/virtmanager/IVirtualMachine.aidl
@ -15,7 +15,18 @@
 */
 package android.system.virtmanager;

+import android.system.virtmanager.IVirtualMachineCallback;
+
 interface IVirtualMachine {
    /** Get the CID allocated to the VM. */
    int getCid();
+
+    /** Returns true if the VM is still running, or false if it has exited for any reason. */
+    boolean isRunning();
+
+    /**
+     * Register a Binder object to get callbacks when the state of the VM changes, such as if it
+     * dies.
+     */
+    void registerCallback(IVirtualMachineCallback callback);
 }
--- a/virtmanager/aidl/android/system/virtmanager/IVirtualMachineCallback.aidl
+++ b/virtmanager/aidl/android/system/virtmanager/IVirtualMachineCallback.aidl
@ -0,0 +1,32 @@
+/*
+ * Copyright 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package android.system.virtmanager;
+
+import android.system.virtmanager.IVirtualMachine;
+
+/**
+ * An object which a client may register with the Virt Manager to get callbacks about the state of
+ * a particular VM.
+ */
+oneway interface IVirtualMachineCallback {
+    /**
+     * Called when the VM dies.
+     *
+     * Note that this will not be called if the Virt Manager itself dies, so you should also use
+     * `link_to_death` to handle that.
+     */
+    void onDied(int cid);
+}
--- a/virtmanager/aidl/android/system/virtmanager/VirtualMachineDebugInfo.aidl
+++ b/virtmanager/aidl/android/system/virtmanager/VirtualMachineDebugInfo.aidl
@ -31,4 +31,7 @@ parcelable VirtualMachineDebugInfo {
     * the PID may have been reused for a different process, so this should not be trusted.
     */
    int requesterPid;
+
+    /** Whether the VM is still running. */
+    boolean running;
 }
--- a/virtmanager/src/aidl.rs
+++ b/virtmanager/src/aidl.rs
@ -21,11 +21,12 @@ use android_system_virtmanager::aidl::android::system::virtmanager::IVirtManager
 use android_system_virtmanager::aidl::android::system::virtmanager::IVirtualMachine::{
    BnVirtualMachine, IVirtualMachine,
 };
+use android_system_virtmanager::aidl::android::system::virtmanager::IVirtualMachineCallback::IVirtualMachineCallback;
 use android_system_virtmanager::aidl::android::system::virtmanager::VirtualMachineDebugInfo::VirtualMachineDebugInfo;
 use android_system_virtmanager::binder::{
    self, Interface, ParcelFileDescriptor, StatusCode, Strong, ThreadState,
 };
-use log::error;
+use log::{debug, error};
 use std::ffi::CStr;
 use std::fs::File;
 use std::sync::{Arc, Mutex, Weak};
@ -54,7 +55,6 @@ impl IVirtManager for VirtManager {
        log_fd: Option<&ParcelFileDescriptor>,
    ) -> binder::Result<Strong<dyn IVirtualMachine>> {
        let state = &mut *self.state.lock().unwrap();
-        let cid = state.next_cid;
        let log_fd = log_fd
            .map(|fd| fd.as_ref().try_clone().map_err(|_| StatusCode::UNKNOWN_ERROR))
            .transpose()?;
@ -69,16 +69,9 @@ impl IVirtManager for VirtManager {
            })
        });
        let requester_pid = ThreadState::get_calling_pid();
-        let instance = Arc::new(start_vm(
-            config_fd.as_ref(),
-            cid,
-            log_fd,
-            requester_uid,
-            requester_sid,
-            requester_pid,
-        )?);
-        // TODO(qwandor): keep track of which CIDs are currently in use so that we can reuse them.
-        state.next_cid = state.next_cid.checked_add(1).ok_or(StatusCode::UNKNOWN_ERROR)?;
+        let cid = state.allocate_cid()?;
+        let instance =
+            start_vm(config_fd.as_ref(), cid, log_fd, requester_uid, requester_sid, requester_pid)?;
        state.add_vm(Arc::downgrade(&instance));
        Ok(VirtualMachine::create(instance))
    }
@ -99,6 +92,7 @@ impl IVirtManager for VirtManager {
                requesterUid: vm.requester_uid as i32,
                requesterSid: vm.requester_sid.clone(),
                requesterPid: vm.requester_pid,
+                running: vm.running(),
            })
            .collect();
        Ok(cids)
@ -155,6 +149,48 @@ impl IVirtualMachine for VirtualMachine {
    fn getCid(&self) -> binder::Result<i32> {
        Ok(self.instance.cid as i32)
    }
+
+    fn isRunning(&self) -> binder::Result<bool> {
+        Ok(self.instance.running())
+    }
+
+    fn registerCallback(
+        &self,
+        callback: &Strong<dyn IVirtualMachineCallback>,
+    ) -> binder::Result<()> {
+        // TODO: Should this give an error if the VM is already dead?
+        self.instance.callbacks.add(callback.clone());
+        Ok(())
+    }
+}
+
+impl Drop for VirtualMachine {
+    fn drop(&mut self) {
+        debug!("Dropping {:?}", self);
+        self.instance.kill();
+    }
+}
+
+/// A set of Binders to be called back in response to various events on the VM, such as when it
+/// dies.
+#[derive(Debug, Default)]
+pub struct VirtualMachineCallbacks(Mutex<Vec<Strong<dyn IVirtualMachineCallback>>>);
+
+impl VirtualMachineCallbacks {
+    /// Call all registered callbacks to say that the VM has died.
+    pub fn callback_on_died(&self, cid: Cid) {
+        let callbacks = &*self.0.lock().unwrap();
+        for callback in callbacks {
+            if let Err(e) = callback.onDied(cid as i32) {
+                error!("Error calling callback: {}", e);
+            }
+        }
+    }
+
+    /// Add a new callback to the set.
+    fn add(&self, callback: Strong<dyn IVirtualMachineCallback>) {
+        self.0.lock().unwrap().push(callback);
+    }
 }

 /// The mutable state of the Virt Manager. There should only be one instance of this struct.
@ -175,7 +211,7 @@ struct State {
 }

 impl State {
-    /// Get a list of VMs which are currently running.
+    /// Get a list of VMs which still have Binder references to them.
    fn vms(&self) -> Vec<Arc<VmInstance>> {
        // Attempt to upgrade the weak pointers to strong pointers.
        self.vms.iter().filter_map(Weak::upgrade).collect()
@ -200,6 +236,14 @@ impl State {
        let pos = self.debug_held_vms.iter().position(|vm| vm.getCid() == Ok(cid))?;
        Some(self.debug_held_vms.swap_remove(pos))
    }
+
+    /// Get the next available CID, or an error if we have run out.
+    fn allocate_cid(&mut self) -> binder::Result<Cid> {
+        // TODO(qwandor): keep track of which CIDs are currently in use so that we can reuse them.
+        let cid = self.next_cid;
+        self.next_cid = self.next_cid.checked_add(1).ok_or(StatusCode::UNKNOWN_ERROR)?;
+        Ok(cid)
+    }
 }

 impl Default for State {
@ -217,7 +261,7 @@ fn start_vm(
    requester_uid: u32,
    requester_sid: Option<String>,
    requester_pid: i32,
-) -> binder::Result<VmInstance> {
+) -> binder::Result<Arc<VmInstance>> {
    let config = VmConfig::load(config_file).map_err(|e| {
        error!("Failed to load VM config from {:?}: {:?}", config_file, e);
        StatusCode::BAD_VALUE
--- a/virtmanager/src/crosvm.rs
+++ b/virtmanager/src/crosvm.rs
@ -14,12 +14,17 @@

 //! Functions for running instances of `crosvm`.

+use crate::aidl::VirtualMachineCallbacks;
 use crate::config::VmConfig;
 use crate::Cid;
 use anyhow::Error;
-use log::{debug, error, info};
+use log::{error, info};
+use shared_child::SharedChild;
 use std::fs::File;
-use std::process::{Child, Command};
+use std::process::Command;
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::Arc;
+use std::thread;

 const CROSVM_PATH: &str = "/apex/com.android.virt/bin/crosvm";

@ -27,7 +32,7 @@ const CROSVM_PATH: &str = "/apex/com.android.virt/bin/crosvm";
 #[derive(Debug)]
 pub struct VmInstance {
    /// The crosvm child process.
-    child: Child,
+    child: SharedChild,
    /// The CID assigned to the VM for vsock communication.
    pub cid: Cid,
    /// The UID of the process which requested the VM.
@ -37,18 +42,30 @@ pub struct VmInstance {
    /// The PID of the process which requested the VM. Note that this process may no longer exist
    /// and the PID may have been reused for a different process, so this should not be trusted.
    pub requester_pid: i32,
+    /// Whether the VM is still running.
+    running: AtomicBool,
+    /// Callbacks to clients of the VM.
+    pub callbacks: VirtualMachineCallbacks,
 }

 impl VmInstance {
    /// Create a new `VmInstance` for the given process.
    fn new(
-        child: Child,
+        child: SharedChild,
        cid: Cid,
        requester_uid: u32,
        requester_sid: Option<String>,
        requester_pid: i32,
    ) -> VmInstance {
-        VmInstance { child, cid, requester_uid, requester_sid, requester_pid }
+        VmInstance {
+            child,
+            cid,
+            requester_uid,
+            requester_sid,
+            requester_pid,
+            running: AtomicBool::new(true),
+            callbacks: Default::default(),
+        }
    }

    /// Start an instance of `crosvm` to manage a new VM. The `crosvm` instance will be killed when
@ -60,29 +77,46 @@ impl VmInstance {
        requester_uid: u32,
        requester_sid: Option<String>,
        requester_pid: i32,
-    ) -> Result<VmInstance, Error> {
+    ) -> Result<Arc<VmInstance>, Error> {
        let child = run_vm(config, cid, log_fd)?;
-        Ok(VmInstance::new(child, cid, requester_uid, requester_sid, requester_pid))
-    }
-}
+        let instance =
+            Arc::new(VmInstance::new(child, cid, requester_uid, requester_sid, requester_pid));

-impl Drop for VmInstance {
-    fn drop(&mut self) {
-        debug!("Dropping {:?}", self);
+        let instance_clone = instance.clone();
+        thread::spawn(move || {
+            instance_clone.monitor();
+        });
+
+        Ok(instance)
+    }
+
+    /// Wait for the crosvm child process to finish, then mark the VM as no longer running and call
+    /// any callbacks.
+    fn monitor(&self) {
+        match self.child.wait() {
+            Err(e) => error!("Error waiting for crosvm instance to die: {}", e),
+            Ok(status) => info!("crosvm exited with status {}", status),
+        }
+        self.running.store(false, Ordering::Release);
+        self.callbacks.callback_on_died(self.cid);
+    }
+
+    /// Return whether `crosvm` is still running the VM.
+    pub fn running(&self) -> bool {
+        self.running.load(Ordering::Acquire)
+    }
+
+    /// Kill the crosvm instance.
+    pub fn kill(&self) {
        // TODO: Talk to crosvm to shutdown cleanly.
        if let Err(e) = self.child.kill() {
            error!("Error killing crosvm instance: {}", e);
        }
-        // We need to wait on the process after killing it to avoid zombies.
-        match self.child.wait() {
-            Err(e) => error!("Error waiting for crosvm instance to die: {}", e),
-            Ok(status) => info!("Crosvm exited with status {}", status),
-        }
    }
 }

 /// Start an instance of `crosvm` to manage a new VM.
-fn run_vm(config: &VmConfig, cid: Cid, log_fd: Option<File>) -> Result<Child, Error> {
+fn run_vm(config: &VmConfig, cid: Cid, log_fd: Option<File>) -> Result<SharedChild, Error> {
    config.validate()?;

    let mut command = Command::new(CROSVM_PATH);
@ -110,6 +144,5 @@ fn run_vm(config: &VmConfig, cid: Cid, log_fd: Option<File>) -> Result<Child, Er
        command.arg(kernel);
    }
    info!("Running {:?}", command);
-    // TODO: Monitor child process, and remove from VM map if it dies.
-    Ok(command.spawn()?)
+    Ok(SharedChild::spawn(&mut command)?)
 }
--- a/vm/src/main.rs
+++ b/vm/src/main.rs
@ -17,9 +17,14 @@
 mod sync;

 use android_system_virtmanager::aidl::android::system::virtmanager::IVirtManager::IVirtManager;
+use android_system_virtmanager::aidl::android::system::virtmanager::IVirtualMachine::IVirtualMachine;
+use android_system_virtmanager::aidl::android::system::virtmanager::IVirtualMachineCallback::{
+    BnVirtualMachineCallback, IVirtualMachineCallback,
+};
 use android_system_virtmanager::binder::{
    get_interface, DeathRecipient, IBinder, ParcelFileDescriptor, ProcessState, Strong,
 };
+use android_system_virtmanager::binder::{Interface, Result as BinderResult};
 use anyhow::{Context, Error};
 use std::fs::File;
 use std::io;
@ -91,14 +96,26 @@ fn command_run(
        // Pass the VM reference back to Virt Manager and have it hold it in the background.
        virt_manager.debugHoldVmRef(&vm).context("Failed to pass VM to Virt Manager")
    } else {
-        // Wait until the VM dies. If we just returned immediately then the IVirtualMachine Binder
-        // object would be dropped and the VM would be killed.
-        wait_for_death(&mut vm.as_binder())?;
-        println!("VM died");
-        Ok(())
+        // Wait until the VM or VirtManager dies. If we just returned immediately then the
+        // IVirtualMachine Binder object would be dropped and the VM would be killed.
+        wait_for_vm(vm)
    }
 }

+/// Wait until the given VM or the VirtManager itself dies.
+fn wait_for_vm(vm: Strong<dyn IVirtualMachine>) -> Result<(), Error> {
+    let dead = AtomicFlag::default();
+    let callback =
+        BnVirtualMachineCallback::new_binder(VirtualMachineCallback { dead: dead.clone() });
+    vm.registerCallback(&callback)?;
+    let death_recipient = wait_for_death(&mut vm.as_binder(), dead.clone())?;
+    dead.wait();
+    // Ensure that death_recipient isn't dropped before we wait on the flag, as it is removed
+    // from the Binder when it's dropped.
+    drop(death_recipient);
+    Ok(())
+}
+
 /// Retrieve reference to a previously daemonized VM and stop it.
 fn command_stop(virt_manager: Strong<dyn IVirtManager>, cid: u32) -> Result<(), Error> {
    virt_manager
@ -115,18 +132,31 @@ fn command_list(virt_manager: Strong<dyn IVirtManager>) -> Result<(), Error> {
    Ok(())
 }

-/// Block until the given Binder object dies.
-fn wait_for_death(binder: &mut impl IBinder) -> Result<(), Error> {
-    let dead = AtomicFlag::default();
-    let mut death_recipient = {
-        let dead = dead.clone();
-        DeathRecipient::new(move || {
+/// Raise the given flag when the given Binder object dies.
+///
+/// If the returned DeathRecipient is dropped then this will no longer do anything.
+fn wait_for_death(binder: &mut impl IBinder, dead: AtomicFlag) -> Result<DeathRecipient, Error> {
+    let mut death_recipient = DeathRecipient::new(move || {
+        println!("VirtManager died");
        dead.raise();
-        })
-    };
+    });
    binder.link_to_death(&mut death_recipient)?;
-    dead.wait();
+    Ok(death_recipient)
+}
+
+#[derive(Debug)]
+struct VirtualMachineCallback {
+    dead: AtomicFlag,
+}
+
+impl Interface for VirtualMachineCallback {}
+
+impl IVirtualMachineCallback for VirtualMachineCallback {
+    fn onDied(&self, _cid: i32) -> BinderResult<()> {
+        println!("VM died");
+        self.dead.raise();
        Ok(())
+    }
 }

 /// Safely duplicate the standard output file descriptor.