From 31407917f8e6c36d8f930a3136fe8a3dd00e4fc9 Mon Sep 17 00:00:00 2001 From: Mitch Phillips Date: Thu, 14 Mar 2024 14:18:47 +0100 Subject: [PATCH] [MTE] Implement permissive (recoverable) MTE for apps Extends the recoverable native crash handling support to also allow for MTE crashes to be recovered from in apps. Backs onto the existing GWP-ASan recoverable handling. At least for system/core, I've renamed the (now) generic "recoverable crash" variables to remove the notion of GWP-ASan. Permissive MTE should no longer crash an app, and crashes are still visible in the AppExitInfo API and tombstones. Test: atest CtsTaggingHostTestCases Bug: 328793166 Change-Id: I4c6ffa85af0e0d9b72d0ccd606bb6e1ca464cfff --- debuggerd/crash_dump.cpp | 24 +++---- debuggerd/handler/debuggerd_handler.cpp | 95 ++++++++++++++++--------- debuggerd/include/debuggerd/handler.h | 2 +- debuggerd/protocol.h | 2 +- 4 files changed, 73 insertions(+), 50 deletions(-) diff --git a/debuggerd/crash_dump.cpp b/debuggerd/crash_dump.cpp index a23a26925..203b4855b 100644 --- a/debuggerd/crash_dump.cpp +++ b/debuggerd/crash_dump.cpp @@ -143,7 +143,7 @@ static bool ptrace_interrupt(pid_t tid, int* received_signal) { } static bool activity_manager_notify(pid_t pid, int signal, const std::string& amfd_data, - bool recoverable_gwp_asan_crash) { + bool recoverable_crash) { ATRACE_CALL(); android::base::unique_fd amfd(socket_local_client( "/data/system/ndebugsocket", ANDROID_SOCKET_NAMESPACE_FILESYSTEM, SOCK_STREAM)); @@ -169,7 +169,7 @@ static bool activity_manager_notify(pid_t pid, int signal, const std::string& am // Activity Manager protocol: // - 32-bit network-byte-order: pid // - 32-bit network-byte-order: signal number - // - byte: recoverable_gwp_asan_crash + // - byte: recoverable_crash // - bytes: raw text of the dump // - null terminator @@ -185,10 +185,9 @@ static bool activity_manager_notify(pid_t pid, int signal, const std::string& am return false; } - uint8_t recoverable_gwp_asan_crash_byte = recoverable_gwp_asan_crash ? 1 : 0; - if (!android::base::WriteFully(amfd, &recoverable_gwp_asan_crash_byte, - sizeof(recoverable_gwp_asan_crash_byte))) { - PLOG(ERROR) << "AM recoverable_gwp_asan_crash_byte write failed"; + uint8_t recoverable_crash_byte = recoverable_crash ? 1 : 0; + if (!android::base::WriteFully(amfd, &recoverable_crash_byte, sizeof(recoverable_crash_byte))) { + PLOG(ERROR) << "AM recoverable_crash_byte write failed"; return false; } @@ -280,11 +279,11 @@ static void ParseArgs(int argc, char** argv, pid_t* pseudothread_tid, DebuggerdD static void ReadCrashInfo(unique_fd& fd, siginfo_t* siginfo, std::unique_ptr* regs, ProcessInfo* process_info, - bool* recoverable_gwp_asan_crash) { + bool* recoverable_crash) { std::aligned_storage::type buf; CrashInfo* crash_info = reinterpret_cast(&buf); ssize_t rc = TEMP_FAILURE_RETRY(read(fd.get(), &buf, sizeof(buf))); - *recoverable_gwp_asan_crash = false; + *recoverable_crash = false; if (rc == -1) { PLOG(FATAL) << "failed to read target ucontext"; } else { @@ -321,7 +320,7 @@ static void ReadCrashInfo(unique_fd& fd, siginfo_t* siginfo, process_info->scudo_region_info = crash_info->data.d.scudo_region_info; process_info->scudo_ring_buffer = crash_info->data.d.scudo_ring_buffer; process_info->scudo_ring_buffer_size = crash_info->data.d.scudo_ring_buffer_size; - *recoverable_gwp_asan_crash = crash_info->data.d.recoverable_gwp_asan_crash; + *recoverable_crash = crash_info->data.d.recoverable_crash; process_info->crash_detail_page = crash_info->data.d.crash_detail_page; FALLTHROUGH_INTENDED; case 1: @@ -487,7 +486,7 @@ int main(int argc, char** argv) { std::map thread_info; siginfo_t siginfo; std::string error; - bool recoverable_gwp_asan_crash = false; + bool recoverable_crash = false; { ATRACE_NAME("ptrace"); @@ -539,8 +538,7 @@ int main(int argc, char** argv) { if (thread == g_target_thread) { // Read the thread's registers along with the rest of the crash info out of the pipe. - ReadCrashInfo(input_pipe, &siginfo, &info.registers, &process_info, - &recoverable_gwp_asan_crash); + ReadCrashInfo(input_pipe, &siginfo, &info.registers, &process_info, &recoverable_crash); info.siginfo = &siginfo; info.signo = info.siginfo->si_signo; @@ -670,7 +668,7 @@ int main(int argc, char** argv) { if (fatal_signal) { // Don't try to notify ActivityManager if it just crashed, or we might hang until timeout. if (thread_info[target_process].thread_name != "system_server") { - activity_manager_notify(target_process, signo, amfd_data, recoverable_gwp_asan_crash); + activity_manager_notify(target_process, signo, amfd_data, recoverable_crash); } } diff --git a/debuggerd/handler/debuggerd_handler.cpp b/debuggerd/handler/debuggerd_handler.cpp index 141723b03..ba5ad3f36 100644 --- a/debuggerd/handler/debuggerd_handler.cpp +++ b/debuggerd/handler/debuggerd_handler.cpp @@ -396,7 +396,7 @@ static int debuggerd_dispatch_pseudothread(void* arg) { ASSERT_SAME_OFFSET(scudo_ring_buffer, scudo_ring_buffer); ASSERT_SAME_OFFSET(scudo_ring_buffer_size, scudo_ring_buffer_size); ASSERT_SAME_OFFSET(scudo_stack_depot_size, scudo_stack_depot_size); - ASSERT_SAME_OFFSET(recoverable_gwp_asan_crash, recoverable_gwp_asan_crash); + ASSERT_SAME_OFFSET(recoverable_crash, recoverable_crash); ASSERT_SAME_OFFSET(crash_detail_page, crash_detail_page); #undef ASSERT_SAME_OFFSET @@ -573,6 +573,7 @@ static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void* c } gwp_asan_callbacks_t gwp_asan_callbacks = {}; + bool recoverable_gwp_asan_crash = false; if (g_callbacks.get_gwp_asan_callbacks != nullptr) { // GWP-ASan catches use-after-free and heap-buffer-overflow by using PROT_NONE // guard pages, which lead to SEGV. Normally, debuggerd prints a bug report @@ -587,10 +588,30 @@ static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void* c gwp_asan_callbacks.debuggerd_gwp_asan_post_crash_report && gwp_asan_callbacks.debuggerd_needs_gwp_asan_recovery(info->si_addr)) { gwp_asan_callbacks.debuggerd_gwp_asan_pre_crash_report(info->si_addr); - process_info.recoverable_gwp_asan_crash = true; + recoverable_gwp_asan_crash = true; + process_info.recoverable_crash = true; } } + if (info->si_signo == SIGSEGV && + (info->si_code == SEGV_MTESERR || info->si_code == SEGV_MTEAERR) && is_permissive_mte()) { + process_info.recoverable_crash = true; + // If we are in permissive MTE mode, we do not crash, but instead disable MTE on this thread, + // and then let the failing instruction be retried. The second time should work (except + // if there is another non-MTE fault). + int tagged_addr_ctrl = prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0); + if (tagged_addr_ctrl < 0) { + fatal_errno("failed to PR_GET_TAGGED_ADDR_CTRL"); + } + tagged_addr_ctrl = (tagged_addr_ctrl & ~PR_MTE_TCF_MASK) | PR_MTE_TCF_NONE; + if (prctl(PR_SET_TAGGED_ADDR_CTRL, tagged_addr_ctrl, 0, 0, 0) < 0) { + fatal_errno("failed to PR_SET_TAGGED_ADDR_CTRL"); + } + async_safe_format_log(ANDROID_LOG_ERROR, "libc", + "MTE ERROR DETECTED BUT RUNNING IN PERMISSIVE MODE. CONTINUING."); + pthread_mutex_unlock(&crash_mutex); + } + // If sival_int is ~0, it means that the fallback handler has been called // once before and this function is being called again to dump the stack // of a specific thread. It is possible that the prctl call might return 1, @@ -602,7 +623,7 @@ static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void* c // you can only set NO_NEW_PRIVS to 1, and the effect should be at worst a single missing // ANR trace. debuggerd_fallback_handler(info, ucontext, process_info.abort_msg); - if (no_new_privs && process_info.recoverable_gwp_asan_crash) { + if (no_new_privs && recoverable_gwp_asan_crash) { gwp_asan_callbacks.debuggerd_gwp_asan_post_crash_report(info->si_addr); return; } @@ -679,29 +700,14 @@ static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void* c // If the signal is fatal, don't unlock the mutex to prevent other crashing threads from // starting to dump right before our death. pthread_mutex_unlock(&crash_mutex); - } else if (process_info.recoverable_gwp_asan_crash) { - gwp_asan_callbacks.debuggerd_gwp_asan_post_crash_report(info->si_addr); + } else if (process_info.recoverable_crash) { + if (recoverable_gwp_asan_crash) { + gwp_asan_callbacks.debuggerd_gwp_asan_post_crash_report(info->si_addr); + } pthread_mutex_unlock(&crash_mutex); } #ifdef __aarch64__ - else if (info->si_signo == SIGSEGV && - (info->si_code == SEGV_MTESERR || info->si_code == SEGV_MTEAERR) && - is_permissive_mte()) { - // If we are in permissive MTE mode, we do not crash, but instead disable MTE on this thread, - // and then let the failing instruction be retried. The second time should work (except - // if there is another non-MTE fault). - int tagged_addr_ctrl = prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0); - if (tagged_addr_ctrl < 0) { - fatal_errno("failed to PR_GET_TAGGED_ADDR_CTRL"); - } - tagged_addr_ctrl = (tagged_addr_ctrl & ~PR_MTE_TCF_MASK) | PR_MTE_TCF_NONE; - if (prctl(PR_SET_TAGGED_ADDR_CTRL, tagged_addr_ctrl, 0, 0, 0) < 0) { - fatal_errno("failed to PR_SET_TAGGED_ADDR_CTRL"); - } - async_safe_format_log(ANDROID_LOG_ERROR, "libc", - "MTE ERROR DETECTED BUT RUNNING IN PERMISSIVE MODE. CONTINUING."); - pthread_mutex_unlock(&crash_mutex); - } else if (info->si_signo == SIGSEGV && info->si_code == SEGV_MTEAERR && getppid() == 1) { + else if (info->si_signo == SIGSEGV && info->si_code == SEGV_MTEAERR && getppid() == 1) { // Back channel to init (see system/core/init/service.cpp) to signal that // this process crashed due to an ASYNC MTE fault and should be considered // for upgrade to SYNC mode. We are re-using the ART profiler signal, which @@ -761,18 +767,7 @@ void debuggerd_init(debuggerd_callbacks_t* callbacks) { debuggerd_register_handlers(&action); } -// When debuggerd's signal handler is the first handler called, it's great at -// handling the recoverable GWP-ASan mode. For apps, sigchain (from libart) is -// always the first signal handler, and so the following function is what -// sigchain must call before processing the signal. This allows for processing -// of a potentially recoverable GWP-ASan crash. If the signal requires GWP-ASan -// recovery, then dump a report (via the regular debuggerd hanndler), and patch -// up the allocator, and allow the process to continue (indicated by returning -// 'true'). If the crash has nothing to do with GWP-ASan, or recovery isn't -// possible, return 'false'. -bool debuggerd_handle_signal(int signal_number, siginfo_t* info, void* context) { - if (signal_number != SIGSEGV || !signal_has_si_addr(info)) return false; - +bool debuggerd_handle_gwp_asan_signal(int signal_number, siginfo_t* info, void* context) { if (g_callbacks.get_gwp_asan_callbacks == nullptr) return false; gwp_asan_callbacks_t gwp_asan_callbacks = g_callbacks.get_gwp_asan_callbacks(); if (gwp_asan_callbacks.debuggerd_needs_gwp_asan_recovery == nullptr || @@ -810,3 +805,33 @@ bool debuggerd_handle_signal(int signal_number, siginfo_t* info, void* context) pthread_mutex_unlock(&first_crash_mutex); return true; } + +// When debuggerd's signal handler is the first handler called, it's great at +// handling the recoverable GWP-ASan and permissive MTE modes. For apps, +// sigchain (from libart) is always the first signal handler, and so the +// following function is what sigchain must call before processing the signal. +// This allows for processing of a potentially recoverable GWP-ASan or MTE +// crash. If the signal requires recovery, then dump a report (via the regular +// debuggerd hanndler), and patch up the allocator (in the case of GWP-ASan) or +// disable MTE on the thread, and allow the process to continue (indicated by +// returning 'true'). If the crash has nothing to do with GWP-ASan/MTE, or +// recovery isn't possible, return 'false'. +bool debuggerd_handle_signal(int signal_number, siginfo_t* info, void* context) { + if (signal_number != SIGSEGV) return false; + if (info->si_code == SEGV_MTEAERR || info->si_code == SEGV_MTESERR) { + if (!is_permissive_mte()) return false; + // Because permissive MTE disables MTE for the entire thread, we're less + // worried about getting a whole bunch of crashes in a row. ActivityManager + // doesn't like multiple native crashes for an app in a short period of time + // (see the comment about recoverable GWP-ASan in + // `debuggerd_handle_gwp_asan_signal`), but that shouldn't happen if MTE is + // disabled for the entire thread. This might need to be changed if there's + // some low-hanging bug that happens across multiple threads in quick + // succession. + debuggerd_signal_handler(signal_number, info, context); + return true; + } + + if (!signal_has_si_addr(info)) return false; + return debuggerd_handle_gwp_asan_signal(signal_number, info, context); +} diff --git a/debuggerd/include/debuggerd/handler.h b/debuggerd/include/debuggerd/handler.h index c18cf6e7a..954f049a6 100644 --- a/debuggerd/include/debuggerd/handler.h +++ b/debuggerd/include/debuggerd/handler.h @@ -47,7 +47,7 @@ struct __attribute__((packed)) debugger_process_info { const char* scudo_ring_buffer; size_t scudo_ring_buffer_size; size_t scudo_stack_depot_size; - bool recoverable_gwp_asan_crash; + bool recoverable_crash; struct crash_detail_page_t* crash_detail_page; }; diff --git a/debuggerd/protocol.h b/debuggerd/protocol.h index d3fc15edc..9af7377df 100644 --- a/debuggerd/protocol.h +++ b/debuggerd/protocol.h @@ -100,7 +100,7 @@ struct __attribute__((__packed__)) CrashInfoDataDynamic : public CrashInfoDataSt uintptr_t scudo_ring_buffer; size_t scudo_ring_buffer_size; size_t scudo_stack_depot_size; - bool recoverable_gwp_asan_crash; + bool recoverable_crash; uintptr_t crash_detail_page; };