Replace cxa_guard fences with acquire loads
This seemed to be the only place in bionic where a fence on a performance-critical path could easily be replaced by a stronger load/store ordering constraint, so do that here. On x86 this should generate the same code either way. Based on microbenchmarks of the relevant ARM instructions, the change is currently performance-neutral in this kind of context, but in the future the newly generated acquire loads should give us a performance benefit.

Test: Booted AOSP
Change-Id: I7823e11d6ae4fd58e0425244c293262e2320fd81
This commit is contained in:
parent
c9a840ac76
commit
963daed780
|
@ -79,38 +79,33 @@ union _guard_t {
|
|||
#define CONSTRUCTION_UNDERWAY_WITH_WAITER 0x200
|
||||
|
||||
// Returns 1 if the calling thread must run the static's constructor, 0 if
// construction is already complete.  Implements the Itanium C++ ABI guard
// protocol on top of C11 atomics and futexes.
extern "C" int __cxa_guard_acquire(_guard_t* gv) {
  // Acquire load: in the common CONSTRUCTION_COMPLETE case we must ensure that
  // every store performed by the construction function is observable on this
  // CPU after we return.  A similar constraint may apply in the
  // CONSTRUCTION_NOT_YET_STARTED case with a prior abort.
  int state = atomic_load_explicit(&gv->state, memory_order_acquire);

  while (true) {
    if (state == CONSTRUCTION_COMPLETE) {
      // Already initialized; the acquire load above provides the needed
      // ordering, so no fence is required here.
      return 0;
    }

    if (state == CONSTRUCTION_NOT_YET_STARTED) {
      // Try to claim construction for this thread.  Acquire ordering on both
      // success and failure paths replaces the old standalone fence; as noted
      // in section 3.3.2 of the Itanium C++ ABI, acquiring the guard has to
      // behave like acquiring a mutex.  (C++17 would permit relaxed here.)
      if (!atomic_compare_exchange_weak_explicit(&gv->state, &state,
                                                 CONSTRUCTION_UNDERWAY_WITHOUT_WAITER,
                                                 memory_order_acquire /* or relaxed in C++17 */,
                                                 memory_order_acquire)) {
        continue;  // CAS failed (spurious or raced); re-examine the new state.
      }
      return 1;  // Caller must run the constructor.
    }

    if (state == CONSTRUCTION_UNDERWAY_WITHOUT_WAITER) {
      // Another thread is constructing; advertise that a waiter exists so the
      // releasing thread knows to wake us via futex.
      if (!atomic_compare_exchange_weak_explicit(&gv->state, &state,
                                                 CONSTRUCTION_UNDERWAY_WITH_WAITER,
                                                 memory_order_acquire /* or relaxed in C++17 */,
                                                 memory_order_acquire)) {
        continue;  // State changed underneath us; retry with the fresh value.
      }
    }

    // Sleep until the constructing thread changes the state, then re-read it
    // with acquire ordering and loop.
    __futex_wait_ex(&gv->state, false, CONSTRUCTION_UNDERWAY_WITH_WAITER, false, nullptr);
    state = atomic_load_explicit(&gv->state, memory_order_acquire);
  }
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue