diff --git a/benchmarks/Android.bp b/benchmarks/Android.bp
index 12f494062..3f95aa1f0 100644
--- a/benchmarks/Android.bp
+++ b/benchmarks/Android.bp
@@ -25,6 +25,7 @@ cc_defaults {
         "-Wunused",
     ],
     srcs: [
+        "atomic_benchmark.cpp",
         "math_benchmark.cpp",
         "property_benchmark.cpp",
         "pthread_benchmark.cpp",
diff --git a/benchmarks/atomic_benchmark.cpp b/benchmarks/atomic_benchmark.cpp
new file mode 100644
index 000000000..66a0120a1
--- /dev/null
+++ b/benchmarks/atomic_benchmark.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Our goal is to measure the cost of various C++ atomic operations.
+// Android doesn't really control those. But since some of these operations can be quite
+// expensive, this may be useful input for development of higher level code.
+// Expected mappings from C++ atomics to hardware primitives can be found at
+// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html .
+
+#include <benchmark/benchmark.h>
+#include <atomic>
+#include <mutex>
+
+// We time atomic operations separated by a volatile (not atomic!) increment. This ensures
+// that the compiler emits memory instructions (e.g. load or store) prior to any fence or the
+// like. That in turn ensures that the CPU has outstanding memory operations when the fence
+// is executed.
+
+// In most respects, we compute best case values. Since there is only one thread, there are no
+// coherence misses.
+
+// We assume that the compiler is not smart enough to optimize away fences in a single-threaded
+// program. If that changes, we'll need to add a second thread.
+
+volatile unsigned counter;
+
+std::atomic<int> test_loc(0);
+
+volatile unsigned sink;
+
+std::mutex mtx;
+
+void BM_empty(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    ++counter;
+  }
+}
+BENCHMARK(BM_empty);
+
+static void BM_load_relaxed(benchmark::State& state) {
+  unsigned result = 0;
+  while (state.KeepRunning()) {
+    result += test_loc.load(std::memory_order_relaxed);
+    ++counter;
+  }
+  sink = result;
+}
+BENCHMARK(BM_load_relaxed);
+
+static void BM_load_acquire(benchmark::State& state) {
+  unsigned result = 0;
+  while (state.KeepRunning()) {
+    result += test_loc.load(std::memory_order_acquire);
+    ++counter;
+  }
+  sink = result;
+}
+BENCHMARK(BM_load_acquire);
+
+static void BM_store_release(benchmark::State& state) {
+  int i = counter;
+  while (state.KeepRunning()) {
+    test_loc.store(++i, std::memory_order_release);
+    ++counter;
+  }
+}
+BENCHMARK(BM_store_release);
+
+static void BM_store_seq_cst(benchmark::State& state) {
+  int i = counter;
+  while (state.KeepRunning()) {
+    test_loc.store(++i, std::memory_order_seq_cst);
+    ++counter;
+  }
+}
+BENCHMARK(BM_store_seq_cst);
+
+static void BM_fetch_add_relaxed(benchmark::State& state) {
+  unsigned result = 0;
+  while (state.KeepRunning()) {
+    result += test_loc.fetch_add(1, std::memory_order_relaxed);
+    ++counter;
+  }
+  sink = result;
+}
+BENCHMARK(BM_fetch_add_relaxed);
+
+static void BM_fetch_add_seq_cst(benchmark::State& state) {
+  unsigned result = 0;
+  while (state.KeepRunning()) {
+    result += test_loc.fetch_add(1, std::memory_order_seq_cst);
+    ++counter;
+  }
+  sink = result;
+}
+BENCHMARK(BM_fetch_add_seq_cst);
+
+// The fence benchmarks include a relaxed load to make it much harder to optimize away
+// the fence.
+
+static void BM_acquire_fence(benchmark::State& state) {
+  unsigned result = 0;
+  while (state.KeepRunning()) {
+    result += test_loc.load(std::memory_order_relaxed);
+    std::atomic_thread_fence(std::memory_order_acquire);
+    ++counter;
+  }
+  sink = result;
+}
+BENCHMARK(BM_acquire_fence);
+
+static void BM_seq_cst_fence(benchmark::State& state) {
+  unsigned result = 0;
+  while (state.KeepRunning()) {
+    result += test_loc.load(std::memory_order_relaxed);
+    std::atomic_thread_fence(std::memory_order_seq_cst);
+    ++counter;
+  }
+  sink = result;
+}
+BENCHMARK(BM_seq_cst_fence);
+
+// For comparison, also throw in a critical section version:
+
+static void BM_fetch_add_cs(benchmark::State& state) {
+  unsigned result = 0;
+  while (state.KeepRunning()) {
+    {
+      std::lock_guard<std::mutex> _(mtx);
+      result += ++counter;
+    }
+  }
+  sink = result;
+}
+BENCHMARK(BM_fetch_add_cs);
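
Note for reviewers who want to reproduce the numbers off-device: the same pattern can be driven by a stock Google Benchmark install on a host machine. The sketch below is illustrative and not part of this patch; the file name standalone_atomic.cpp and the g++ invocation are assumptions, and unlike the bionic benchmark harness (which supplies main()), a standalone binary needs its own BENCHMARK_MAIN().

// standalone_atomic.cpp -- hypothetical standalone version, not in this patch.
// Build (assumption): g++ -O2 standalone_atomic.cpp -lbenchmark -lpthread
#include <benchmark/benchmark.h>
#include <atomic>

volatile unsigned counter;   // volatile increment target, as in the patch
std::atomic<int> test_loc(0);
volatile unsigned sink;      // keeps results live so the loads are not elided

static void BM_load_acquire(benchmark::State& state) {
  unsigned result = 0;
  while (state.KeepRunning()) {
    result += test_loc.load(std::memory_order_acquire);
    ++counter;  // keeps a memory operation outstanding between atomic ops
  }
  sink = result;
}
BENCHMARK(BM_load_acquire);

BENCHMARK_MAIN();  // needed standalone; the bionic harness provides main()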