Add "benchmark" to time atomic operations

The intent here is to confirm that the compiler behaves as expected, and to guide optimization efforts, particularly where there is a choice between primitives. Test: Built and ran benchmark on Angler repeatedly. Manually confirmed that the compiler behaves roughly as expected. Change-Id: I059b245d1ba8296e9b28602559b53eafafe0a30f
2017-01-23 17:30:44 -08:00 · 2017-01-23 17:30:44 -08:00 · 3f5578708d
parent 7844b4c5db
commit 3f5578708d
2 changed files with 149 additions and 0 deletions
--- a/benchmarks/Android.bp
+++ b/benchmarks/Android.bp
@ -25,6 +25,7 @@ cc_defaults {
        "-Wunused",
    ],
    srcs: [
        "atomic_benchmark.cpp",
        "math_benchmark.cpp",
        "property_benchmark.cpp",
        "pthread_benchmark.cpp",
--- a/benchmarks/atomic_benchmark.cpp
+++ b/benchmarks/atomic_benchmark.cpp
@ -0,0 +1,148 @@
 /*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 // Our goal is to measure the cost of various C++ atomic operations.
 // Android doesn't really control those. But since some of these operations can be quite
 // expensive, this may be useful input for development of higher level code.
 // Expected mappings from C++ atomics to hardware primitives can be found at
 // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html .
 #include <benchmark/benchmark.h>
 #include <atomic>
 #include <mutex>
 // We time atomic operations separated by a volatile (not atomic!) increment.  This ensures
 // that the compiler emits memory instructions (e.g. load or store) prior to any fence or the
 // like.  That in turn ensures that the CPU has outstanding memory operations when the fence
 // is executed.
 // In most respects, we compute best case values. Since there is only one thread, there are no
 // coherence misses.
 // We assume that the compiler is not smart enough to optimize away fences in a single-threaded
 // program. If that changes, we'll need to add a second thread.
 volatile unsigned counter;
 std::atomic<int> test_loc(0);
 volatile unsigned sink;
 std::mutex mtx;
 void BM_empty(benchmark::State& state) {
  while (state.KeepRunning()) {
    ++counter;
  }
 }
 BENCHMARK(BM_empty);
 static void BM_load_relaxed(benchmark::State& state) {
  unsigned result = 0;
  while (state.KeepRunning()) {
    result += test_loc.load(std::memory_order_relaxed);
    ++counter;
  }
  sink = result;
 }
 BENCHMARK(BM_load_relaxed);
 static void BM_load_acquire(benchmark::State& state) {
  unsigned result = 0;
  while (state.KeepRunning()) {
    result += test_loc.load(std::memory_order_acquire);
    ++counter;
  }
  sink = result;
 }
 BENCHMARK(BM_load_acquire);
 static void BM_store_release(benchmark::State& state) {
  int i = counter;
  while (state.KeepRunning()) {
    test_loc.store(++i, std::memory_order_release);
    ++counter;
  }
 }
 BENCHMARK(BM_store_release);
 static void BM_store_seq_cst(benchmark::State& state) {
  int i = counter;
  while (state.KeepRunning()) {
    test_loc.store(++i, std::memory_order_seq_cst);
    ++counter;
  }
 }
 BENCHMARK(BM_store_seq_cst);
 static void BM_fetch_add_relaxed(benchmark::State& state) {
  unsigned result = 0;
  while (state.KeepRunning()) {
    result += test_loc.fetch_add(1, std::memory_order_relaxed);
    ++counter;
  }
  sink = result;
 }
 BENCHMARK(BM_fetch_add_relaxed);
 static void BM_fetch_add_seq_cst(benchmark::State& state) {
  unsigned result = 0;
  while (state.KeepRunning()) {
    result += test_loc.fetch_add(1, std::memory_order_seq_cst);
    ++counter;
  }
  sink = result;
 }
 BENCHMARK(BM_fetch_add_seq_cst);
 // The fence benchmarks include a relaxed load to make it much harder to optimize away
 // the fence.
 static void BM_acquire_fence(benchmark::State& state) {
  unsigned result = 0;
  while (state.KeepRunning()) {
    result += test_loc.load(std::memory_order_relaxed);
    std::atomic_thread_fence(std::memory_order_acquire);
    ++counter;
  }
  sink = result;
 }
 BENCHMARK(BM_acquire_fence);
 static void BM_seq_cst_fence(benchmark::State& state) {
  unsigned result = 0;
  while (state.KeepRunning()) {
    result += test_loc.load(std::memory_order_relaxed);
    std::atomic_thread_fence(std::memory_order_seq_cst);
    ++counter;
  }
  sink = result;
 }
 BENCHMARK(BM_seq_cst_fence);
 // For comparison, also throw in a critical section version:
 static void BM_fetch_add_cs(benchmark::State& state) {
  unsigned result = 0;
  while (state.KeepRunning()) {
    {
      std::lock_guard<std::mutex> _(mtx);
      result += ++counter;
    }
  }
  sink = result;
 }
 BENCHMARK(BM_fetch_add_cs);