// Copyright 2017 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <atomic>
#include <cstdint>
#include <limits>
#include <mutex>  // NOLINT(build/c++11)
#include <vector>

#include "absl/base/config.h"
#include "absl/base/internal/cycleclock.h"
#include "absl/base/internal/raw_logging.h"
#include "absl/base/internal/spinlock.h"
#include "absl/base/internal/thread_identity.h"
#include "absl/synchronization/blocking_counter.h"
#include "absl/synchronization/internal/create_thread_identity.h"
#include "absl/synchronization/internal/per_thread_sem.h"
#include "absl/synchronization/internal/thread_pool.h"
#include "absl/synchronization/mutex.h"
#include "benchmark/benchmark.h"

namespace {

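// The first few benchmarks measure the raw cost of acquiring and releasing a
// mutex with an empty critical section; Threads(1) gives the uncontended
// fast path, and ThreadPerCpu adds a fully contended variant over the same
// static Mutex.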
void BM_Mutex(benchmark::State& state) {
  static absl::Mutex* mu = new absl::Mutex;
  for (auto _ : state) {
    absl::MutexLock lock(mu);
  }
}
BENCHMARK(BM_Mutex)->UseRealTime()->Threads(1)->ThreadPerCpu();

void BM_ReaderLock(benchmark::State& state) {
  static absl::Mutex* mu = new absl::Mutex;
  for (auto _ : state) {
    absl::ReaderMutexLock lock(mu);
  }
}
BENCHMARK(BM_ReaderLock)->UseRealTime()->Threads(1)->ThreadPerCpu();

void BM_TryLock(benchmark::State& state) {
  absl::Mutex mu;
  for (auto _ : state) {
    if (mu.TryLock()) {
      mu.Unlock();
    }
  }
}
BENCHMARK(BM_TryLock);

void BM_ReaderTryLock(benchmark::State& state) {
  static absl::Mutex* mu = new absl::Mutex;
  for (auto _ : state) {
    if (mu->ReaderTryLock()) {
      mu->ReaderUnlock();
    }
  }
}
BENCHMARK(BM_ReaderTryLock)->UseRealTime()->Threads(1)->ThreadPerCpu();

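// Busy-waits for approximately `ns` nanoseconds by converting the target
// duration into CycleClock ticks. The increments of *data, kept live by
// DoNotOptimize, stand in for real work.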
static void DelayNs(int64_t ns, int* data) {
  int64_t end = absl::base_internal::CycleClock::Now() +
                ns * absl::base_internal::CycleClock::Frequency() / 1e9;
  while (absl::base_internal::CycleClock::Now() < end) {
    ++(*data);
    benchmark::DoNotOptimize(*data);
  }
}

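// Minimal RAII wrapper so BM_Contended below can be templated over mutex
// types that expose Lock()/Unlock(), for example:
//   RaiiLocker<absl::Mutex> locker(&mu);  // locks here, unlocks at scope end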
template <typename MutexType>
class RaiiLocker {
 public:
  explicit RaiiLocker(MutexType* mu) : mu_(mu) { mu_->Lock(); }
  ~RaiiLocker() { mu_->Unlock(); }
 private:
  MutexType* mu_;
};

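// std::mutex spells its members lock()/unlock(), so it needs a
// specialization.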
template <>
class RaiiLocker<std::mutex> {
 public:
  explicit RaiiLocker(std::mutex* mu) : mu_(mu) { mu_->lock(); }
  ~RaiiLocker() { mu_->unlock(); }
 private:
  std::mutex* mu_;
};

// RAII object to change the Mutex priority of the running thread.
class ScopedThreadMutexPriority {
 public:
  explicit ScopedThreadMutexPriority(int priority) {
    absl::base_internal::ThreadIdentity* identity =
        absl::synchronization_internal::GetOrCreateCurrentThreadIdentity();
    identity->per_thread_synch.priority = priority;
    // Bump next_priority_read_cycles to the infinite future so that the
    // implementation doesn't re-read the thread's actual scheduler priority
    // and replace our temporary scoped priority.
    identity->per_thread_synch.next_priority_read_cycles =
        std::numeric_limits<int64_t>::max();
  }
  ~ScopedThreadMutexPriority() {
    // Reset the "next priority read time" back to the infinite past so that
    // the next time the Mutex implementation wants to know this thread's
    // priority, it re-reads it from the OS instead of using our overridden
    // priority.
    absl::synchronization_internal::GetOrCreateCurrentThreadIdentity()
        ->per_thread_synch.next_priority_read_cycles =
        std::numeric_limits<int64_t>::min();
  }
};

void BM_MutexEnqueue(benchmark::State& state) {
  // In the "multiple priorities" variant of the benchmark, one of the
  // threads runs with Mutex priority 0 while the rest run at elevated
  // priority. This benchmarks the performance impact of the presence of a
  // low-priority waiter when a higher-priority waiter adds itself to the
  // queue (b/175224064).
  //
  // NOTE: The actual scheduler priority is not modified in this benchmark:
  // all of the threads get CPU slices with the same priority. Only the
  // Mutex queueing behavior is modified.
  const bool multiple_priorities = state.range(0);
  ScopedThreadMutexPriority priority_setter(
      (multiple_priorities && state.thread_index() != 0) ? 1 : 0);

  struct Shared {
    absl::Mutex mu;
    std::atomic<int> looping_threads{0};
    std::atomic<int> blocked_threads{0};
    std::atomic<bool> thread_has_mutex{false};
  };
  static Shared* shared = new Shared;

  // Set up 'blocked_threads' to count how many threads are currently blocked
  // in Abseil synchronization code.
  //
  // NOTE: Blocking done within the Google Benchmark library itself (e.g.
  // the barrier which synchronizes threads entering and exiting the benchmark
  // loop) does _not_ get registered in this counter. This is because Google
  // Benchmark uses its own synchronization primitives based on std::mutex, not
  // Abseil synchronization primitives. If at some point the benchmark library
  // merges into Abseil, this code may break.
  absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
      &shared->blocked_threads);

  // The benchmark framework may run several iterations in the same process,
  // reusing the same static-initialized 'shared' object. Given the semantics
  // of the members here, we expect everything to be reset to zero by the
  // end of any iteration. Assert that's the case, just to be sure.
  ABSL_RAW_CHECK(
      shared->looping_threads.load(std::memory_order_relaxed) == 0 &&
          shared->blocked_threads.load(std::memory_order_relaxed) == 0 &&
          !shared->thread_has_mutex.load(std::memory_order_relaxed),
      "Shared state isn't zeroed at start of benchmark iteration");

  static constexpr int kBatchSize = 1000;
  while (state.KeepRunningBatch(kBatchSize)) {
    shared->looping_threads.fetch_add(1);
    for (int i = 0; i < kBatchSize; i++) {
      {
        absl::MutexLock l(&shared->mu);
        shared->thread_has_mutex.store(true, std::memory_order_relaxed);
        // Spin until all other threads are either out of the benchmark loop
        // or blocked on the mutex. This ensures that the mutex queue is kept
        // at its maximal length to benchmark the performance of queueing on
        // a highly contended mutex.
        while (shared->looping_threads.load(std::memory_order_relaxed) -
                   shared->blocked_threads.load(std::memory_order_relaxed) !=
               1) {
        }
        shared->thread_has_mutex.store(false);
      }
      // Spin until some other thread has acquired the mutex before we block
      // again. This ensures that we always go through the slow (queueing)
      // acquisition path rather than reacquiring the mutex we just released.
      while (!shared->thread_has_mutex.load(std::memory_order_relaxed) &&
             shared->looping_threads.load(std::memory_order_relaxed) > 1) {
      }
    }
    // The benchmark framework uses a barrier to ensure that all of the threads
    // complete their benchmark loop together before any of the threads exit
    // the loop. So, we need to remove ourselves from the "looping threads"
    // counter here before potentially blocking on that barrier. Otherwise,
    // another thread spinning above might wait forever for this thread to
    // block on the mutex while we in fact are waiting to exit.
    shared->looping_threads.fetch_add(-1);
  }
  absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
      nullptr);
}

BENCHMARK(BM_MutexEnqueue)
    ->Threads(4)
    ->Threads(64)
    ->Threads(128)
    ->Threads(512)
    ->ArgName("multiple_priorities")
    ->Arg(false)
    ->Arg(true);

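// Measures lock/unlock throughput on a shared MutexType under contention.
// Arg 0 ("cs_ns") is the amount of work performed inside the critical
// section, in nanoseconds; arg 1 ("num_prios") is the number of distinct
// Mutex priorities spread across the benchmark threads.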
template <typename MutexType>
void BM_Contended(benchmark::State& state) {
  int priority = state.thread_index() % state.range(1);
  ScopedThreadMutexPriority priority_setter(priority);

  struct Shared {
    MutexType mu;
    int data = 0;
  };
  static auto* shared = new Shared;
  int local = 0;
  for (auto _ : state) {
    // Here we model both local work outside of the critical section as well
    // as some work inside of the critical section. The idea is to capture
    // some more or less realistic contention levels.
    // If contention is too low, the benchmark won't measure anything useful.
    // If contention is unrealistically high, the benchmark will favor
    // bad mutex implementations that block and otherwise distract threads
    // from the mutex and shared state for as much as possible.
    // To achieve this, the amount of local work is multiplied by the number
    // of threads to keep the ratio between local work and the critical
    // section approximately constant regardless of the number of threads.
    DelayNs(100 * state.threads(), &local);
    RaiiLocker<MutexType> locker(&shared->mu);
    DelayNs(state.range(0), &shared->data);
  }
}
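
// Registers the thread counts and (cs_ns, num_prios) argument pairs shared
// by all BM_Contended instantiations below.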
void SetupBenchmarkArgs(benchmark::internal::Benchmark* bm,
                        bool do_test_priorities) {
  const int max_num_priorities = do_test_priorities ? 2 : 1;
  bm->UseRealTime()
      // ThreadPerCpu poorly handles non-power-of-two CPU counts.
      ->Threads(1)
      ->Threads(2)
      ->Threads(4)
      ->Threads(6)
      ->Threads(8)
      ->Threads(12)
      ->Threads(16)
      ->Threads(24)
      ->Threads(32)
      ->Threads(48)
      ->Threads(64)
      ->Threads(96)
      ->Threads(128)
      ->Threads(192)
      ->Threads(256)
      ->ArgNames({"cs_ns", "num_prios"});
  // Some empirically chosen amounts of work in the critical section:
  // 1 is low contention, 2000 is high contention, with a few values in
  // between.
  for (int critical_section_ns : {1, 20, 50, 200, 2000}) {
    for (int num_priorities = 1; num_priorities <= max_num_priorities;
         num_priorities++) {
      bm->ArgPair(critical_section_ns, num_priorities);
    }
  }
}

BENCHMARK_TEMPLATE(BM_Contended, absl::Mutex)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/true);
    });

BENCHMARK_TEMPLATE(BM_Contended, absl::base_internal::SpinLock)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
    });

BENCHMARK_TEMPLATE(BM_Contended, std::mutex)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
    });

// Measure the overhead of conditions on mutex release (when they must be
// evaluated).  Mutex has (some) support for equivalence classes, allowing
// Conditions with the same function/argument to avoid being evaluated
// multiple times.
//
// num_classes==0 is used for the special case of every waiter being distinct.
void BM_ConditionWaiters(benchmark::State& state) {
  int num_classes = state.range(0);
  int num_waiters = state.range(1);

  struct Helper {
    static void Waiter(absl::BlockingCounter* init, absl::Mutex* m, int* p) {
      init->DecrementCount();
      m->LockWhen(absl::Condition(
          static_cast<bool (*)(int*)>([](int* v) { return *v == 0; }), p));
      m->Unlock();
    }
  };

  if (num_classes == 0) {
    // No equivalence classes.
    num_classes = num_waiters;
  }

  absl::BlockingCounter init(num_waiters);
  absl::Mutex mu;
  std::vector<int> equivalence_classes(num_classes, 1);

  // Must be declared last to be destroyed first.
  absl::synchronization_internal::ThreadPool pool(num_waiters);

  for (int i = 0; i < num_waiters; i++) {
    // Mutex considers Conditions with the same function and argument
    // to be equivalent.
    pool.Schedule([&, i] {
      Helper::Waiter(&init, &mu, &equivalence_classes[i % num_classes]);
    });
  }
  init.Wait();

  for (auto _ : state) {
    mu.Lock();
    mu.Unlock();  // Each unlock requires Condition evaluation for our waiters.
  }

  // Make every Condition true so the waiters can finish and the pool can be
  // destroyed.
  mu.Lock();
  for (int i = 0; i < num_classes; i++) {
    equivalence_classes[i] = 0;
  }
  mu.Unlock();
}

// Some configurations have higher thread limits than others.
#if defined(__linux__) && !defined(ABSL_HAVE_THREAD_SANITIZER)
constexpr int kMaxConditionWaiters = 8192;
#else
constexpr int kMaxConditionWaiters = 1024;
#endif
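// Sweeps num_classes over [0, 2] (0 meaning all waiters are distinct) and
// num_waiters over [1, kMaxConditionWaiters].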
BENCHMARK(BM_ConditionWaiters)->RangePair(0, 2, 1, kMaxConditionWaiters);

}  // namespace