// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_
#define BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_

#include <algorithm>
#include <atomic>

#include "base/allocator/partition_allocator/partition_alloc_base/compiler_specific.h"
#include "base/allocator/partition_allocator/partition_alloc_base/component_export.h"
#include "base/allocator/partition_allocator/partition_alloc_base/thread_annotations.h"
#include "base/allocator/partition_allocator/partition_alloc_check.h"
#include "base/allocator/partition_allocator/partition_alloc_config.h"
#include "base/allocator/partition_allocator/yield_processor.h"
#include "build/build_config.h"

#if BUILDFLAG(IS_WIN)
#include "base/allocator/partition_allocator/partition_alloc_base/win/windows_types.h"
#endif

#if BUILDFLAG(IS_POSIX)
#include <errno.h>
#include <pthread.h>
#endif

#if BUILDFLAG(IS_APPLE)
#include <os/lock.h>
#endif  // BUILDFLAG(IS_APPLE)

#if BUILDFLAG(IS_FUCHSIA)
#include <lib/sync/mutex.h>
#endif

namespace partition_alloc::internal {
// The behavior of this class depends on whether PA_CONFIG(HAS_FAST_MUTEX) is
// set.
// 1. When it is set:
//
// Simple spinning lock. It will spin in user space a set number of times
// before going into the kernel to sleep.
//
// This is intended to give "the best of both worlds" between a SpinLock and
// base::Lock:
// - SpinLock: Inlined fast path, no external function calls, just
//   compare-and-swap. Short waits do not go into the kernel. Good behavior in
//   low contention cases.
// - base::Lock: Good behavior in case of contention.
// We don't rely on base::Lock, which we could make spin (by calling Try() in a
// loop), as its performance is below that of a custom spinlock, as seen in
// high-level benchmarks. Instead this implements a simple non-recursive mutex
// on top of the futex() syscall on Linux, SRWLock on Windows, os_unfair_lock
// on macOS, sync_mutex on Fuchsia, and pthread_mutex on other POSIX systems.
// The main difference between this and a libc implementation is that it only
// supports the simplest path: private (to a process), non-recursive mutexes
// with no priority inheritance and no timed waits.
//
// As an interesting side-effect to be used in the allocator, this code does
// not make any allocations: locks are small, with a constexpr constructor and
// no destructor.
//
// 2. Otherwise: This is a simple SpinLock, in the sense that it does not have
// any awareness of other threads' behavior.
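//
// Illustrative usage (a minimal sketch only; within PartitionAlloc this class
// is typically used behind a higher-level lock wrapper rather than directly,
// and the mutex is non-recursive, so a thread must not re-acquire it):
//
//   SpinningMutex mutex;  // constexpr-constructible, so it can be a global.
//
//   void WithLock() {
//     mutex.Acquire();
//     // ... critical section ...
//     mutex.Release();
//   }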
class PA_LOCKABLE PA_COMPONENT_EXPORT(PARTITION_ALLOC) SpinningMutex {
 public:
  inline constexpr SpinningMutex();
  PA_ALWAYS_INLINE void Acquire() PA_EXCLUSIVE_LOCK_FUNCTION();
  PA_ALWAYS_INLINE void Release() PA_UNLOCK_FUNCTION();
  PA_ALWAYS_INLINE bool Try() PA_EXCLUSIVE_TRYLOCK_FUNCTION(true);
  void AssertAcquired() const {}  // Not supported.
  void Reinit() PA_UNLOCK_FUNCTION();

 private:
  PA_NOINLINE void AcquireSpinThenBlock() PA_EXCLUSIVE_LOCK_FUNCTION();
#if PA_CONFIG(HAS_FAST_MUTEX)
  void LockSlow() PA_EXCLUSIVE_LOCK_FUNCTION();
#else
  PA_ALWAYS_INLINE void LockSlow() PA_EXCLUSIVE_LOCK_FUNCTION();
#endif

  // See below, the latency of PA_YIELD_PROCESSOR can be as high as ~150
  // cycles. Meanwhile, sleeping costs a few us. Spinning 64 times at 3GHz
  // would cost 150 * 64 / 3e9 ~= 3.2us.
  //
  // This applies to Linux kernels, on x86_64. On ARM we might want to spin
  // more.
  static constexpr int kSpinCount = 64;

#if PA_CONFIG(HAS_FAST_MUTEX)

#if PA_CONFIG(HAS_LINUX_KERNEL)
  void FutexWait();
  void FutexWake();

  static constexpr int kUnlocked = 0;
  static constexpr int kLockedUncontended = 1;
  static constexpr int kLockedContended = 2;

  std::atomic<int32_t> state_{kUnlocked};
#elif BUILDFLAG(IS_WIN)
  PA_CHROME_SRWLOCK lock_ = SRWLOCK_INIT;
#elif BUILDFLAG(IS_APPLE)
  os_unfair_lock unfair_lock_ = OS_UNFAIR_LOCK_INIT;
#elif BUILDFLAG(IS_POSIX)
  pthread_mutex_t lock_ = PTHREAD_MUTEX_INITIALIZER;
#elif BUILDFLAG(IS_FUCHSIA)
  sync_mutex lock_;
#endif

#else  // PA_CONFIG(HAS_FAST_MUTEX)
  std::atomic<bool> lock_{false};

  // Spinlock-like, fallback.
  PA_ALWAYS_INLINE bool TrySpinLock();
  PA_ALWAYS_INLINE void ReleaseSpinLock();
  void LockSlowSpinLock();
#endif  // PA_CONFIG(HAS_FAST_MUTEX)
};

PA_ALWAYS_INLINE void SpinningMutex::Acquire() {
  // Not marked PA_LIKELY(), as:
  // 1. We don't know how much contention the lock would experience
  // 2. This may lead to weird-looking code layout when inlined into a caller
  //    with PA_(UN)LIKELY() annotations.
  if (Try()) {
    return;
  }

  return AcquireSpinThenBlock();
}
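
// Rough shape of the out-of-line slow path (a sketch for illustration only;
// the actual definition lives in the corresponding .cc file and may differ in
// detail):
//
//   PA_NOINLINE void SpinningMutex::AcquireSpinThenBlock() {
//     // Spin in user space for a bounded number of iterations...
//     for (int tries = 0; tries < kSpinCount; tries++) {
//       if (Try())
//         return;
//       PA_YIELD_PROCESSOR;  // Hint to the CPU that we are busy-waiting.
//     }
//     // ...then fall back to blocking in the kernel (futex, SRWLock, etc.).
//     LockSlow();
//   }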

inline constexpr SpinningMutex::SpinningMutex() = default;

#if PA_CONFIG(HAS_FAST_MUTEX)

#if PA_CONFIG(HAS_LINUX_KERNEL)

PA_ALWAYS_INLINE bool SpinningMutex::Try() {
  // Using the weak variant of compare_exchange(), which may fail spuriously.
  // On some architectures such as ARM, CAS is typically performed as an
  // LDREX/STREX pair, where the store may fail. In the strong version, there
  // is a loop inserted by the compiler to retry in these cases.
  //
  // Since we are retrying in Acquire() anyway, there is no point having two
  // nested loops.
  int expected = kUnlocked;
  return (state_.load(std::memory_order_relaxed) == expected) &&
         state_.compare_exchange_weak(expected, kLockedUncontended,
                                      std::memory_order_acquire,
                                      std::memory_order_relaxed);
}

PA_ALWAYS_INLINE void SpinningMutex::Release() {
  if (PA_UNLIKELY(state_.exchange(kUnlocked, std::memory_order_release) ==
                  kLockedContended)) {
    // |kLockedContended|: there is a waiter to wake up.
    //
    // Here there is a window where the lock is unlocked, since we just set it
    // to |kUnlocked| above. Meaning that another thread can grab the lock
    // in-between now and |FutexWake()| waking up a waiter. Aside from
    // potentially reduced fairness, this is not an issue, as the newly-woken
    // thread will check that the lock is still free.
    //
    // There is a small pessimization here though: if we have a single waiter,
    // then when it wakes up, the lock will be set to |kLockedContended|, so
    // when this waiter releases the lock, it will needlessly call
    // |FutexWake()|, even though there are no waiters. This is supported by
    // the kernel, and is what bionic (Android's libc) also does.
    FutexWake();
  }
}
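
// For reference, the futex-based slow path typically follows the classic
// three-state scheme implied by the constants above (a sketch under that
// assumption; the real implementation is in the .cc file and may differ):
//
//   void SpinningMutex::LockSlow() {
//     // Mark the lock as contended. If it was previously unlocked, we now
//     // own it; otherwise sleep until woken by a releasing thread, then
//     // retry.
//     while (state_.exchange(kLockedContended, std::memory_order_acquire) !=
//            kUnlocked) {
//       FutexWait();  // Sleeps while |state_| is still kLockedContended.
//     }
//   }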

#elif BUILDFLAG(IS_WIN)

PA_ALWAYS_INLINE bool SpinningMutex::Try() {
  return !!::TryAcquireSRWLockExclusive(reinterpret_cast<PSRWLOCK>(&lock_));
}

PA_ALWAYS_INLINE void SpinningMutex::Release() {
  ::ReleaseSRWLockExclusive(reinterpret_cast<PSRWLOCK>(&lock_));
}

#elif BUILDFLAG(IS_APPLE)

PA_ALWAYS_INLINE bool SpinningMutex::Try() {
  return os_unfair_lock_trylock(&unfair_lock_);
}

PA_ALWAYS_INLINE void SpinningMutex::Release() {
  return os_unfair_lock_unlock(&unfair_lock_);
}

#elif BUILDFLAG(IS_POSIX)

PA_ALWAYS_INLINE bool SpinningMutex::Try() {
  int retval = pthread_mutex_trylock(&lock_);
  PA_DCHECK(retval == 0 || retval == EBUSY);
  return retval == 0;
}

PA_ALWAYS_INLINE void SpinningMutex::Release() {
  int retval = pthread_mutex_unlock(&lock_);
  PA_DCHECK(retval == 0);
}

#elif BUILDFLAG(IS_FUCHSIA)

PA_ALWAYS_INLINE bool SpinningMutex::Try() {
  return sync_mutex_trylock(&lock_) == ZX_OK;
}

PA_ALWAYS_INLINE void SpinningMutex::Release() {
  sync_mutex_unlock(&lock_);
}

#endif

#else  // PA_CONFIG(HAS_FAST_MUTEX)

PA_ALWAYS_INLINE bool SpinningMutex::Try() {
  // Possibly faster than a bare CAS: with the initial relaxed load
  // (test-and-test-and-set), the cacheline can stay in the shared state in
  // the contended case, instead of being bounced around by failed exchanges.
  return !lock_.load(std::memory_order_relaxed) &&
         !lock_.exchange(true, std::memory_order_acquire);
}

PA_ALWAYS_INLINE void SpinningMutex::Release() {
  lock_.store(false, std::memory_order_release);
}

PA_ALWAYS_INLINE void SpinningMutex::LockSlow() {
  return LockSlowSpinLock();
}
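
// A plausible shape for the spinlock-only slow path (illustrative sketch; the
// real out-of-line definition may also yield to the OS scheduler between
// retries):
//
//   void SpinningMutex::LockSlowSpinLock() {
//     do {
//       // Wait for the lock to look free before attempting the exchange, to
//       // avoid hammering the cacheline with writes.
//       while (lock_.load(std::memory_order_relaxed)) {
//         PA_YIELD_PROCESSOR;
//       }
//     } while (lock_.exchange(true, std::memory_order_acquire));
//   }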

#endif  // PA_CONFIG(HAS_FAST_MUTEX)

}  // namespace partition_alloc::internal

#endif  // BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_