// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_
#define BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_

#include <atomic>
#include <cstdint>

#include "base/allocator/partition_allocator/partition_alloc_base/compiler_specific.h"
#include "base/allocator/partition_allocator/partition_alloc_base/component_export.h"
#include "base/allocator/partition_allocator/partition_alloc_base/thread_annotations.h"
#include "base/allocator/partition_allocator/partition_alloc_check.h"
#include "base/allocator/partition_allocator/partition_alloc_config.h"
#include "base/allocator/partition_allocator/yield_processor.h"
#include "build/build_config.h"

#if BUILDFLAG(IS_WIN)
#include "base/allocator/partition_allocator/partition_alloc_base/win/windows_types.h"
#endif

#if BUILDFLAG(IS_POSIX)
#include <errno.h>
#include <pthread.h>
#endif

#if BUILDFLAG(IS_APPLE)
#include <os/lock.h>
#endif  // BUILDFLAG(IS_APPLE)

#if BUILDFLAG(IS_FUCHSIA)
#include <lib/sync/mutex.h>
#endif

namespace partition_alloc::internal {

// The behavior of this class depends on whether PA_CONFIG(HAS_FAST_MUTEX) is
// defined.
// 1. When it is defined:
//
// Simple spinning lock. It will spin in user space a set number of times
// before going into the kernel to sleep.
//
// This is intended to give "the best of both worlds" between a SpinLock and
// base::Lock:
// - SpinLock: Inlined fast path, no external function calls, just
//   compare-and-swap. Short waits do not go into the kernel. Good behavior in
//   low contention cases.
// - base::Lock: Good behavior in case of contention.
//
// We don't rely on base::Lock (which we could make spin by calling Try() in a
// loop), as its performance is below that of a custom spinlock on high-level
// benchmarks. Instead this implements a simple non-recursive mutex on top of
// the futex() syscall on Linux, SRWLock on Windows, os_unfair_lock on macOS,
// and pthread_mutex on POSIX. The main difference between this and a libc
// implementation is that it only supports the simplest path: private (to a
// process), non-recursive mutexes with no priority inheritance and no timed
// waits.
//
// As an interesting side-effect to be used in the allocator, this code does
// not make any allocations; locks are small, with a constexpr constructor and
// no destructor.
//
// 2. Otherwise: This is a simple SpinLock, in the sense that it does not have
// any awareness of other threads' behavior.
class PA_LOCKABLE PA_COMPONENT_EXPORT(PARTITION_ALLOC) SpinningMutex {
 public:
  inline constexpr SpinningMutex();
  PA_ALWAYS_INLINE void Acquire() PA_EXCLUSIVE_LOCK_FUNCTION();
  PA_ALWAYS_INLINE void Release() PA_UNLOCK_FUNCTION();
  PA_ALWAYS_INLINE bool Try() PA_EXCLUSIVE_TRYLOCK_FUNCTION(true);
  void AssertAcquired() const {}  // Not supported.
  void Reinit() PA_UNLOCK_FUNCTION();

 private:
  PA_NOINLINE void AcquireSpinThenBlock() PA_EXCLUSIVE_LOCK_FUNCTION();

#if PA_CONFIG(HAS_FAST_MUTEX)
  void LockSlow() PA_EXCLUSIVE_LOCK_FUNCTION();
#else
  PA_ALWAYS_INLINE void LockSlow() PA_EXCLUSIVE_LOCK_FUNCTION();
#endif
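
  // AcquireSpinThenBlock() and LockSlow() are defined out of line, not in
  // this header. As a purely illustrative sketch of the spin-then-block shape
  // described in the class comment above (an assumption, not the actual
  // implementation):
  //
  //   void SpinningMutex::AcquireSpinThenBlock() {
  //     // Spin in user space for a bounded number of iterations...
  //     for (int i = 0; i < kSpinCount; i++) {
  //       if (Try())
  //         return;
  //       PA_YIELD_PROCESSOR;
  //     }
  //     // ...then block in the kernel (futex, SRWLock, os_unfair_lock, ...).
  //     LockSlow();
  //   }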

  // See below, the latency of PA_YIELD_PROCESSOR can be as high as ~150
  // cycles. Meanwhile, sleeping costs a few us. Spinning 64 times at 3GHz
  // would cost 150 * 64 / 3e9 ~= 3.2us.
  //
  // This applies to Linux kernels, on x86_64. On ARM we might want to spin
  // more.
  static constexpr int kSpinCount = 64;

#if PA_CONFIG(HAS_FAST_MUTEX)

#if PA_CONFIG(HAS_LINUX_KERNEL)
  void FutexWait();
  void FutexWake();

  static constexpr int kUnlocked = 0;
  static constexpr int kLockedUncontended = 1;
  static constexpr int kLockedContended = 2;

  std::atomic<int32_t> state_{kUnlocked};
#elif BUILDFLAG(IS_WIN)
  PA_CHROME_SRWLOCK lock_ = SRWLOCK_INIT;
#elif BUILDFLAG(IS_APPLE)
  os_unfair_lock unfair_lock_ = OS_UNFAIR_LOCK_INIT;
#elif BUILDFLAG(IS_POSIX)
  pthread_mutex_t lock_ = PTHREAD_MUTEX_INITIALIZER;
#elif BUILDFLAG(IS_FUCHSIA)
  sync_mutex lock_;
#endif

#else  // PA_CONFIG(HAS_FAST_MUTEX)
  std::atomic<bool> lock_{false};  // Spinlock-like, fallback.

  PA_ALWAYS_INLINE bool TrySpinLock();
  PA_ALWAYS_INLINE void ReleaseSpinLock();
  void LockSlowSpinLock();
#endif  // PA_CONFIG(HAS_FAST_MUTEX)
};

PA_ALWAYS_INLINE void SpinningMutex::Acquire() {
  // Not marked PA_LIKELY(), as:
  // 1. We don't know how much contention the lock would experience
  // 2. This may lead to weird-looking code layout when inlined into a caller
  //    with PA_(UN)LIKELY() annotations.
  if (Try()) {
    return;
  }

  return AcquireSpinThenBlock();
}

inline constexpr SpinningMutex::SpinningMutex() = default;

#if PA_CONFIG(HAS_FAST_MUTEX)

#if PA_CONFIG(HAS_LINUX_KERNEL)

PA_ALWAYS_INLINE bool SpinningMutex::Try() {
  // Using the weak variant of compare_exchange(), which may fail spuriously.
  // On some architectures such as ARM, CAS is typically performed as a
  // LDREX/STREX pair, where the store may fail. In the strong version, there
  // is a loop inserted by the compiler to retry in these cases.
  //
  // Since we are retrying in Lock() anyway, there is no point having two
  // nested loops.
  int expected = kUnlocked;
  return (state_.load(std::memory_order_relaxed) == expected) &&
         state_.compare_exchange_weak(expected, kLockedUncontended,
                                      std::memory_order_acquire,
                                      std::memory_order_relaxed);
}

PA_ALWAYS_INLINE void SpinningMutex::Release() {
  if (PA_UNLIKELY(state_.exchange(kUnlocked, std::memory_order_release) ==
                  kLockedContended)) {
    // |kLockedContended|: there is a waiter to wake up.
    //
    // Here there is a window where the lock is unlocked, since we just set it
    // to |kUnlocked| above. Meaning that another thread can grab the lock
    // in-between now and |FutexWake()| waking up a waiter. Aside from
    // potentially worse fairness, this is not an issue, as the newly-awakened
    // thread will check that the lock is still free.
    //
    // There is a small pessimization here though: if we have a single waiter,
    // then when it wakes up, the lock will be set to |kLockedContended|, so
    // when this waiter releases the lock, it will needlessly call
    // |FutexWake()|, even though there are no waiters. This is supported by
    // the kernel, and is what bionic (Android's libc) also does.
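    //
    // FutexWake() itself is defined out of line. A minimal sketch of its
    // intended effect, assuming the usual private-futex wake path
    // (illustrative only, not the actual implementation):
    //
    //   syscall(SYS_futex, &state_, FUTEX_WAKE | FUTEX_PRIVATE_FLAG,
    //           1 /* wake at most one waiter */, nullptr, nullptr, 0);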
    FutexWake();
  }
}

#elif BUILDFLAG(IS_WIN)

PA_ALWAYS_INLINE bool SpinningMutex::Try() {
  return !!::TryAcquireSRWLockExclusive(reinterpret_cast<PSRWLOCK>(&lock_));
}

PA_ALWAYS_INLINE void SpinningMutex::Release() {
  ::ReleaseSRWLockExclusive(reinterpret_cast<PSRWLOCK>(&lock_));
}

#elif BUILDFLAG(IS_APPLE)

PA_ALWAYS_INLINE bool SpinningMutex::Try() {
  return os_unfair_lock_trylock(&unfair_lock_);
}

PA_ALWAYS_INLINE void SpinningMutex::Release() {
  return os_unfair_lock_unlock(&unfair_lock_);
}

#elif BUILDFLAG(IS_POSIX)

PA_ALWAYS_INLINE bool SpinningMutex::Try() {
  int retval = pthread_mutex_trylock(&lock_);
  PA_DCHECK(retval == 0 || retval == EBUSY);
  return retval == 0;
}

PA_ALWAYS_INLINE void SpinningMutex::Release() {
  int retval = pthread_mutex_unlock(&lock_);
  PA_DCHECK(retval == 0);
}

#elif BUILDFLAG(IS_FUCHSIA)

PA_ALWAYS_INLINE bool SpinningMutex::Try() {
  return sync_mutex_trylock(&lock_) == ZX_OK;
}

PA_ALWAYS_INLINE void SpinningMutex::Release() {
  sync_mutex_unlock(&lock_);
}

#endif

#else  // PA_CONFIG(HAS_FAST_MUTEX)

PA_ALWAYS_INLINE bool SpinningMutex::Try() {
  // Possibly faster than CAS. The theory is that if the cacheline is shared,
  // then it can stay shared, for the contended case.
  return !lock_.load(std::memory_order_relaxed) &&
         !lock_.exchange(true, std::memory_order_acquire);
}

PA_ALWAYS_INLINE void SpinningMutex::Release() {
  lock_.store(false, std::memory_order_release);
}

PA_ALWAYS_INLINE void SpinningMutex::LockSlow() {
  return LockSlowSpinLock();
}

#endif  // PA_CONFIG(HAS_FAST_MUTEX)

}  // namespace partition_alloc::internal

#endif  // BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_
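
// Example usage (illustrative only, not part of this header): callers protect
// a critical section with explicit Acquire()/Release() calls, or use Try() to
// acquire the lock opportunistically, e.g.:
//
//   partition_alloc::internal::SpinningMutex mutex;
//   mutex.Acquire();
//   // ... critical section ...
//   mutex.Release();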