// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/sampling_heap_profiler/poisson_allocation_sampler.h"

#include <atomic>
#include <cmath>
#include <memory>
#include <utility>

#include "base/allocator/dispatcher/reentry_guard.h"
#include "base/allocator/dispatcher/tls.h"
#include "base/check.h"
#include "base/compiler_specific.h"
#include "base/no_destructor.h"
#include "base/rand_util.h"
#include "base/ranges/algorithm.h"
#include "build/build_config.h"

namespace base {

namespace {

using ::base::allocator::dispatcher::ReentryGuard;

const size_t kDefaultSamplingIntervalBytes = 128 * 1024;

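// Amount subtracted from a thread's accumulated byte count while its samples
// are muted, so that allocations made in the muted window are very unlikely
// to cross the sampling threshold. See ScopedMuteThreadSamples for details.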
const intptr_t kAccumulatedBytesOffset = 1 << 29;

// Controls whether sample intervals should not be randomized. Used for
// testing.
bool g_deterministic = false;

// Pointer to the current |LockFreeAddressHashSet|.
constinit std::atomic<LockFreeAddressHashSet*> g_sampled_addresses_set{nullptr};

// Sampling interval parameter, the mean value for intervals between samples.
constinit std::atomic_size_t g_sampling_interval{kDefaultSamplingIntervalBytes};

struct ThreadLocalData {
  // Accumulated bytes towards sample.
  intptr_t accumulated_bytes = 0;
  // Used as a workaround to avoid bias from muted samples. See
  // ScopedMuteThreadSamples for more details.
  intptr_t accumulated_bytes_snapshot = 0;
  // PoissonAllocationSampler performs allocations while handling a
  // notification. The guard protects against recursion originating from these.
  bool internal_reentry_guard = false;
  // A boolean used to distinguish the first allocation on a thread:
  //   false - first allocation on the thread;
  //   true  - otherwise.
  // Since accumulated_bytes is initialized to zero, the very first allocation
  // on a thread would always trigger a sample, skewing the profile towards
  // such allocations. To mitigate that, we use this flag to ensure the first
  // allocation is properly accounted for.
  bool sampling_interval_initialized = false;
};

ThreadLocalData* GetThreadLocalData() {
#if USE_LOCAL_TLS_EMULATION()
  // If available, use ThreadLocalStorage to bypass dependencies introduced by
  // Clang's implementation of thread_local.
  static base::NoDestructor<
      base::allocator::dispatcher::ThreadLocalStorage<ThreadLocalData>>
      thread_local_data("poisson_allocation_sampler");
  return thread_local_data->GetThreadLocalData();
#else
  // Notes on TLS usage:
  //
  // * There's no safe way to use TLS in malloc() as neither C++ thread_local
  //   nor pthread TLS guarantees whether it allocates or not.
  // * We think that we can safely use thread_local without a re-entrancy guard
  //   because the compiler will use the "tls static access model" for static
  //   builds of Chrome [https://www.uclibc.org/docs/tls.pdf].
  //   But there's no guarantee that this will stay true, and in practice
  //   it seems to have problems on macOS/Android. These platforms do allocate
  //   on the very first access to a thread_local on each thread.
  // * Directly using/warming-up platform TLS seems to work on all platforms,
  //   but is also not guaranteed to stay true. We make use of it for
  //   reentrancy guards on macOS/Android.
  // * We cannot use the Windows Tls[GS]etValue API as it modifies the result
  //   of GetLastError.
  //
  // Android thread_local seems to be using __emutls_get_address from libgcc:
  // https://github.com/gcc-mirror/gcc/blob/master/libgcc/emutls.c
  // macOS version is based on _tlv_get_addr from dyld:
  // https://opensource.apple.com/source/dyld/dyld-635.2/src/threadLocalHelpers.s.auto.html
  thread_local ThreadLocalData thread_local_data;
  return &thread_local_data;
#endif
}

}  // namespace

PoissonAllocationSampler::ScopedMuteThreadSamples::ScopedMuteThreadSamples() {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();

  DCHECK(!thread_local_data->internal_reentry_guard);
  thread_local_data->internal_reentry_guard = true;

  // We mute thread samples immediately after taking a sample, which is when
  // we reset the thread's accumulated_bytes. This breaks the random sampling
  // requirement of the Poisson process, and causes us to systematically
  // overcount all other allocations. That's because muted allocations rarely
  // trigger a sample [which would cause them to be ignored] since they occur
  // right after accumulated_bytes is reset.
  //
  // To counteract this, we drop accumulated_bytes by a large, fixed amount to
  // lower the probability that a sample is taken to nearly zero. Then we
  // restore it after we're done muting thread samples.
  thread_local_data->accumulated_bytes_snapshot =
      thread_local_data->accumulated_bytes;
  thread_local_data->accumulated_bytes -= kAccumulatedBytesOffset;
}

PoissonAllocationSampler::ScopedMuteThreadSamples::~ScopedMuteThreadSamples() {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  DCHECK(thread_local_data->internal_reentry_guard);
  thread_local_data->internal_reentry_guard = false;
  thread_local_data->accumulated_bytes =
      thread_local_data->accumulated_bytes_snapshot;
}

// static
bool PoissonAllocationSampler::ScopedMuteThreadSamples::IsMuted() {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  return thread_local_data->internal_reentry_guard;
}

PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
    ScopedSuppressRandomnessForTesting() {
  DCHECK(!g_deterministic);
  g_deterministic = true;
  // The accumulated_bytes may contain a random value from previous
  // test runs, which would make the behaviour of the next call to
  // RecordAlloc unpredictable.
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  thread_local_data->accumulated_bytes = 0;
}

PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
    ~ScopedSuppressRandomnessForTesting() {
  DCHECK(g_deterministic);
  g_deterministic = false;
}

// static
bool PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
    IsSuppressed() {
  return g_deterministic;
}

PoissonAllocationSampler::ScopedMuteHookedSamplesForTesting::
    ScopedMuteHookedSamplesForTesting() {
  SetProfilingStateFlag(ProfilingStateFlag::kHookedSamplesMutedForTesting);

  // Reset the accumulated bytes to 0 on this thread.
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  accumulated_bytes_snapshot_ = thread_local_data->accumulated_bytes;
  thread_local_data->accumulated_bytes = 0;
}

PoissonAllocationSampler::ScopedMuteHookedSamplesForTesting::
    ~ScopedMuteHookedSamplesForTesting() {
  // Restore the accumulated bytes.
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  thread_local_data->accumulated_bytes = accumulated_bytes_snapshot_;
  ResetProfilingStateFlag(ProfilingStateFlag::kHookedSamplesMutedForTesting);
}

PoissonAllocationSampler::ScopedMuteHookedSamplesForTesting::
    ScopedMuteHookedSamplesForTesting(ScopedMuteHookedSamplesForTesting&&) =
        default;

PoissonAllocationSampler::ScopedMuteHookedSamplesForTesting&
PoissonAllocationSampler::ScopedMuteHookedSamplesForTesting::operator=(
    ScopedMuteHookedSamplesForTesting&&) = default;

// static
constinit std::atomic<PoissonAllocationSampler::ProfilingStateFlagMask>
    PoissonAllocationSampler::profiling_state_{0};

PoissonAllocationSampler::PoissonAllocationSampler() {
  Init();
  auto* sampled_addresses = new LockFreeAddressHashSet(64);
  g_sampled_addresses_set.store(sampled_addresses, std::memory_order_release);
}

// static
void PoissonAllocationSampler::Init() {
  [[maybe_unused]] static bool init_once = [] {
    // Touch thread local data on initialization to enforce proper setup of
    // the underlying storage system.
    GetThreadLocalData();
    ReentryGuard::InitTLSSlot();
    return true;
  }();
}

void PoissonAllocationSampler::SetSamplingInterval(
    size_t sampling_interval_bytes) {
  // TODO(alph): Reset the sample being collected if running.
  g_sampling_interval.store(sampling_interval_bytes, std::memory_order_relaxed);
}

size_t PoissonAllocationSampler::SamplingInterval() const {
  return g_sampling_interval.load(std::memory_order_relaxed);
}

// static
size_t PoissonAllocationSampler::GetNextSampleInterval(size_t interval) {
  if (g_deterministic) [[unlikely]] {
    return interval;
  }

  // We sample with a Poisson process, with a constant average sampling
  // interval. This follows the exponential probability distribution with
  // parameter λ = 1/interval, where |interval| is the average number of bytes
  // between samples.
  // Let u be a uniformly distributed random number (0,1], then
  //   next_sample = -ln(u) / λ
  // RandDouble returns numbers in [0,1). We use 1-RandDouble to correct it and
  // avoid a possible floating point exception from taking the log of 0.
  // The allocator shim uses the PoissonAllocationSampler, hence we avoid
  // allocation here to avoid infinite recursion.
  double uniform = internal::RandDoubleAvoidAllocation();
  double value = -log(1 - uniform) * interval;
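  // For example, with the default 128 KiB mean interval and u = 0.5, this
  // yields next_sample = -ln(0.5) * 131072, roughly 0.69 * 131072, or about
  // 90,850 bytes.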
  size_t min_value = sizeof(intptr_t);
  // We limit the upper bound of a sample interval to make sure we don't have
  // huge gaps in the sampling stream. The probability that the upper bound is
  // hit is exp(-20) ~ 2e-9, so it should not skew the distribution.
  size_t max_value = interval * 20;
  if (value < min_value) [[unlikely]] {
    return min_value;
  }
  if (value > max_value) [[unlikely]] {
    return max_value;
  }
  return static_cast<size_t>(value);
}

void PoissonAllocationSampler::DoRecordAllocation(
    const ProfilingStateFlagMask state,
    void* address,
    size_t size,
    base::allocator::dispatcher::AllocationSubsystem type,
    const char* context) {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();

  thread_local_data->accumulated_bytes += size;
  intptr_t accumulated_bytes = thread_local_data->accumulated_bytes;
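  // Fast path: `accumulated_bytes` stays negative until the thread has
  // allocated enough bytes to cross the sampling interval drawn earlier, so
  // the vast majority of allocations return here.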
  if (accumulated_bytes < 0) [[likely]] {
    return;
  }

  if (!(state & ProfilingStateFlag::kIsRunning)) [[unlikely]] {
    // Sampling was in fact disabled when the hook was called. Reset the state
    // of the sampler. We do this check off the fast path, because it is quite
    // rare for the sampler to be stopped after it has been started. (The most
    // common caller of PoissonAllocationSampler starts it and leaves it
    // running for the rest of the Chrome session.)
    thread_local_data->sampling_interval_initialized = false;
    thread_local_data->accumulated_bytes = 0;
    return;
  }

  // Failed allocation? Skip the sample.
  if (!address) [[unlikely]] {
    return;
  }

  size_t mean_interval = g_sampling_interval.load(std::memory_order_relaxed);
  if (!thread_local_data->sampling_interval_initialized) [[unlikely]] {
    thread_local_data->sampling_interval_initialized = true;
    // This is the very first allocation on the thread. It always passes the
    // check in |RecordAlloc| because accumulated_bytes is zero-initialized
    // due to TLS semantics. Generate a proper sampling interval and make sure
    // the allocation has indeed crossed the threshold before counting it as a
    // sample.
    accumulated_bytes -= GetNextSampleInterval(mean_interval);
    if (accumulated_bytes < 0) {
      thread_local_data->accumulated_bytes = accumulated_bytes;
      return;
    }
  }

  // This cast is safe because this point is only reached with a non-negative
  // value of `accumulated_bytes`.
  size_t samples = static_cast<size_t>(accumulated_bytes) / mean_interval;
  accumulated_bytes %= mean_interval;

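  // The division above counts the whole mean intervals that were crossed; the
  // loop below draws at least one fresh randomized interval and keeps drawing
  // until the remainder goes negative again, counting one sample per drawn
  // interval.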
  do {
    accumulated_bytes -= GetNextSampleInterval(mean_interval);
    ++samples;
  } while (accumulated_bytes >= 0);

  thread_local_data->accumulated_bytes = accumulated_bytes;

  if (ScopedMuteThreadSamples::IsMuted()) [[unlikely]] {
    return;
  }

  ScopedMuteThreadSamples no_reentrancy_scope;
  std::vector<SamplesObserver*> observers_copy;
  {
    AutoLock lock(mutex_);

    // TODO(alph): Sometimes RecordAlloc is called twice in a row without
    // a RecordFree in between. Investigate it.
    if (sampled_addresses_set().Contains(address)) {
      return;
    }
    sampled_addresses_set().Insert(address);
    BalanceAddressesHashSet();
    observers_copy = observers_;
  }

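  // Each sample statistically represents `mean_interval` bytes of allocation,
  // so report mean_interval * samples as the total amount attributed to this
  // sample point.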
  size_t total_allocated = mean_interval * samples;
  for (base::PoissonAllocationSampler::SamplesObserver* observer :
       observers_copy) {
    observer->SampleAdded(address, size, total_allocated, type, context);
  }
}

void PoissonAllocationSampler::DoRecordFree(void* address) {
  // There is a rare case on macOS and Android when the very first
  // thread_local access in the ScopedMuteThreadSamples constructor may
  // allocate and thus reenter DoRecordAlloc. However, the call chain won't
  // build up further because RecordAlloc accesses are guarded by the pthread
  // TLS-based ReentryGuard.
  ScopedMuteThreadSamples no_reentrancy_scope;
  std::vector<SamplesObserver*> observers_copy;
  {
    AutoLock lock(mutex_);
    observers_copy = observers_;
    sampled_addresses_set().Remove(address);
  }
  for (base::PoissonAllocationSampler::SamplesObserver* observer :
       observers_copy) {
    observer->SampleRemoved(address);
  }
}

void PoissonAllocationSampler::BalanceAddressesHashSet() {
  // If the load factor of the current addresses hash set exceeds 1, allocate
  // a new set twice as large, copy all the data into it, and switch to using
  // it. No writes are made to either set during the copy, since it happens
  // under the lock. Readers continue to use the old set until the atomic
  // switch takes place.
  LockFreeAddressHashSet& current_set = sampled_addresses_set();
  if (current_set.load_factor() < 1) {
    return;
  }
  auto new_set =
      std::make_unique<LockFreeAddressHashSet>(current_set.buckets_count() * 2);
  new_set->Copy(current_set);
  // Atomically switch all the new readers to the new set.
  g_sampled_addresses_set.store(new_set.release(), std::memory_order_release);
  // We leak the old set because we still have to keep all the old sets alive,
  // as there might be reader threads that have already obtained the pointer
  // but haven't yet managed to access the set.
}

// static
LockFreeAddressHashSet& PoissonAllocationSampler::sampled_addresses_set() {
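  // The acquire load pairs with the release stores in the constructor and in
  // BalanceAddressesHashSet, so a reader that obtains the pointer also sees
  // the fully initialized (or fully copied) set.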
  return *g_sampled_addresses_set.load(std::memory_order_acquire);
}

// static
PoissonAllocationSampler* PoissonAllocationSampler::Get() {
  static NoDestructor<PoissonAllocationSampler> instance;
  return instance.get();
}

// static
void PoissonAllocationSampler::SetProfilingStateFlag(ProfilingStateFlag flag) {
  ProfilingStateFlagMask flags = flag;
  if (flag == ProfilingStateFlag::kIsRunning) {
    flags |= ProfilingStateFlag::kWasStarted;
  }
  ProfilingStateFlagMask old_state =
      profiling_state_.fetch_or(flags, std::memory_order_relaxed);
  DCHECK(!(old_state & flag));
}

// static
void PoissonAllocationSampler::ResetProfilingStateFlag(
    ProfilingStateFlag flag) {
  DCHECK_NE(flag, kWasStarted);
  ProfilingStateFlagMask old_state =
      profiling_state_.fetch_and(~flag, std::memory_order_relaxed);
  DCHECK(old_state & flag);
}

void PoissonAllocationSampler::AddSamplesObserver(SamplesObserver* observer) {
  // The following implementation (including ScopedMuteThreadSamples) will use
  // `thread_local`, which may cause a reentrancy issue. So temporarily
  // disable sampling with a ReentryGuard.
  ReentryGuard guard;

  ScopedMuteThreadSamples no_reentrancy_scope;
  AutoLock lock(mutex_);
  DCHECK(ranges::find(observers_, observer) == observers_.end());
  bool profiler_was_stopped = observers_.empty();
  observers_.push_back(observer);

  // Adding the observer will enable profiling. This will use
  // `g_sampled_addresses_set`, so it had better be initialized.
  DCHECK(g_sampled_addresses_set.load(std::memory_order_relaxed));

  // Start the profiler if this was the first observer. Setting/resetting
  // kIsRunning isn't racy because it's performed based on `observers_.empty()`
  // while holding `mutex_`.
  if (profiler_was_stopped) {
    SetProfilingStateFlag(ProfilingStateFlag::kIsRunning);
  }
  DCHECK(profiling_state_.load(std::memory_order_relaxed) &
         ProfilingStateFlag::kIsRunning);
}

void PoissonAllocationSampler::RemoveSamplesObserver(
    SamplesObserver* observer) {
  // The following implementation (including ScopedMuteThreadSamples) will use
  // `thread_local`, which may cause a reentrancy issue. So temporarily
  // disable sampling with a ReentryGuard.
  ReentryGuard guard;

  ScopedMuteThreadSamples no_reentrancy_scope;
  AutoLock lock(mutex_);
  auto it = ranges::find(observers_, observer);
  CHECK(it != observers_.end(), base::NotFatalUntil::M125);
  observers_.erase(it);

  // Stop the profiler if there are no more observers. Setting/resetting
  // kIsRunning isn't racy because it's performed based on `observers_.empty()`
  // while holding `mutex_`.
  DCHECK(profiling_state_.load(std::memory_order_relaxed) &
         ProfilingStateFlag::kIsRunning);
  if (observers_.empty()) {
    ResetProfilingStateFlag(ProfilingStateFlag::kIsRunning);
  }
}

}  // namespace base