// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/sampling_heap_profiler/poisson_allocation_sampler.h"

#include <atomic>
#include <cmath>
#include <memory>
#include <utility>

#include "base/allocator/buildflags.h"
#include "base/allocator/dispatcher/reentry_guard.h"
#include "base/allocator/dispatcher/tls.h"
#include "base/check.h"
#include "base/compiler_specific.h"
#include "base/no_destructor.h"
#include "base/rand_util.h"
#include "base/ranges/algorithm.h"
#include "build/build_config.h"
#include "third_party/abseil-cpp/absl/base/attributes.h"

#if !BUILDFLAG(USE_ALLOCATION_EVENT_DISPATCHER)
#include "base/allocator/dispatcher/standard_hooks.h"
#endif

namespace base {

namespace {

using ::base::allocator::dispatcher::ReentryGuard;

const size_t kDefaultSamplingIntervalBytes = 128 * 1024;

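// Large fixed offset that ScopedMuteThreadSamples subtracts from a thread's
// accumulated byte count while samples are muted, so that muted allocations
// are unlikely to cross the sampling threshold.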
const intptr_t kAccumulatedBytesOffset = 1 << 29;

// When set, sampling intervals are not randomized. Used for testing.
bool g_deterministic = false;

// Pointer to the current |LockFreeAddressHashSet|.
std::atomic<LockFreeAddressHashSet*> g_sampled_addresses_set{nullptr};

// Sampling interval parameter, the mean value for intervals between samples.
std::atomic_size_t g_sampling_interval{kDefaultSamplingIntervalBytes};

struct ThreadLocalData {
  // Accumulated bytes towards a sample.
  intptr_t accumulated_bytes = 0;
  // Used as a workaround to avoid bias from muted samples. See
  // ScopedMuteThreadSamples for more details.
  intptr_t accumulated_bytes_snapshot = 0;
  // PoissonAllocationSampler performs allocations while handling a
  // notification. The guard protects against reentrancy originating from
  // those allocations.
  bool internal_reentry_guard = false;
  // A boolean used to distinguish the first allocation on a thread:
  //   false - first allocation on the thread;
  //   true  - otherwise.
  // Since accumulated_bytes is initialized to zero, the very first
  // allocation on a thread would always trigger a sample, skewing the
  // profile towards such allocations. To mitigate that, the flag ensures
  // that the first allocation is properly accounted for.
  bool sampling_interval_initialized = false;
};

ThreadLocalData* GetThreadLocalData() {
#if USE_LOCAL_TLS_EMULATION()
  // If available, use ThreadLocalStorage to bypass dependencies introduced by
  // Clang's implementation of thread_local.
  static base::NoDestructor<
      base::allocator::dispatcher::ThreadLocalStorage<ThreadLocalData>>
      thread_local_data;
  return thread_local_data->GetThreadLocalData();
#else
  // Notes on TLS usage:
  //
  // * There's no safe way to use TLS in malloc() as neither C++ thread_local
  //   nor pthread gives any guarantee on whether they allocate or not.
  // * We think that we can safely use thread_local w/o a re-entrancy guard
  //   because the compiler will use the "tls static access model" for static
  //   builds of Chrome [https://www.uclibc.org/docs/tls.pdf].
  //   But there's no guarantee that this will stay true, and in practice
  //   it seems to have problems on macOS/Android. These platforms do allocate
  //   on the very first access to a thread_local on each thread.
  // * Directly using/warming-up platform TLS seems to work on all platforms,
  //   but is also not guaranteed to stay true. We make use of it for
  //   reentrancy guards on macOS/Android.
  // * We cannot use the Windows Tls[GS]etValue API as it modifies the result
  //   of GetLastError.
  //
  // Android thread_local seems to be using __emutls_get_address from libgcc:
  // https://github.com/gcc-mirror/gcc/blob/master/libgcc/emutls.c
  // The macOS version is based on _tlv_get_addr from dyld:
  // https://opensource.apple.com/source/dyld/dyld-635.2/src/threadLocalHelpers.s.auto.html
  thread_local ThreadLocalData thread_local_data;
  return &thread_local_data;
#endif
}

}  // namespace

PoissonAllocationSampler::ScopedMuteThreadSamples::ScopedMuteThreadSamples() {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();

  DCHECK(!thread_local_data->internal_reentry_guard);
  thread_local_data->internal_reentry_guard = true;

  // We mute thread samples immediately after taking a sample, which is when
  // `accumulated_bytes` is reset. This breaks the random sampling requirement
  // of the Poisson process, and causes us to systematically overcount all
  // other allocations. That's because muted allocations rarely trigger a
  // sample [which would cause them to be ignored] since they occur right
  // after `accumulated_bytes` is reset.
  //
  // To counteract this, we drop `accumulated_bytes` by a large, fixed amount
  // to lower the probability that a sample is taken to nearly zero. Then we
  // reset it after we're done muting thread samples.
  thread_local_data->accumulated_bytes_snapshot =
      thread_local_data->accumulated_bytes;
  thread_local_data->accumulated_bytes -= kAccumulatedBytesOffset;
}

PoissonAllocationSampler::ScopedMuteThreadSamples::~ScopedMuteThreadSamples() {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  DCHECK(thread_local_data->internal_reentry_guard);
  thread_local_data->internal_reentry_guard = false;
  thread_local_data->accumulated_bytes =
      thread_local_data->accumulated_bytes_snapshot;
}

// static
bool PoissonAllocationSampler::ScopedMuteThreadSamples::IsMuted() {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  return thread_local_data->internal_reentry_guard;
}

PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
    ScopedSuppressRandomnessForTesting() {
  DCHECK(!g_deterministic);
  g_deterministic = true;
  // The accumulated_bytes may contain a random value from previous
  // test runs, which would make the behaviour of the next call to
  // RecordAlloc unpredictable.
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  thread_local_data->accumulated_bytes = 0;
}

PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
    ~ScopedSuppressRandomnessForTesting() {
  DCHECK(g_deterministic);
  g_deterministic = false;
}

// static
bool PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
    IsSuppressed() {
  return g_deterministic;
}

PoissonAllocationSampler::ScopedMuteHookedSamplesForTesting::
    ScopedMuteHookedSamplesForTesting() {
  SetProfilingStateFlag(ProfilingStateFlag::kHookedSamplesMutedForTesting);

  // Reset the accumulated bytes to 0 on this thread.
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  accumulated_bytes_snapshot_ = thread_local_data->accumulated_bytes;
  thread_local_data->accumulated_bytes = 0;
}

PoissonAllocationSampler::ScopedMuteHookedSamplesForTesting::
    ~ScopedMuteHookedSamplesForTesting() {
  // Restore the accumulated bytes.
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  thread_local_data->accumulated_bytes = accumulated_bytes_snapshot_;
  ResetProfilingStateFlag(ProfilingStateFlag::kHookedSamplesMutedForTesting);
}

#if !BUILDFLAG(USE_ALLOCATION_EVENT_DISPATCHER)
// static
PoissonAllocationSampler* PoissonAllocationSampler::instance_ = nullptr;
#endif

// static
ABSL_CONST_INIT std::atomic<PoissonAllocationSampler::ProfilingStateFlagMask>
    PoissonAllocationSampler::profiling_state_{0};

PoissonAllocationSampler::PoissonAllocationSampler() {
#if !BUILDFLAG(USE_ALLOCATION_EVENT_DISPATCHER)
  CHECK_EQ(nullptr, instance_);
  instance_ = this;
#endif

  Init();
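  // Start with a 64-bucket set; BalanceAddressesHashSet() doubles the size
  // whenever the load factor reaches 1.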
  auto* sampled_addresses = new LockFreeAddressHashSet(64);
  g_sampled_addresses_set.store(sampled_addresses, std::memory_order_release);

#if !BUILDFLAG(USE_ALLOCATION_EVENT_DISPATCHER)
  // Install the allocator hooks immediately, to better match the behaviour
  // of base::allocator::Initializer.
  //
  // TODO(crbug/1137393): Use base::allocator::Initializer to install the
  // PoissonAllocationSampler hooks. All observers need to be passed to the
  // initializer at the same time so this will install the hooks even
  // earlier in process startup.
  allocator::dispatcher::InstallStandardAllocatorHooks();
#endif
}

// static
void PoissonAllocationSampler::Init() {
  [[maybe_unused]] static bool init_once = []() {
    // Touch thread local data on initialization to enforce proper setup of
    // the underlying storage system.
    GetThreadLocalData();
    ReentryGuard::InitTLSSlot();
    return true;
  }();
}

void PoissonAllocationSampler::SetSamplingInterval(
    size_t sampling_interval_bytes) {
  // TODO(alph): Reset the sample being collected if running.
  g_sampling_interval = sampling_interval_bytes;
}

size_t PoissonAllocationSampler::SamplingInterval() const {
  return g_sampling_interval.load(std::memory_order_relaxed);
}

// static
size_t PoissonAllocationSampler::GetNextSampleInterval(size_t interval) {
  if (UNLIKELY(g_deterministic)) {
    return interval;
  }

  // We sample with a Poisson process, with a constant average sampling
  // interval. This follows the exponential probability distribution with
  // parameter λ = 1/interval, where |interval| is the average number of bytes
  // between samples.
  // Let u be a uniformly distributed random number in (0,1]; then
  //   next_sample = -ln(u) / λ
  // RandDouble returns numbers in [0,1). We use 1 - RandDouble to map it into
  // (0,1] and avoid a possible floating point exception from taking the log
  // of 0.
  // The allocator shim uses the PoissonAllocationSampler, so we must avoid
  // allocating here to prevent infinite recursion.
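  // For example, with the default 128 KiB mean interval and u = 0.5, this
  // gives next_sample = -ln(0.5) * 131072 ~= 90852 bytes.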
  double uniform = internal::RandDoubleAvoidAllocation();
  double value = -log(1 - uniform) * interval;
  size_t min_value = sizeof(intptr_t);
  // We limit the upper bound of a sample interval to make sure we don't have
  // huge gaps in the sampling stream. The probability of hitting the upper
  // bound is exp(-20) ~ 2e-9, so it should not skew the distribution.
  size_t max_value = interval * 20;
  if (UNLIKELY(value < min_value)) {
    return min_value;
  }
  if (UNLIKELY(value > max_value)) {
    return max_value;
  }
  return static_cast<size_t>(value);
}

void PoissonAllocationSampler::DoRecordAllocation(
    const ProfilingStateFlagMask state,
    void* address,
    size_t size,
    base::allocator::dispatcher::AllocationSubsystem type,
    const char* context) {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();

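  // Between samples `accumulated_bytes` is kept negative; once an allocation
  // pushes it to zero or above, the current sampling interval has been
  // crossed.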
  thread_local_data->accumulated_bytes += size;
  intptr_t accumulated_bytes = thread_local_data->accumulated_bytes;
  if (LIKELY(accumulated_bytes < 0)) {
    return;
  }

  if (UNLIKELY(!(state & ProfilingStateFlag::kIsRunning))) {
    // Sampling was in fact disabled when the hook was called. Reset the state
    // of the sampler. We do this check off the fast-path, because it's quite
    // rare for the sampler to be stopped after it has been started. (The most
    // common caller of PoissonAllocationSampler starts it and leaves it
    // running for the rest of the Chrome session.)
    thread_local_data->sampling_interval_initialized = false;
    thread_local_data->accumulated_bytes = 0;
    return;
  }

  // Failed allocation? Skip the sample.
  if (UNLIKELY(!address)) {
    return;
  }

  size_t mean_interval = g_sampling_interval.load(std::memory_order_relaxed);
  if (UNLIKELY(!thread_local_data->sampling_interval_initialized)) {
    thread_local_data->sampling_interval_initialized = true;
    // This is the very first allocation on the thread. It always gets past
    // the early return above, because accumulated_bytes is zero-initialized
    // by TLS semantics. Generate a proper sampling interval and make sure the
    // allocation has indeed crossed the threshold before counting it as a
    // sample.
    accumulated_bytes -= GetNextSampleInterval(mean_interval);
    if (accumulated_bytes < 0) {
      thread_local_data->accumulated_bytes = accumulated_bytes;
      return;
    }
  }

  // This cast is safe because execution only reaches this point with a
  // non-negative value of `accumulated_bytes`.
  size_t samples = static_cast<size_t>(accumulated_bytes) / mean_interval;
  accumulated_bytes %= mean_interval;

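  // A single large allocation may span several sampling intervals. Count how
  // many intervals were crossed, drawing a fresh randomized interval each
  // time, so the reported total reflects all of them.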
  do {
    accumulated_bytes -= GetNextSampleInterval(mean_interval);
    ++samples;
  } while (accumulated_bytes >= 0);

  thread_local_data->accumulated_bytes = accumulated_bytes;

  if (UNLIKELY(ScopedMuteThreadSamples::IsMuted())) {
    return;
  }

  ScopedMuteThreadSamples no_reentrancy_scope;
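  // Snapshot the observers under the lock and notify them after it is
  // released, so `mutex_` is not held while observer callbacks run.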
  std::vector<SamplesObserver*> observers_copy;
  {
    AutoLock lock(mutex_);

    // TODO(alph): Sometimes RecordAlloc is called twice in a row without
    // a RecordFree in between. Investigate it.
    if (sampled_addresses_set().Contains(address)) {
      return;
    }
    sampled_addresses_set().Insert(address);
    BalanceAddressesHashSet();
    observers_copy = observers_;
  }

  size_t total_allocated = mean_interval * samples;
  for (auto* observer : observers_copy) {
    observer->SampleAdded(address, size, total_allocated, type, context);
  }
}

void PoissonAllocationSampler::DoRecordFree(void* address) {
  // There is a rare case on macOS and Android when the very first
  // thread_local access in the ScopedMuteThreadSamples constructor may
  // allocate and thus reenter DoRecordAlloc. However, the call chain won't
  // build up further as RecordAlloc accesses are guarded with the pthread
  // TLS-based ReentryGuard.
  ScopedMuteThreadSamples no_reentrancy_scope;
  std::vector<SamplesObserver*> observers_copy;
  {
    AutoLock lock(mutex_);
    observers_copy = observers_;
    sampled_addresses_set().Remove(address);
  }
  for (auto* observer : observers_copy) {
    observer->SampleRemoved(address);
  }
}

void PoissonAllocationSampler::BalanceAddressesHashSet() {
  // If the load_factor of the current addresses hash set rises above 1,
  // allocate a new set twice as large, copy all the data into it, and switch
  // to using it.
  // During the copy no writes are made to either set, as everything happens
  // under the lock.
  // Readers continue to use the old set until the atomic switch takes place.
  LockFreeAddressHashSet& current_set = sampled_addresses_set();
  if (current_set.load_factor() < 1) {
    return;
  }
  auto new_set =
      std::make_unique<LockFreeAddressHashSet>(current_set.buckets_count() * 2);
  new_set->Copy(current_set);
  // Atomically switch all new readers to the new set.
  g_sampled_addresses_set.store(new_set.release(), std::memory_order_release);
  // We leak the old set because we still have to keep all old sets alive:
  // there might be reader threads that have already obtained a pointer to the
  // set but haven't yet managed to access it.
}

// static
LockFreeAddressHashSet& PoissonAllocationSampler::sampled_addresses_set() {
  return *g_sampled_addresses_set.load(std::memory_order_acquire);
}

// static
PoissonAllocationSampler* PoissonAllocationSampler::Get() {
  static NoDestructor<PoissonAllocationSampler> instance;
  return instance.get();
}

// static
void PoissonAllocationSampler::SetProfilingStateFlag(ProfilingStateFlag flag) {
  ProfilingStateFlagMask flags = flag;
  if (flag == ProfilingStateFlag::kIsRunning) {
    flags |= ProfilingStateFlag::kWasStarted;
  }
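  // kWasStarted is never reset (ResetProfilingStateFlag disallows it), so it
  // records whether the sampler has ever been running.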
  ProfilingStateFlagMask old_state =
      profiling_state_.fetch_or(flags, std::memory_order_relaxed);
  DCHECK(!(old_state & flag));
}

// static
void PoissonAllocationSampler::ResetProfilingStateFlag(
    ProfilingStateFlag flag) {
  DCHECK_NE(flag, kWasStarted);
  ProfilingStateFlagMask old_state =
      profiling_state_.fetch_and(~flag, std::memory_order_relaxed);
  DCHECK(old_state & flag);
}

void PoissonAllocationSampler::AddSamplesObserver(SamplesObserver* observer) {
  // The following implementation (including ScopedMuteThreadSamples) will use
  // `thread_local`, which may cause a reentrancy issue. So, temporarily
  // disable the sampling by holding a ReentryGuard.
  ReentryGuard guard;

  ScopedMuteThreadSamples no_reentrancy_scope;
  AutoLock lock(mutex_);
  DCHECK(ranges::find(observers_, observer) == observers_.end());
  bool profiler_was_stopped = observers_.empty();
  observers_.push_back(observer);

  // Adding the observer will enable profiling. This will use
  // `g_sampled_addresses_set`, so it had better be initialized.
  DCHECK(g_sampled_addresses_set.load(std::memory_order_relaxed));

  // Start the profiler if this was the first observer. Setting/resetting
  // kIsRunning isn't racy because it's performed based on `observers_.empty()`
  // while holding `mutex_`.
  if (profiler_was_stopped) {
    SetProfilingStateFlag(ProfilingStateFlag::kIsRunning);
  }
  DCHECK(profiling_state_.load(std::memory_order_relaxed) &
         ProfilingStateFlag::kIsRunning);
}

void PoissonAllocationSampler::RemoveSamplesObserver(
    SamplesObserver* observer) {
  // The following implementation (including ScopedMuteThreadSamples) will use
  // `thread_local`, which may cause a reentrancy issue. So, temporarily
  // disable the sampling by holding a ReentryGuard.
  ReentryGuard guard;

  ScopedMuteThreadSamples no_reentrancy_scope;
  AutoLock lock(mutex_);
  auto it = ranges::find(observers_, observer);
  DCHECK(it != observers_.end());
  observers_.erase(it);

  // Stop the profiler if there are no more observers. Setting/resetting
  // kIsRunning isn't racy because it's performed based on `observers_.empty()`
  // while holding `mutex_`.
  DCHECK(profiling_state_.load(std::memory_order_relaxed) &
         ProfilingStateFlag::kIsRunning);
  if (observers_.empty()) {
    ResetProfilingStateFlag(ProfilingStateFlag::kIsRunning);
  }
}

}  // namespace base