// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/sampling_heap_profiler/poisson_allocation_sampler.h"

#include <atomic>
#include <cmath>
#include <memory>
#include <utility>

#include "base/allocator/dispatcher/reentry_guard.h"
#include "base/allocator/dispatcher/tls.h"
#include "base/check.h"
#include "base/compiler_specific.h"
#include "base/no_destructor.h"
#include "base/rand_util.h"
#include "base/ranges/algorithm.h"
#include "build/build_config.h"

namespace base {

namespace {

using ::base::allocator::dispatcher::ReentryGuard;

const size_t kDefaultSamplingIntervalBytes = 128 * 1024;

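// A large, fixed offset (1 << 29 bytes = 512 MiB) that ScopedMuteThreadSamples
// subtracts from a thread's accumulated byte count while its samples are
// muted, making it very unlikely that a sample triggers during that window.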
const intptr_t kAccumulatedBytesOffset = 1 << 29;

// When set, sample intervals are not randomized. Used for testing.
bool g_deterministic = false;

// Pointer to the current |LockFreeAddressHashSet|.
constinit std::atomic<LockFreeAddressHashSet*> g_sampled_addresses_set{nullptr};

// Sampling interval parameter, the mean value for intervals between samples.
constinit std::atomic_size_t g_sampling_interval{kDefaultSamplingIntervalBytes};

struct ThreadLocalData {
  // Accumulated bytes towards a sample.
  intptr_t accumulated_bytes = 0;
  // Used as a workaround to avoid bias from muted samples. See
  // ScopedMuteThreadSamples for more details.
  intptr_t accumulated_bytes_snapshot = 0;
  // PoissonAllocationSampler performs allocations while handling a
  // notification. The guard protects against recursion originating from
  // these.
  bool internal_reentry_guard = false;
  // A boolean used to distinguish the first allocation on a thread:
  //   false - no allocation has been seen on the thread yet;
  //   true  - otherwise.
  // Since accumulated_bytes is initialized to zero, the very first allocation
  // on a thread would always trigger a sample, skewing the profile towards
  // such allocations. To mitigate that, we use this flag to ensure the first
  // allocation is properly accounted for.
  bool sampling_interval_initialized = false;
};

ThreadLocalData* GetThreadLocalData() {
#if USE_LOCAL_TLS_EMULATION()
  // If available, use ThreadLocalStorage to bypass dependencies introduced by
  // Clang's implementation of thread_local.
  static base::NoDestructor<
      base::allocator::dispatcher::ThreadLocalStorage<ThreadLocalData>>
      thread_local_data("poisson_allocation_sampler");
  return thread_local_data->GetThreadLocalData();
#else
  // Notes on TLS usage:
  //
  // * There's no safe way to use TLS in malloc() as neither C++ thread_local
  //   nor pthread gives any guarantee on whether it allocates or not.
  // * We think we can safely use thread_local without a re-entrancy guard
  //   because the compiler will use the "tls static access model" for static
  //   builds of Chrome [https://www.uclibc.org/docs/tls.pdf].
  //   But there's no guarantee that this will stay true, and in practice it
  //   seems to have problems on macOS/Android. These platforms do allocate on
  //   the very first access to a thread_local on each thread.
  // * Directly using/warming-up platform TLS seems to work on all platforms,
  //   but is also not guaranteed to stay true. We make use of it for
  //   reentrancy guards on macOS/Android.
  // * We cannot use the Windows Tls[GS]etValue API as it modifies the result
  //   of GetLastError.
  //
  // Android thread_local seems to use __emutls_get_address from libgcc:
  // https://github.com/gcc-mirror/gcc/blob/master/libgcc/emutls.c
  // The macOS version is based on _tlv_get_addr from dyld:
  // https://opensource.apple.com/source/dyld/dyld-635.2/src/threadLocalHelpers.s.auto.html
  thread_local ThreadLocalData thread_local_data;
  return &thread_local_data;
#endif
}

}  // namespace

PoissonAllocationSampler::ScopedMuteThreadSamples::ScopedMuteThreadSamples() {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();

  DCHECK(!thread_local_data->internal_reentry_guard);
  thread_local_data->internal_reentry_guard = true;

  // We mute thread samples immediately after taking a sample, which is when
  // accumulated_bytes is reset. This breaks the random sampling requirement
  // of the Poisson process, and causes us to systematically overcount all
  // other allocations. That's because muted allocations rarely trigger a
  // sample (which would cause them to be ignored) since they occur right
  // after accumulated_bytes is reset.
  //
  // To counteract this, we drop accumulated_bytes by a large, fixed amount so
  // that the probability of taking a sample while muted is close to 0. Then
  // we restore the previous value once we're done muting thread samples.
  thread_local_data->accumulated_bytes_snapshot =
      thread_local_data->accumulated_bytes;
  thread_local_data->accumulated_bytes -= kAccumulatedBytesOffset;
}

PoissonAllocationSampler::ScopedMuteThreadSamples::~ScopedMuteThreadSamples() {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  DCHECK(thread_local_data->internal_reentry_guard);
  thread_local_data->internal_reentry_guard = false;
  thread_local_data->accumulated_bytes =
      thread_local_data->accumulated_bytes_snapshot;
}

// static
bool PoissonAllocationSampler::ScopedMuteThreadSamples::IsMuted() {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  return thread_local_data->internal_reentry_guard;
}

PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
    ScopedSuppressRandomnessForTesting() {
  DCHECK(!g_deterministic);
  g_deterministic = true;
  // The accumulated_bytes may contain a random value from previous
  // test runs, which would make the behaviour of the next call to
  // RecordAlloc unpredictable.
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  thread_local_data->accumulated_bytes = 0;
}

PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
    ~ScopedSuppressRandomnessForTesting() {
  DCHECK(g_deterministic);
  g_deterministic = false;
}

// static
bool PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
    IsSuppressed() {
  return g_deterministic;
}

PoissonAllocationSampler::ScopedMuteHookedSamplesForTesting::
    ScopedMuteHookedSamplesForTesting() {
  SetProfilingStateFlag(ProfilingStateFlag::kHookedSamplesMutedForTesting);

  // Reset the accumulated bytes to 0 on this thread.
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  accumulated_bytes_snapshot_ = thread_local_data->accumulated_bytes;
  thread_local_data->accumulated_bytes = 0;
}

PoissonAllocationSampler::ScopedMuteHookedSamplesForTesting::
    ~ScopedMuteHookedSamplesForTesting() {
  // Restore the accumulated bytes.
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  thread_local_data->accumulated_bytes = accumulated_bytes_snapshot_;
  ResetProfilingStateFlag(ProfilingStateFlag::kHookedSamplesMutedForTesting);
}

PoissonAllocationSampler::ScopedMuteHookedSamplesForTesting::
    ScopedMuteHookedSamplesForTesting(ScopedMuteHookedSamplesForTesting&&) =
        default;

PoissonAllocationSampler::ScopedMuteHookedSamplesForTesting&
PoissonAllocationSampler::ScopedMuteHookedSamplesForTesting::operator=(
    ScopedMuteHookedSamplesForTesting&&) = default;

// static
constinit std::atomic<PoissonAllocationSampler::ProfilingStateFlagMask>
    PoissonAllocationSampler::profiling_state_{0};

PoissonAllocationSampler::PoissonAllocationSampler() {
  Init();
  auto* sampled_addresses = new LockFreeAddressHashSet(64);
  g_sampled_addresses_set.store(sampled_addresses, std::memory_order_release);
}

// static
void PoissonAllocationSampler::Init() {
  [[maybe_unused]] static bool init_once = [] {
    // Touch thread local data on initialization to enforce proper setup of
    // underlying storage system.
    GetThreadLocalData();
    ReentryGuard::InitTLSSlot();
    return true;
  }();
}

void PoissonAllocationSampler::SetSamplingInterval(
    size_t sampling_interval_bytes) {
  // TODO(alph): Reset the sample being collected if running.
  g_sampling_interval.store(sampling_interval_bytes, std::memory_order_relaxed);
}

size_t PoissonAllocationSampler::SamplingInterval() const {
  return g_sampling_interval.load(std::memory_order_relaxed);
}

// static
size_t PoissonAllocationSampler::GetNextSampleInterval(size_t interval) {
  if (g_deterministic) [[unlikely]] {
    return interval;
  }

  // We sample with a Poisson process, with a constant average sampling
  // interval. The intervals between samples thus follow the exponential
  // distribution with parameter λ = 1/interval, where |interval| is the
  // average number of bytes between samples.
  // Let u be a uniformly distributed random number in (0,1]; then
  // next_sample = -ln(u) / λ.
  // RandDouble returns numbers in [0,1). We use 1 - RandDouble to map it to
  // (0,1] and avoid a possible floating point exception from taking the log
  // of 0.
  // The allocator shim uses the PoissonAllocationSampler, so we must avoid
  // allocating here to prevent infinite recursion.
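  // For example, with the default 128 KiB mean interval and u = 0.5, the next
  // sample would be taken after roughly -ln(0.5) * 128 KiB ≈ 88.7 KiB of
  // allocations.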
  double uniform = internal::RandDoubleAvoidAllocation();
  double value = -log(1 - uniform) * interval;
  size_t min_value = sizeof(intptr_t);
  // We limit the upper bound of a sample interval to make sure we don't have
  // huge gaps in the sampling stream. The probability of hitting the upper
  // bound is exp(-20) ~ 2e-9, so it should not skew the distribution.
  size_t max_value = interval * 20;
  if (value < min_value) [[unlikely]] {
    return min_value;
  }
  if (value > max_value) [[unlikely]] {
    return max_value;
  }
  return static_cast<size_t>(value);
}

void PoissonAllocationSampler::DoRecordAllocation(
    const ProfilingStateFlagMask state,
    void* address,
    size_t size,
    base::allocator::dispatcher::AllocationSubsystem type,
    const char* context) {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();

  thread_local_data->accumulated_bytes += size;
  intptr_t accumulated_bytes = thread_local_data->accumulated_bytes;
  if (accumulated_bytes < 0) [[likely]] {
    return;
  }

  if (!(state & ProfilingStateFlag::kIsRunning)) [[unlikely]] {
    // Sampling was in fact disabled when the hook was called. Reset the
    // sampler's thread-local state. We do this check off the fast path
    // because it's rare for the sampler to be stopped once it has started.
    // (The most common caller of PoissonAllocationSampler starts it and
    // leaves it running for the rest of the Chrome session.)
    thread_local_data->sampling_interval_initialized = false;
    thread_local_data->accumulated_bytes = 0;
    return;
  }

  // Failed allocation? Skip the sample.
  if (!address) [[unlikely]] {
    return;
  }

  size_t mean_interval = g_sampling_interval.load(std::memory_order_relaxed);
  if (!thread_local_data->sampling_interval_initialized) [[unlikely]] {
    thread_local_data->sampling_interval_initialized = true;
    // This is the very first allocation on the thread. It always passes the
    // check in |RecordAlloc| because accumulated_bytes is zero-initialized
    // due to TLS semantics.
    // Generate a proper sampling interval and make sure the allocation has
    // indeed crossed the threshold before counting it as a sample.
    accumulated_bytes -= GetNextSampleInterval(mean_interval);
    if (accumulated_bytes < 0) {
      thread_local_data->accumulated_bytes = accumulated_bytes;
      return;
    }
  }

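  // A single large allocation can span several sampling intervals, in which
  // case it is counted as multiple samples: take as many whole mean intervals
  // as fit into accumulated_bytes, then keep drawing random intervals until
  // the remainder goes negative again.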
  // This cast is safe because this point is only reached with a non-negative
  // value of `accumulated_bytes`.
  size_t samples = static_cast<size_t>(accumulated_bytes) / mean_interval;
  accumulated_bytes %= mean_interval;

  do {
    accumulated_bytes -= GetNextSampleInterval(mean_interval);
    ++samples;
  } while (accumulated_bytes >= 0);

  thread_local_data->accumulated_bytes = accumulated_bytes;

  if (ScopedMuteThreadSamples::IsMuted()) [[unlikely]] {
    return;
  }

  ScopedMuteThreadSamples no_reentrancy_scope;
  std::vector<SamplesObserver*> observers_copy;
  {
    AutoLock lock(mutex_);

    // TODO(alph): Sometimes RecordAlloc is called twice in a row without
    // a RecordFree in between. Investigate it.
    if (sampled_addresses_set().Contains(address)) {
      return;
    }
    sampled_addresses_set().Insert(address);
    BalanceAddressesHashSet();
    observers_copy = observers_;
  }

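  // Each sample statistically represents about `mean_interval` bytes of
  // allocation, so the total amount of memory attributed to this sample is
  // the mean interval times the number of samples it spans.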
  size_t total_allocated = mean_interval * samples;
  for (base::PoissonAllocationSampler::SamplesObserver* observer :
       observers_copy) {
    observer->SampleAdded(address, size, total_allocated, type, context);
  }
}

void PoissonAllocationSampler::DoRecordFree(void* address) {
  // There is a rare case on macOS and Android where the very first
  // thread_local access in the ScopedMuteThreadSamples constructor may
  // allocate and thus reenter DoRecordAlloc. However, the call chain won't
  // build up further because RecordAlloc accesses are guarded by the
  // pthread-TLS-based ReentryGuard.
  ScopedMuteThreadSamples no_reentrancy_scope;
  std::vector<SamplesObserver*> observers_copy;
  {
    AutoLock lock(mutex_);
    observers_copy = observers_;
    sampled_addresses_set().Remove(address);
  }
  for (base::PoissonAllocationSampler::SamplesObserver* observer :
       observers_copy) {
    observer->SampleRemoved(address);
  }
}

void PoissonAllocationSampler::BalanceAddressesHashSet() {
  // If the load_factor of the current addresses hash set exceeds 1, allocate
  // a new set twice as large, copy all the data into it, and switch to using
  // it.
  // No other writes are made to either set during the copy, since it happens
  // under the lock.
  // All readers continue to use the old set until the atomic switch takes
  // place.
  LockFreeAddressHashSet& current_set = sampled_addresses_set();
  if (current_set.load_factor() < 1) {
    return;
  }
  auto new_set =
      std::make_unique<LockFreeAddressHashSet>(current_set.buckets_count() * 2);
  new_set->Copy(current_set);
  // Atomically switch all new readers to the new set.
  g_sampled_addresses_set.store(new_set.release(), std::memory_order_release);
  // We leak the old set because we have to keep all old sets alive: there
  // might be reader threads that have already obtained a pointer to one but
  // haven't yet accessed it.
}

// static
LockFreeAddressHashSet& PoissonAllocationSampler::sampled_addresses_set() {
  return *g_sampled_addresses_set.load(std::memory_order_acquire);
}

// static
PoissonAllocationSampler* PoissonAllocationSampler::Get() {
  static NoDestructor<PoissonAllocationSampler> instance;
  return instance.get();
}

// static
void PoissonAllocationSampler::SetProfilingStateFlag(ProfilingStateFlag flag) {
  ProfilingStateFlagMask flags = flag;
  if (flag == ProfilingStateFlag::kIsRunning) {
    flags |= ProfilingStateFlag::kWasStarted;
  }
  ProfilingStateFlagMask old_state =
      profiling_state_.fetch_or(flags, std::memory_order_relaxed);
  DCHECK(!(old_state & flag));
}

// static
void PoissonAllocationSampler::ResetProfilingStateFlag(
    ProfilingStateFlag flag) {
  DCHECK_NE(flag, kWasStarted);
  ProfilingStateFlagMask old_state =
      profiling_state_.fetch_and(~flag, std::memory_order_relaxed);
  DCHECK(old_state & flag);
}

void PoissonAllocationSampler::AddSamplesObserver(SamplesObserver* observer) {
  // The implementation below (including ScopedMuteThreadSamples) uses
  // `thread_local`, which may cause a reentrancy issue. So temporarily
  // disable sampling with a ReentryGuard.
  ReentryGuard guard;

  ScopedMuteThreadSamples no_reentrancy_scope;
  AutoLock lock(mutex_);
  DCHECK(ranges::find(observers_, observer) == observers_.end());
  bool profiler_was_stopped = observers_.empty();
  observers_.push_back(observer);

  // Adding the observer will enable profiling. This will use
  // `g_sampled_addresses_set`, so it had better be initialized.
  DCHECK(g_sampled_addresses_set.load(std::memory_order_relaxed));

  // Start the profiler if this was the first observer. Setting/resetting
  // kIsRunning isn't racy because it's performed based on `observers_.empty()`
  // while holding `mutex_`.
  if (profiler_was_stopped) {
    SetProfilingStateFlag(ProfilingStateFlag::kIsRunning);
  }
  DCHECK(profiling_state_.load(std::memory_order_relaxed) &
         ProfilingStateFlag::kIsRunning);
}

void PoissonAllocationSampler::RemoveSamplesObserver(
    SamplesObserver* observer) {
  // The implementation below (including ScopedMuteThreadSamples) uses
  // `thread_local`, which may cause a reentrancy issue. So temporarily
  // disable sampling with a ReentryGuard.
  ReentryGuard guard;

  ScopedMuteThreadSamples no_reentrancy_scope;
  AutoLock lock(mutex_);
  auto it = ranges::find(observers_, observer);
  CHECK(it != observers_.end(), base::NotFatalUntil::M125);
  observers_.erase(it);

  // Stop the profiler if there are no more observers. Setting/resetting
  // kIsRunning isn't racy because it's performed based on `observers_.empty()`
  // while holding `mutex_`.
  DCHECK(profiling_state_.load(std::memory_order_relaxed) &
         ProfilingStateFlag::kIsRunning);
  if (observers_.empty()) {
    ResetProfilingStateFlag(ProfilingStateFlag::kIsRunning);
  }
}

}  // namespace base