// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/sampling_heap_profiler/poisson_allocation_sampler.h"

#include <atomic>
#include <cmath>
#include <memory>
#include <utility>

#include "base/allocator/dispatcher/reentry_guard.h"
#include "base/allocator/dispatcher/tls.h"
#include "base/check.h"
#include "base/compiler_specific.h"
#include "base/no_destructor.h"
#include "base/rand_util.h"
#include "base/ranges/algorithm.h"
#include "build/build_config.h"
#include "third_party/abseil-cpp/absl/base/attributes.h"

namespace base {

namespace {

using ::base::allocator::dispatcher::ReentryGuard;

const size_t kDefaultSamplingIntervalBytes = 128 * 1024;

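// Offset subtracted from a thread's accumulated byte count while its samples
// are muted (see ScopedMuteThreadSamples below). 1 << 29 bytes is 512 MiB,
// far larger than the default sampling interval.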
const intptr_t kAccumulatedBytesOffset = 1 << 29;

// When true, sample intervals are not randomized. Used for testing.
bool g_deterministic = false;

// Pointer to the current |LockFreeAddressHashSet|.
ABSL_CONST_INIT std::atomic<LockFreeAddressHashSet*> g_sampled_addresses_set{
    nullptr};

// Sampling interval parameter: the mean number of bytes between samples.
ABSL_CONST_INIT std::atomic_size_t g_sampling_interval{
    kDefaultSamplingIntervalBytes};

struct ThreadLocalData {
  // Accumulated bytes towards a sample.
  intptr_t accumulated_bytes = 0;
  // Used as a workaround to avoid bias from muted samples. See
  // ScopedMuteThreadSamples for more details.
  intptr_t accumulated_bytes_snapshot = 0;
  // PoissonAllocationSampler performs allocations while handling a
  // notification. The guard protects against recursion originating from
  // these.
  bool internal_reentry_guard = false;
  // A boolean used to distinguish the first allocation on a thread:
  //   false - the first allocation hasn't happened yet;
  //   true  - otherwise.
  // Since accumulated_bytes is initialized to zero, the very first allocation
  // on a thread would always trigger a sample, skewing the profile towards
  // such allocations. To mitigate that, we use this flag to make sure the
  // first allocation is properly accounted for.
  bool sampling_interval_initialized = false;
};

ThreadLocalData* GetThreadLocalData() {
#if USE_LOCAL_TLS_EMULATION()
  // If available, use ThreadLocalStorage to bypass dependencies introduced by
  // Clang's implementation of thread_local.
  static base::NoDestructor<
      base::allocator::dispatcher::ThreadLocalStorage<ThreadLocalData>>
      thread_local_data("poisson_allocation_sampler");
  return thread_local_data->GetThreadLocalData();
#else
  // Notes on TLS usage:
  //
  // * There's no safe way to use TLS in malloc() as neither C++ thread_local
  //   nor pthread TLS gives any guarantee about whether it allocates or not.
  // * We think that we can safely use thread_local without a re-entrancy
  //   guard because the compiler will use the "TLS static access model" for
  //   static builds of Chrome [https://www.uclibc.org/docs/tls.pdf].
  //   But there's no guarantee that this will stay true, and in practice
  //   it seems to have problems on macOS/Android. These platforms do allocate
  //   on the very first access to a thread_local on each thread.
  // * Directly using/warming-up platform TLS seems to work on all platforms,
  //   but is also not guaranteed to stay true. We make use of it for
  //   reentrancy guards on macOS/Android.
  // * We cannot use the Windows Tls[GS]etValue API as it modifies the result
  //   of GetLastError.
  //
  // Android thread_local seems to be using __emutls_get_address from libgcc:
  // https://github.com/gcc-mirror/gcc/blob/master/libgcc/emutls.c
  // The macOS version is based on _tlv_get_addr from dyld:
  // https://opensource.apple.com/source/dyld/dyld-635.2/src/threadLocalHelpers.s.auto.html
  thread_local ThreadLocalData thread_local_data;
  return &thread_local_data;
#endif
}

}  // namespace

PoissonAllocationSampler::ScopedMuteThreadSamples::ScopedMuteThreadSamples() {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();

  DCHECK(!thread_local_data->internal_reentry_guard);
  thread_local_data->internal_reentry_guard = true;

  // We mute thread samples immediately after taking a sample, which is when
  // we reset accumulated_bytes. This breaks the random sampling requirement
  // of the Poisson process, and causes us to systematically overcount all
  // other allocations. That's because muted allocations rarely trigger a
  // sample (which would cause them to be ignored) since they occur right
  // after accumulated_bytes is reset.
  //
  // To counteract this, we drop accumulated_bytes by a large, fixed amount to
  // bring the probability that a sample is taken close to zero. Then we
  // restore it after we're done muting thread samples.
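  // For scale: kAccumulatedBytesOffset is 1 << 29 bytes (512 MiB), while the
  // default mean sampling interval is 128 KiB, so allocations made inside a
  // muted scope would have to total hundreds of megabytes before
  // accumulated_bytes could climb back above zero and trigger a sample.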
  thread_local_data->accumulated_bytes_snapshot =
      thread_local_data->accumulated_bytes;
  thread_local_data->accumulated_bytes -= kAccumulatedBytesOffset;
}

PoissonAllocationSampler::ScopedMuteThreadSamples::~ScopedMuteThreadSamples() {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  DCHECK(thread_local_data->internal_reentry_guard);
  thread_local_data->internal_reentry_guard = false;
  thread_local_data->accumulated_bytes =
      thread_local_data->accumulated_bytes_snapshot;
}

// static
bool PoissonAllocationSampler::ScopedMuteThreadSamples::IsMuted() {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  return thread_local_data->internal_reentry_guard;
}

PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
    ScopedSuppressRandomnessForTesting() {
  DCHECK(!g_deterministic);
  g_deterministic = true;
  // The accumulated_bytes may contain a random value from previous
  // test runs, which would make the behaviour of the next call to
  // RecordAlloc unpredictable.
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  thread_local_data->accumulated_bytes = 0;
}

PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
    ~ScopedSuppressRandomnessForTesting() {
  DCHECK(g_deterministic);
  g_deterministic = false;
}

// static
bool PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
    IsSuppressed() {
  return g_deterministic;
}

PoissonAllocationSampler::ScopedMuteHookedSamplesForTesting::
    ScopedMuteHookedSamplesForTesting() {
  SetProfilingStateFlag(ProfilingStateFlag::kHookedSamplesMutedForTesting);

  // Reset the accumulated bytes to 0 on this thread.
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  accumulated_bytes_snapshot_ = thread_local_data->accumulated_bytes;
  thread_local_data->accumulated_bytes = 0;
}

PoissonAllocationSampler::ScopedMuteHookedSamplesForTesting::
    ~ScopedMuteHookedSamplesForTesting() {
  // Restore the accumulated bytes.
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  thread_local_data->accumulated_bytes = accumulated_bytes_snapshot_;
  ResetProfilingStateFlag(ProfilingStateFlag::kHookedSamplesMutedForTesting);
}

// static
ABSL_CONST_INIT std::atomic<PoissonAllocationSampler::ProfilingStateFlagMask>
    PoissonAllocationSampler::profiling_state_{0};

PoissonAllocationSampler::PoissonAllocationSampler() {
  Init();
  auto* sampled_addresses = new LockFreeAddressHashSet(64);
  g_sampled_addresses_set.store(sampled_addresses, std::memory_order_release);
}

// static
void PoissonAllocationSampler::Init() {
  [[maybe_unused]] static bool init_once = []() {
    // Touch thread local data on initialization to enforce proper setup of
    // the underlying storage system.
    GetThreadLocalData();
    ReentryGuard::InitTLSSlot();
    return true;
  }();
}

void PoissonAllocationSampler::SetSamplingInterval(
    size_t sampling_interval_bytes) {
  // TODO(alph): Reset the sample being collected if running.
  g_sampling_interval.store(sampling_interval_bytes, std::memory_order_relaxed);
}

size_t PoissonAllocationSampler::SamplingInterval() const {
  return g_sampling_interval.load(std::memory_order_relaxed);
}

// static
size_t PoissonAllocationSampler::GetNextSampleInterval(size_t interval) {
  if (UNLIKELY(g_deterministic)) {
    return interval;
  }

  // We sample with a Poisson process, with a constant average sampling
  // interval. This follows the exponential probability distribution with
  // parameter λ = 1/interval, where |interval| is the average number of bytes
  // between samples.
  // Let u be a uniformly distributed random number in (0,1]; then
  //   next_sample = -ln(u) / λ
  // RandDouble returns numbers in [0,1). We use 1 - RandDouble to map it to
  // (0,1] and avoid a possible floating point exception from taking the log
  // of 0.
  // The allocator shim uses the PoissonAllocationSampler, so we must avoid
  // allocating here to prevent infinite recursion.
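  // As an illustrative worked example (numbers chosen for exposition, not
  // taken from the code): with the default interval of 128 KiB (131072 bytes)
  // and a uniform draw u = 0.5, the next interval is
  //   -ln(0.5) * 131072 ≈ 0.693 * 131072 ≈ 90852 bytes,
  // and on average the drawn intervals come out to |interval| bytes.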
  double uniform = internal::RandDoubleAvoidAllocation();
  double value = -log(1 - uniform) * interval;
  size_t min_value = sizeof(intptr_t);
  // We limit the upper bound of a sample interval to make sure we don't have
  // huge gaps in the sampling stream. The probability of hitting the upper
  // bound is exp(-20) ~ 2e-9, so it should not skew the distribution.
  size_t max_value = interval * 20;
  if (UNLIKELY(value < min_value)) {
    return min_value;
  }
  if (UNLIKELY(value > max_value)) {
    return max_value;
  }
  return static_cast<size_t>(value);
}

void PoissonAllocationSampler::DoRecordAllocation(
    const ProfilingStateFlagMask state,
    void* address,
    size_t size,
    base::allocator::dispatcher::AllocationSubsystem type,
    const char* context) {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();

  thread_local_data->accumulated_bytes += size;
  intptr_t accumulated_bytes = thread_local_data->accumulated_bytes;
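  // Fast path: between samples accumulated_bytes stays negative (a randomized
  // sample interval is subtracted from it below each time a sample is taken),
  // so most allocations return here without locking or notifying observers.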
  if (LIKELY(accumulated_bytes < 0)) {
    return;
  }

  if (UNLIKELY(!(state & ProfilingStateFlag::kIsRunning))) {
    // Sampling was in fact disabled when the hook was called. Reset the
    // sampler's state. We do this check off the fast path because it's quite
    // rare for the sampler to be stopped after it has been started. (The most
    // common caller of PoissonAllocationSampler starts it and leaves it
    // running for the rest of the Chrome session.)
    thread_local_data->sampling_interval_initialized = false;
    thread_local_data->accumulated_bytes = 0;
    return;
  }

  // Failed allocation? Skip the sample.
  if (UNLIKELY(!address)) {
    return;
  }

  size_t mean_interval = g_sampling_interval.load(std::memory_order_relaxed);
  if (UNLIKELY(!thread_local_data->sampling_interval_initialized)) {
    thread_local_data->sampling_interval_initialized = true;
    // This is the very first allocation on the thread. It always passes the
    // check in |RecordAlloc| because accumulated_bytes is initialized to zero
    // due to TLS semantics.
    // Generate a proper sampling interval and make sure the allocation has
    // indeed crossed the threshold before counting it as a sample.
    accumulated_bytes -= GetNextSampleInterval(mean_interval);
    if (accumulated_bytes < 0) {
      thread_local_data->accumulated_bytes = accumulated_bytes;
      return;
    }
  }

  // This cast is safe because this code only runs with a non-negative value
  // of `accumulated_bytes`.
  size_t samples = static_cast<size_t>(accumulated_bytes) / mean_interval;
  accumulated_bytes %= mean_interval;

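  // `accumulated_bytes` is now in [0, mean_interval). Keep drawing randomized
  // intervals until it goes negative again, counting one sample per draw; the
  // negative remainder carries over as the deficit for later allocations.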
  do {
    accumulated_bytes -= GetNextSampleInterval(mean_interval);
    ++samples;
  } while (accumulated_bytes >= 0);

  thread_local_data->accumulated_bytes = accumulated_bytes;

  if (UNLIKELY(ScopedMuteThreadSamples::IsMuted())) {
    return;
  }

  ScopedMuteThreadSamples no_reentrancy_scope;
  std::vector<SamplesObserver*> observers_copy;
  {
    AutoLock lock(mutex_);

    // TODO(alph): Sometimes RecordAlloc is called twice in a row without
    // a RecordFree in between. Investigate it.
    if (sampled_addresses_set().Contains(address)) {
      return;
    }
    sampled_addresses_set().Insert(address);
    BalanceAddressesHashSet();
    observers_copy = observers_;
  }

  size_t total_allocated = mean_interval * samples;
  for (auto* observer : observers_copy) {
    observer->SampleAdded(address, size, total_allocated, type, context);
  }
}

void PoissonAllocationSampler::DoRecordFree(void* address) {
  // There is a rare case on macOS and Android where the very first
  // thread_local access in the ScopedMuteThreadSamples constructor may
  // allocate and thus reenter DoRecordAlloc. However, the call chain won't
  // build up further, as RecordAlloc accesses are guarded with the pthread
  // TLS-based ReentryGuard.
  ScopedMuteThreadSamples no_reentrancy_scope;
  std::vector<SamplesObserver*> observers_copy;
  {
    AutoLock lock(mutex_);
    observers_copy = observers_;
    sampled_addresses_set().Remove(address);
  }
  for (auto* observer : observers_copy) {
    observer->SampleRemoved(address);
  }
}

void PoissonAllocationSampler::BalanceAddressesHashSet() {
  // If the load factor of the current addresses hash set rises above 1,
  // allocate a new set twice as large, copy all the data into it, and switch
  // to using it.
  // No other writes are made to either set during the copy, since it happens
  // under the lock.
  // All readers continue to use the old set until the atomic switch takes
  // place.
  LockFreeAddressHashSet& current_set = sampled_addresses_set();
  if (current_set.load_factor() < 1) {
    return;
  }
  auto new_set =
      std::make_unique<LockFreeAddressHashSet>(current_set.buckets_count() * 2);
  new_set->Copy(current_set);
  // Atomically switch all new readers to the new set.
  g_sampled_addresses_set.store(new_set.release(), std::memory_order_release);
  // We leak the old set because all previous sets must be kept alive: there
  // may be reader threads that have already obtained a pointer to the old set
  // but haven't yet dereferenced it.
}

// static
LockFreeAddressHashSet& PoissonAllocationSampler::sampled_addresses_set() {
  return *g_sampled_addresses_set.load(std::memory_order_acquire);
}

// static
PoissonAllocationSampler* PoissonAllocationSampler::Get() {
  static NoDestructor<PoissonAllocationSampler> instance;
  return instance.get();
}

// static
void PoissonAllocationSampler::SetProfilingStateFlag(ProfilingStateFlag flag) {
  ProfilingStateFlagMask flags = flag;
  if (flag == ProfilingStateFlag::kIsRunning) {
    flags |= ProfilingStateFlag::kWasStarted;
  }
  ProfilingStateFlagMask old_state =
      profiling_state_.fetch_or(flags, std::memory_order_relaxed);
  DCHECK(!(old_state & flag));
}

// static
void PoissonAllocationSampler::ResetProfilingStateFlag(
    ProfilingStateFlag flag) {
  DCHECK_NE(flag, kWasStarted);
  ProfilingStateFlagMask old_state =
      profiling_state_.fetch_and(~flag, std::memory_order_relaxed);
  DCHECK(old_state & flag);
}

void PoissonAllocationSampler::AddSamplesObserver(SamplesObserver* observer) {
  // The following implementation (including ScopedMuteThreadSamples) will use
  // `thread_local`, which may cause a reentrancy issue. So temporarily
  // disable sampling with a ReentryGuard.
  ReentryGuard guard;

  ScopedMuteThreadSamples no_reentrancy_scope;
  AutoLock lock(mutex_);
  DCHECK(ranges::find(observers_, observer) == observers_.end());
  bool profiler_was_stopped = observers_.empty();
  observers_.push_back(observer);

  // Adding the observer will enable profiling. This will use
  // `g_sampled_addresses_set`, so it had better be initialized.
  DCHECK(g_sampled_addresses_set.load(std::memory_order_relaxed));

  // Start the profiler if this was the first observer. Setting/resetting
  // kIsRunning isn't racy because it's performed based on `observers_.empty()`
  // while holding `mutex_`.
  if (profiler_was_stopped) {
    SetProfilingStateFlag(ProfilingStateFlag::kIsRunning);
  }
  DCHECK(profiling_state_.load(std::memory_order_relaxed) &
         ProfilingStateFlag::kIsRunning);
}

void PoissonAllocationSampler::RemoveSamplesObserver(
    SamplesObserver* observer) {
  // The following implementation (including ScopedMuteThreadSamples) will use
  // `thread_local`, which may cause a reentrancy issue. So temporarily
  // disable sampling with a ReentryGuard.
  ReentryGuard guard;

  ScopedMuteThreadSamples no_reentrancy_scope;
  AutoLock lock(mutex_);
  auto it = ranges::find(observers_, observer);
  DCHECK(it != observers_.end());
  observers_.erase(it);

  // Stop the profiler if there are no more observers. Setting/resetting
  // kIsRunning isn't racy because it's performed based on `observers_.empty()`
  // while holding `mutex_`.
  DCHECK(profiling_state_.load(std::memory_order_relaxed) &
         ProfilingStateFlag::kIsRunning);
  if (observers_.empty()) {
    ResetProfilingStateFlag(ProfilingStateFlag::kIsRunning);
  }
}

}  // namespace base