// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/sampling_heap_profiler/poisson_allocation_sampler.h"

#include <atomic>
#include <cmath>
#include <memory>
#include <utility>

#include "base/allocator/buildflags.h"
#include "base/allocator/dispatcher/reentry_guard.h"
#include "base/allocator/dispatcher/tls.h"
#include "base/check.h"
#include "base/compiler_specific.h"
#include "base/no_destructor.h"
#include "base/rand_util.h"
#include "base/ranges/algorithm.h"
#include "build/build_config.h"
#include "third_party/abseil-cpp/absl/base/attributes.h"

#if !BUILDFLAG(USE_ALLOCATION_EVENT_DISPATCHER)
#include "base/allocator/dispatcher/standard_hooks.h"
#endif

namespace base {

namespace {

using ::base::allocator::dispatcher::ReentryGuard;

const size_t kDefaultSamplingIntervalBytes = 128 * 1024;

const intptr_t kAccumulatedBytesOffset = 1 << 29;

// When true, sample intervals are not randomized. Used for testing.
bool g_deterministic = false;

// Pointer to the current |LockFreeAddressHashSet|.
std::atomic<LockFreeAddressHashSet*> g_sampled_addresses_set{nullptr};

// Sampling interval parameter: the mean number of bytes between samples.
std::atomic_size_t g_sampling_interval{kDefaultSamplingIntervalBytes};

struct ThreadLocalData {
  // Accumulated bytes towards the next sample.
  intptr_t accumulated_bytes = 0;
  // Used as a workaround to avoid bias from muted samples. See
  // ScopedMuteThreadSamples for more details.
  intptr_t accumulated_bytes_snapshot = 0;
  // PoissonAllocationSampler performs allocations while handling a
  // notification. This guard protects against recursion originating from
  // those allocations.
  bool internal_reentry_guard = false;
  // A boolean used to distinguish the first allocation on a thread:
  //   false - first allocation on the thread;
  //   true  - otherwise.
  // Since accumulated_bytes is initialized to zero, the very first allocation
  // on a thread would always trigger a sample, skewing the profile towards
  // such allocations. To mitigate that, we use this flag to make sure the
  // first allocation is properly accounted for.
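  // For illustration: without this flag, a thread whose very first allocation
  // is only a few dozen bytes would immediately record it as a sample, even
  // though the configured mean distance between samples is much larger
  // (128 KiB by default, see kDefaultSamplingIntervalBytes).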
  bool sampling_interval_initialized = false;
};

ThreadLocalData* GetThreadLocalData() {
#if USE_LOCAL_TLS_EMULATION()
  // If available, use ThreadLocalStorage to bypass dependencies introduced by
  // Clang's implementation of thread_local.
  static base::NoDestructor<
      base::allocator::dispatcher::ThreadLocalStorage<ThreadLocalData>>
      thread_local_data;
  return thread_local_data->GetThreadLocalData();
#else
  // Notes on TLS usage:
  //
  // * There's no safe way to use TLS in malloc() as neither C++ thread_local
  //   nor pthread TLS makes any guarantee about whether it allocates.
  // * We think that we can safely use thread_local without a re-entrancy guard
  //   because the compiler will use the "tls static access model" for static
  //   builds of Chrome [https://www.uclibc.org/docs/tls.pdf].
  //   But there's no guarantee that this will stay true, and in practice
  //   it seems to have problems on macOS/Android. These platforms do allocate
  //   on the very first access to a thread_local on each thread.
  // * Directly using/warming-up platform TLS seems to work on all platforms,
  //   but is also not guaranteed to remain the case. We make use of it for
  //   reentrancy guards on macOS/Android.
  // * We cannot use the Windows Tls[GS]etValue API as it modifies the result
  //   of GetLastError.
  //
  // Android thread_local seems to be using __emutls_get_address from libgcc:
  // https://github.com/gcc-mirror/gcc/blob/master/libgcc/emutls.c
  // The macOS version is based on _tlv_get_addr from dyld:
  // https://opensource.apple.com/source/dyld/dyld-635.2/src/threadLocalHelpers.s.auto.html
  thread_local ThreadLocalData thread_local_data;
  return &thread_local_data;
#endif
}

}  // namespace

PoissonAllocationSampler::ScopedMuteThreadSamples::ScopedMuteThreadSamples() {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();

  DCHECK(!thread_local_data->internal_reentry_guard);
  thread_local_data->internal_reentry_guard = true;

  // We mute thread samples immediately after taking a sample, which is when
  // we reset the thread-local accumulated_bytes. This breaks the random
  // sampling requirement of the Poisson process, and causes us to
  // systematically overcount all other allocations. That's because muted
  // allocations rarely trigger a sample (which would cause them to be ignored)
  // since they occur right after accumulated_bytes is reset.
  //
  // To counteract this, we drop accumulated_bytes by a large, fixed amount to
  // lower the probability that a sample is taken while muted to close to 0.
  // Then we restore the saved value once we're done muting thread samples.
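  //
  // An illustrative back-of-the-envelope check: kAccumulatedBytesOffset is
  // 1 << 29 bytes (512 MiB), while the default mean sampling interval is only
  // 128 KiB, so a muted thread would have to allocate roughly half a gigabyte
  // before accumulated_bytes could become non-negative and trigger a sample.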
  thread_local_data->accumulated_bytes_snapshot =
      thread_local_data->accumulated_bytes;
  thread_local_data->accumulated_bytes -= kAccumulatedBytesOffset;
}

PoissonAllocationSampler::ScopedMuteThreadSamples::~ScopedMuteThreadSamples() {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  DCHECK(thread_local_data->internal_reentry_guard);
  thread_local_data->internal_reentry_guard = false;
  thread_local_data->accumulated_bytes =
      thread_local_data->accumulated_bytes_snapshot;
}

// static
bool PoissonAllocationSampler::ScopedMuteThreadSamples::IsMuted() {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  return thread_local_data->internal_reentry_guard;
}

PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
    ScopedSuppressRandomnessForTesting() {
  DCHECK(!g_deterministic);
  g_deterministic = true;
  // The accumulated_bytes may contain a random value from previous
  // test runs, which would make the behaviour of the next call to
  // RecordAlloc unpredictable.
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  thread_local_data->accumulated_bytes = 0;
}

PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
    ~ScopedSuppressRandomnessForTesting() {
  DCHECK(g_deterministic);
  g_deterministic = false;
}

// static
bool PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
    IsSuppressed() {
  return g_deterministic;
}

PoissonAllocationSampler::ScopedMuteHookedSamplesForTesting::
    ScopedMuteHookedSamplesForTesting() {
  SetProfilingStateFlag(ProfilingStateFlag::kHookedSamplesMutedForTesting);

  // Reset the accumulated bytes to 0 on this thread.
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  accumulated_bytes_snapshot_ = thread_local_data->accumulated_bytes;
  thread_local_data->accumulated_bytes = 0;
}

PoissonAllocationSampler::ScopedMuteHookedSamplesForTesting::
    ~ScopedMuteHookedSamplesForTesting() {
  // Restore the accumulated bytes.
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  thread_local_data->accumulated_bytes = accumulated_bytes_snapshot_;
  ResetProfilingStateFlag(ProfilingStateFlag::kHookedSamplesMutedForTesting);
}

#if !BUILDFLAG(USE_ALLOCATION_EVENT_DISPATCHER)
// static
PoissonAllocationSampler* PoissonAllocationSampler::instance_ = nullptr;
#endif

// static
ABSL_CONST_INIT std::atomic<PoissonAllocationSampler::ProfilingStateFlagMask>
    PoissonAllocationSampler::profiling_state_{0};

PoissonAllocationSampler::PoissonAllocationSampler() {
#if !BUILDFLAG(USE_ALLOCATION_EVENT_DISPATCHER)
  CHECK_EQ(nullptr, instance_);
  instance_ = this;
#endif

  Init();
  auto* sampled_addresses = new LockFreeAddressHashSet(64);
  g_sampled_addresses_set.store(sampled_addresses, std::memory_order_release);

#if !BUILDFLAG(USE_ALLOCATION_EVENT_DISPATCHER)
  // Install the allocator hooks immediately, to better match the behaviour
  // of base::allocator::Initializer.
  //
  // TODO(crbug/1137393): Use base::allocator::Initializer to install the
  // PoissonAllocationSampler hooks. All observers need to be passed to the
  // initializer at the same time so this will install the hooks even
  // earlier in process startup.
  allocator::dispatcher::InstallStandardAllocatorHooks();
#endif
}

// static
void PoissonAllocationSampler::Init() {
  [[maybe_unused]] static bool init_once = []() {
    // Touch thread-local data on initialization to enforce proper setup of
    // the underlying storage system.
    GetThreadLocalData();
    ReentryGuard::InitTLSSlot();
    return true;
  }();
}

void PoissonAllocationSampler::SetSamplingInterval(
    size_t sampling_interval_bytes) {
  // TODO(alph): Reset the sample being collected if running.
  g_sampling_interval = sampling_interval_bytes;
}

size_t PoissonAllocationSampler::SamplingInterval() const {
  return g_sampling_interval.load(std::memory_order_relaxed);
}

// static
size_t PoissonAllocationSampler::GetNextSampleInterval(size_t interval) {
  if (UNLIKELY(g_deterministic)) {
    return interval;
  }

  // We sample with a Poisson process, with a constant average sampling
  // interval. This follows the exponential probability distribution with
  // parameter λ = 1/interval, where |interval| is the average number of bytes
  // between samples.
  // Let u be a uniformly distributed random number in (0,1]; then
  // next_sample = -ln(u) / λ.
  // RandDouble returns numbers in [0,1). We use 1 - RandDouble to map that to
  // (0,1] and avoid a possible floating point exception from taking the log
  // of 0.
  // The allocator shim uses PoissonAllocationSampler, hence we must avoid
  // allocating here to prevent infinite recursion.
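  //
  // A rough illustration with the default 128 KiB mean interval: a draw of
  // u = 0.5 gives next_sample = -ln(0.5) * 131072 ~ 90852 bytes (about
  // 89 KiB), u close to 1 gives an interval close to 0, and u close to 0
  // gives a long interval (capped by max_value below).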
  double uniform = internal::RandDoubleAvoidAllocation();
  double value = -log(1 - uniform) * interval;
  size_t min_value = sizeof(intptr_t);
  // We limit the upper bound of a sample interval to make sure we don't have
  // huge gaps in the sampling stream. The probability of hitting the upper
  // bound is exp(-20) ~ 2e-9, so it should not skew the distribution.
  size_t max_value = interval * 20;
  if (UNLIKELY(value < min_value)) {
    return min_value;
  }
  if (UNLIKELY(value > max_value)) {
    return max_value;
  }
  return static_cast<size_t>(value);
}

void PoissonAllocationSampler::DoRecordAllocation(
    const ProfilingStateFlagMask state,
    void* address,
    size_t size,
    base::allocator::dispatcher::AllocationSubsystem type,
    const char* context) {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();

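  // Note: once sampling on this thread is initialized, accumulated_bytes is
  // kept negative between samples (a freshly drawn interval is subtracted
  // each time a sample is taken), so the common case below is a single
  // addition followed by a sign check.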
  thread_local_data->accumulated_bytes += size;
  intptr_t accumulated_bytes = thread_local_data->accumulated_bytes;
  if (LIKELY(accumulated_bytes < 0)) {
    return;
  }

  if (UNLIKELY(!(state & ProfilingStateFlag::kIsRunning))) {
    // Sampling was in fact disabled when the hook was called. Reset the state
    // of the sampler. We do this check off the fast-path, because it's quite
    // rare for the sampler to be stopped after it has been started. (The most
    // common caller of PoissonAllocationSampler starts it and leaves it
    // running for the rest of the Chrome session.)
    thread_local_data->sampling_interval_initialized = false;
    thread_local_data->accumulated_bytes = 0;
    return;
  }

  // Failed allocation? Skip the sample.
  if (UNLIKELY(!address)) {
    return;
  }

  size_t mean_interval = g_sampling_interval.load(std::memory_order_relaxed);
  if (UNLIKELY(!thread_local_data->sampling_interval_initialized)) {
    thread_local_data->sampling_interval_initialized = true;
    // This is the very first allocation on the thread. It always passes the
    // check in |RecordAlloc| because accumulated_bytes is zero-initialized
    // due to TLS semantics.
    // Generate a proper sampling interval and make sure the allocation has
    // indeed crossed the threshold before counting it as a sample.
    accumulated_bytes -= GetNextSampleInterval(mean_interval);
    if (accumulated_bytes < 0) {
      thread_local_data->accumulated_bytes = accumulated_bytes;
      return;
    }
  }

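  // For illustration (hypothetical numbers): if accumulated_bytes reached
  // 300 KiB with the default 128 KiB mean interval, the division below yields
  // 2 samples with a 44 KiB remainder; the loop then draws at least one more
  // randomized interval (counting one extra sample per draw) until the
  // remainder carried back into TLS is negative again.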
  // This cast is safe because this point is only reached with a non-negative
  // value of `accumulated_bytes`.
  size_t samples = static_cast<size_t>(accumulated_bytes) / mean_interval;
  accumulated_bytes %= mean_interval;

  do {
    accumulated_bytes -= GetNextSampleInterval(mean_interval);
    ++samples;
  } while (accumulated_bytes >= 0);

  thread_local_data->accumulated_bytes = accumulated_bytes;

  if (UNLIKELY(ScopedMuteThreadSamples::IsMuted())) {
    return;
  }

  ScopedMuteThreadSamples no_reentrancy_scope;
  std::vector<SamplesObserver*> observers_copy;
  {
    AutoLock lock(mutex_);

    // TODO(alph): Sometimes RecordAlloc is called twice in a row without
    // a RecordFree in between. Investigate it.
    if (sampled_addresses_set().Contains(address)) {
      return;
    }
    sampled_addresses_set().Insert(address);
    BalanceAddressesHashSet();
    observers_copy = observers_;
  }

  size_t total_allocated = mean_interval * samples;
  for (auto* observer : observers_copy) {
    observer->SampleAdded(address, size, total_allocated, type, context);
  }
}

void PoissonAllocationSampler::DoRecordFree(void* address) {
  // There is a rare case on macOS and Android when the very first thread_local
  // access in the ScopedMuteThreadSamples constructor may allocate and thus
  // reenter DoRecordAllocation. However, the call chain won't build up further
  // because RecordAlloc accesses are guarded with a pthread TLS-based
  // ReentryGuard.
  ScopedMuteThreadSamples no_reentrancy_scope;
  std::vector<SamplesObserver*> observers_copy;
  {
    AutoLock lock(mutex_);
    observers_copy = observers_;
    sampled_addresses_set().Remove(address);
  }
  for (auto* observer : observers_copy) {
    observer->SampleRemoved(address);
  }
}

void PoissonAllocationSampler::BalanceAddressesHashSet() {
  // If the load factor of the current address hash set has reached 1, allocate
  // a new set that is twice as large, copy all the data into it, and switch to
  // using it.
  // No writes are made to either set during the copy, as it all happens under
  // the lock.
  // All readers continue to use the old set until the atomic switch takes
  // place.
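  //
  // Illustrative growth pattern (assuming load_factor() is the usual
  // size / bucket-count ratio): the set created in the constructor has 64
  // buckets, so it grows 64 -> 128 -> 256 -> ... each time the number of
  // sampled addresses catches up with the bucket count.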
  LockFreeAddressHashSet& current_set = sampled_addresses_set();
  if (current_set.load_factor() < 1) {
    return;
  }
  auto new_set =
      std::make_unique<LockFreeAddressHashSet>(current_set.buckets_count() * 2);
  new_set->Copy(current_set);
  // Atomically switch all new readers to the new set.
  g_sampled_addresses_set.store(new_set.release(), std::memory_order_release);
  // We intentionally leak the old set: all old sets must be kept alive because
  // reader threads may have already obtained a pointer to one of them but not
  // yet accessed it.
}

// static
LockFreeAddressHashSet& PoissonAllocationSampler::sampled_addresses_set() {
  return *g_sampled_addresses_set.load(std::memory_order_acquire);
}

// static
PoissonAllocationSampler* PoissonAllocationSampler::Get() {
  static NoDestructor<PoissonAllocationSampler> instance;
  return instance.get();
}

// static
void PoissonAllocationSampler::SetProfilingStateFlag(ProfilingStateFlag flag) {
  ProfilingStateFlagMask flags = flag;
  if (flag == ProfilingStateFlag::kIsRunning) {
    flags |= ProfilingStateFlag::kWasStarted;
  }
  ProfilingStateFlagMask old_state =
      profiling_state_.fetch_or(flags, std::memory_order_relaxed);
  DCHECK(!(old_state & flag));
}

// static
void PoissonAllocationSampler::ResetProfilingStateFlag(
    ProfilingStateFlag flag) {
  DCHECK_NE(flag, kWasStarted);
  ProfilingStateFlagMask old_state =
      profiling_state_.fetch_and(~flag, std::memory_order_relaxed);
  DCHECK(old_state & flag);
}

void PoissonAllocationSampler::AddSamplesObserver(SamplesObserver* observer) {
  // The following implementation (including ScopedMuteThreadSamples) will use
  // `thread_local`, which may cause a reentrancy issue. So, temporarily
  // disable sampling with a ReentryGuard.
  ReentryGuard guard;

  ScopedMuteThreadSamples no_reentrancy_scope;
  AutoLock lock(mutex_);
  DCHECK(ranges::find(observers_, observer) == observers_.end());
  bool profiler_was_stopped = observers_.empty();
  observers_.push_back(observer);

  // Adding the observer will enable profiling. This will use
  // `g_sampled_addresses_set`, so it had better be initialized.
  DCHECK(g_sampled_addresses_set.load(std::memory_order_relaxed));

  // Start the profiler if this was the first observer. Setting/resetting
  // kIsRunning isn't racy because it's performed based on `observers_.empty()`
  // while holding `mutex_`.
  if (profiler_was_stopped) {
    SetProfilingStateFlag(ProfilingStateFlag::kIsRunning);
  }
  DCHECK(profiling_state_.load(std::memory_order_relaxed) &
         ProfilingStateFlag::kIsRunning);
}

void PoissonAllocationSampler::RemoveSamplesObserver(
    SamplesObserver* observer) {
  // The following implementation (including ScopedMuteThreadSamples) will use
  // `thread_local`, which may cause a reentrancy issue. So, temporarily
  // disable sampling with a ReentryGuard.
  ReentryGuard guard;

  ScopedMuteThreadSamples no_reentrancy_scope;
  AutoLock lock(mutex_);
  auto it = ranges::find(observers_, observer);
  DCHECK(it != observers_.end());
  observers_.erase(it);

  // Stop the profiler if there are no more observers. Setting/resetting
  // kIsRunning isn't racy because it's performed based on `observers_.empty()`
  // while holding `mutex_`.
  DCHECK(profiling_state_.load(std::memory_order_relaxed) &
         ProfilingStateFlag::kIsRunning);
  if (observers_.empty()) {
    ResetProfilingStateFlag(ProfilingStateFlag::kIsRunning);
  }
}

}  // namespace base