// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/allocator/partition_allocator/thread_cache.h"

#include <sys/types.h>

#include <algorithm>
#include <atomic>
#include <cstdint>

#include "base/allocator/partition_allocator/partition_alloc-inl.h"
#include "base/allocator/partition_allocator/partition_alloc_base/component_export.h"
#include "base/allocator/partition_allocator/partition_alloc_base/cxx17_backports.h"
#include "base/allocator/partition_allocator/partition_alloc_base/debug/debugging_buildflags.h"
#include "base/allocator/partition_allocator/partition_alloc_base/immediate_crash.h"
#include "base/allocator/partition_allocator/partition_alloc_buildflags.h"
#include "base/allocator/partition_allocator/partition_alloc_check.h"
#include "base/allocator/partition_allocator/partition_alloc_config.h"
#include "base/allocator/partition_allocator/partition_alloc_constants.h"
#include "base/allocator/partition_allocator/partition_root.h"
#include "build/build_config.h"

namespace partition_alloc {

namespace {
ThreadCacheRegistry g_instance;
}  // namespace

namespace tools {
uintptr_t kThreadCacheNeedleArray[kThreadCacheNeedleArraySize] = {
    kNeedle1, reinterpret_cast<uintptr_t>(&g_instance),
#if BUILDFLAG(RECORD_ALLOC_INFO)
    reinterpret_cast<uintptr_t>(&internal::g_allocs),
#else
    0,
#endif
    kNeedle2};
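// Note: the PA_CHECK() in Create() below references this array so that it is
// kept in the final binary (see the comment in thread_cache.h). The needle
// values are presumably there so that tools outside the process can scan
// memory for them and locate |g_instance| (and, with RECORD_ALLOC_INFO,
// |g_allocs|).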
}  // namespace tools

namespace internal {

PA_COMPONENT_EXPORT(PARTITION_ALLOC) PartitionTlsKey g_thread_cache_key;
#if PA_CONFIG(THREAD_CACHE_FAST_TLS)
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
thread_local ThreadCache* g_thread_cache;
#endif

}  // namespace internal

namespace {
// Since |g_thread_cache_key| is shared, make sure that no more than one
// PartitionRoot can use it.
static std::atomic<PartitionRoot<>*> g_thread_cache_root;

#if BUILDFLAG(IS_WIN)
void OnDllProcessDetach() {
  // Very late allocations do occur (see crbug.com/1159411#c7 for instance),
  // including during CRT teardown. This is problematic for the thread cache,
  // which relies on the CRT for TLS access, for instance. This cannot be
  // mitigated inside the thread cache (since getting to it requires querying
  // TLS), but the PartitionRoot associated with the thread cache can be made
  // to not use the thread cache anymore.
  g_thread_cache_root.load(std::memory_order_relaxed)->flags.with_thread_cache =
      false;
}
#endif

static bool g_thread_cache_key_created = false;
}  // namespace

constexpr internal::base::TimeDelta ThreadCacheRegistry::kMinPurgeInterval;
constexpr internal::base::TimeDelta ThreadCacheRegistry::kMaxPurgeInterval;
constexpr internal::base::TimeDelta ThreadCacheRegistry::kDefaultPurgeInterval;
constexpr size_t ThreadCacheRegistry::kMinCachedMemoryForPurging;
uint8_t ThreadCache::global_limits_[ThreadCache::kBucketCount];

// Start with the normal size, not the maximum one.
uint16_t ThreadCache::largest_active_bucket_index_ =
    internal::BucketIndexLookup::GetIndex(ThreadCache::kDefaultSizeThreshold);

// static
ThreadCacheRegistry& ThreadCacheRegistry::Instance() {
  return g_instance;
}

void ThreadCacheRegistry::RegisterThreadCache(ThreadCache* cache) {
  internal::ScopedGuard scoped_locker(GetLock());
  cache->next_ = nullptr;
  cache->prev_ = nullptr;

  ThreadCache* previous_head = list_head_;
  list_head_ = cache;
  cache->next_ = previous_head;
  if (previous_head) {
    previous_head->prev_ = cache;
  }
}

void ThreadCacheRegistry::UnregisterThreadCache(ThreadCache* cache) {
  internal::ScopedGuard scoped_locker(GetLock());
  if (cache->prev_) {
    cache->prev_->next_ = cache->next_;
  }
  if (cache->next_) {
    cache->next_->prev_ = cache->prev_;
  }
  if (cache == list_head_) {
    list_head_ = cache->next_;
  }
}

void ThreadCacheRegistry::DumpStats(bool my_thread_only,
                                    ThreadCacheStats* stats) {
  ThreadCache::EnsureThreadSpecificDataInitialized();
  memset(reinterpret_cast<void*>(stats), 0, sizeof(ThreadCacheStats));

  internal::ScopedGuard scoped_locker(GetLock());
  if (my_thread_only) {
    auto* tcache = ThreadCache::Get();
    if (!ThreadCache::IsValid(tcache)) {
      return;
    }
    tcache->AccumulateStats(stats);
  } else {
    ThreadCache* tcache = list_head_;
    while (tcache) {
      // Racy, as other threads are still allocating. This is not an issue,
      // since we are only interested in statistics. However, this means that
      // count is not necessarily equal to hits + misses for the various types
      // of events.
      tcache->AccumulateStats(stats);
      tcache = tcache->next_;
    }
  }
}

void ThreadCacheRegistry::PurgeAll() {
  auto* current_thread_tcache = ThreadCache::Get();

  // May take a while, don't hold the lock while purging.
  //
  // In most cases, the current thread is more important than other ones. For
  // instance in renderers, it is the main thread. It is also the only thread
  // that we can synchronously purge.
  //
  // The reason we trigger the purge for this thread first is that, assuming
  // all threads are allocating memory, they will start purging
  // concurrently in the loop below. This will then make them all contend with
  // the main thread for the partition lock, since it is acquired/released once
  // per bucket. By purging the main thread first, we avoid these interferences
  // for this thread at least.
  if (ThreadCache::IsValid(current_thread_tcache)) {
    current_thread_tcache->Purge();
  }

  {
    internal::ScopedGuard scoped_locker(GetLock());
    ThreadCache* tcache = list_head_;
    while (tcache) {
      PA_DCHECK(ThreadCache::IsValid(tcache));
      // Cannot purge directly, need to ask the other thread to purge "at some
      // point".
      // Note that this will not work if the other thread is sleeping forever.
      // TODO(lizeb): Handle sleeping threads.
      if (tcache != current_thread_tcache) {
        tcache->SetShouldPurge();
      }
      tcache = tcache->next_;
    }
  }
}

void ThreadCacheRegistry::ForcePurgeAllThreadAfterForkUnsafe() {
  internal::ScopedGuard scoped_locker(GetLock());
  ThreadCache* tcache = list_head_;
  while (tcache) {
#if BUILDFLAG(PA_DCHECK_IS_ON)
    // Before fork(), locks are acquired in the parent process. This means that
    // a concurrent allocation in the parent which must be filled by the central
    // allocator (i.e. the thread cache bucket is empty) will block inside the
    // thread cache waiting for the lock to be released.
    //
    // In the child process, this allocation will never complete since this
    // thread will not be resumed. However, calling |Purge()| triggers the
    // reentrancy guard since the parent process thread was suspended from
    // within the thread cache.
    // Clear the guard to prevent this from crashing.
    tcache->is_in_thread_cache_ = false;
#endif
    // There is a PA_DCHECK() in code called from |Purge()| checking that thread
    // cache memory accounting is correct. Since we are after fork() and the
    // other threads got interrupted mid-flight, this guarantee does not hold,
    // and we get inconsistent results.  Rather than giving up on checking this
    // invariant in regular code, reset it here so that the PA_DCHECK()
    // passes. See crbug.com/1216964.
    tcache->cached_memory_ = tcache->CachedMemory();

    // At this point, we should call |TryPurge|. However, due to the thread
    // cache being possibly inconsistent at this point, this may crash. Rather
    // than crash, we'd prefer to simply not purge, even though this may leak
    // memory in some cases.
    //
    // see crbug.com/1289092 for details of the crashes.

    tcache = tcache->next_;
  }
}

void ThreadCacheRegistry::SetLargestActiveBucketIndex(
    uint8_t largest_active_bucket_index) {
  largest_active_bucket_index_ = largest_active_bucket_index;
}

void ThreadCacheRegistry::SetThreadCacheMultiplier(float multiplier) {
  // Two steps:
  // - Set the global limits, which will affect newly created threads.
  // - Enumerate all thread caches and set the limit to the global one.
  {
    internal::ScopedGuard scoped_locker(GetLock());
    ThreadCache* tcache = list_head_;

    // This can be called before *any* thread cache has serviced *any*
    // allocation, which can happen in tests, and in theory in non-test code as
    // well.
    if (!tcache) {
      return;
    }

    // Setting the global limit while locked, because we need |tcache->root_|.
    ThreadCache::SetGlobalLimits(tcache->root_, multiplier);

    while (tcache) {
      PA_DCHECK(ThreadCache::IsValid(tcache));
      for (int index = 0; index < ThreadCache::kBucketCount; index++) {
        // This is racy, but we don't care if the limit is enforced later, and
        // we really want to avoid atomic instructions on the fast path.
        tcache->buckets_[index].limit.store(ThreadCache::global_limits_[index],
                                            std::memory_order_relaxed);
      }

      tcache = tcache->next_;
    }
  }
}

void ThreadCacheRegistry::RunPeriodicPurge() {
  if (!periodic_purge_is_initialized_) {
    ThreadCache::EnsureThreadSpecificDataInitialized();
    periodic_purge_is_initialized_ = true;
  }

  // Summing across all threads can be slow, but is necessary. Otherwise we rely
  // on the assumption that the current thread is a good proxy for overall
  // allocation activity. This is not the case for all process types.
  //
  // Since there is no synchronization with other threads, the value is stale,
  // which is fine.
  size_t cached_memory_approx = 0;
  {
    internal::ScopedGuard scoped_locker(GetLock());
    ThreadCache* tcache = list_head_;
    // Can run when there is no thread cache, in which case there is nothing to
    // do, and the task should not be rescheduled. This would typically indicate
    // a case where the thread cache was never enabled, or got disabled.
    if (!tcache) {
      return;
    }

    while (tcache) {
      cached_memory_approx += tcache->cached_memory_;
      tcache = tcache->next_;
    }
  }

  // If cached memory is low, this means that either the memory footprint is
  // fine, or the process is mostly idle and has not allocated much since the
  // last purge. In this case, back off. On the other hand, if there is a lot
  // of cached memory, make purging more frequent, but always within a set
  // frequency range.
  //
  // There is a potential drawback: a process that was idle for a long time and
  // suddenly becomes very active will take some time to go back to a regularly
  // scheduled purge with a small enough interval. This is the case, for
  // instance, for a renderer moving to the foreground. To mitigate that, if
  // the jump in cached memory is very large, make a greater leap to faster
  // purging.
  if (cached_memory_approx > 10 * kMinCachedMemoryForPurging) {
    periodic_purge_next_interval_ =
        std::min(kDefaultPurgeInterval, periodic_purge_next_interval_ / 2);
  } else if (cached_memory_approx > 2 * kMinCachedMemoryForPurging) {
    periodic_purge_next_interval_ =
        std::max(kMinPurgeInterval, periodic_purge_next_interval_ / 2);
  } else if (cached_memory_approx < kMinCachedMemoryForPurging) {
    periodic_purge_next_interval_ =
        std::min(kMaxPurgeInterval, periodic_purge_next_interval_ * 2);
  }
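  // To spell out the branches above: when cached memory exceeds 10x the
  // threshold, the next interval is the smaller of kDefaultPurgeInterval and
  // half the current one, so an interval that had drifted far above the
  // default jumps straight back down to it (the "greater leap" mentioned
  // above); above 2x the threshold, the interval is halved, but not below
  // kMinPurgeInterval; below the threshold, it is doubled, but not above
  // kMaxPurgeInterval.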

  // Make sure that the next interval is in the right bounds. Even though the
  // logic above should eventually converge to a reasonable interval, if a
  // sleeping background thread holds onto a large amount of cached memory, then
  // |PurgeAll()| will not free any memory from it, and the first branch above
  // can be taken repeatedly until the interval gets very small, as the amount
  // of cached memory cannot change between calls (since we do not purge
  // background threads, but only ask them to purge their own cache at the next
  // allocation).
  periodic_purge_next_interval_ = internal::base::clamp(
      periodic_purge_next_interval_, kMinPurgeInterval, kMaxPurgeInterval);

  PurgeAll();
}

int64_t ThreadCacheRegistry::GetPeriodicPurgeNextIntervalInMicroseconds()
    const {
  return periodic_purge_next_interval_.InMicroseconds();
}

void ThreadCacheRegistry::ResetForTesting() {
  periodic_purge_next_interval_ = kDefaultPurgeInterval;
}

// static
void ThreadCache::EnsureThreadSpecificDataInitialized() {
  // Using the registry lock to protect against concurrent initialization
  // without adding a special-purpose lock.
  internal::ScopedGuard scoped_locker(
      ThreadCacheRegistry::Instance().GetLock());
  if (g_thread_cache_key_created) {
    return;
  }

  bool ok = internal::PartitionTlsCreate(&internal::g_thread_cache_key, Delete);
  PA_CHECK(ok);
  g_thread_cache_key_created = true;
}

// static
void ThreadCache::DeleteForTesting(ThreadCache* tcache) {
  ThreadCache::Delete(tcache);
}

// static
void ThreadCache::SwapForTesting(PartitionRoot<>* root) {
  auto* old_tcache = ThreadCache::Get();
  g_thread_cache_root.store(nullptr, std::memory_order_relaxed);
  if (old_tcache) {
    ThreadCache::DeleteForTesting(old_tcache);
  }
  if (root) {
    Init(root);
    Create(root);
  } else {
#if BUILDFLAG(IS_WIN)
    // OnDllProcessDetach accesses g_thread_cache_root which is nullptr now.
    internal::PartitionTlsSetOnDllProcessDetach(nullptr);
#endif
  }
}

// static
void ThreadCache::RemoveTombstoneForTesting() {
  PA_CHECK(IsTombstone(Get()));
  internal::PartitionTlsSet(internal::g_thread_cache_key, nullptr);
}

// static
void ThreadCache::Init(PartitionRoot<>* root) {
#if BUILDFLAG(IS_NACL)
  static_assert(false, "PartitionAlloc isn't supported for NaCl");
#endif
  PA_CHECK(root->buckets[kBucketCount - 1].slot_size ==
           ThreadCache::kLargeSizeThreshold);
  PA_CHECK(root->buckets[largest_active_bucket_index_].slot_size ==
           ThreadCache::kDefaultSizeThreshold);

  EnsureThreadSpecificDataInitialized();

  // Make sure that only one PartitionRoot wants a thread cache.
  PartitionRoot<>* expected = nullptr;
  if (!g_thread_cache_root.compare_exchange_strong(expected, root,
                                                   std::memory_order_seq_cst,
                                                   std::memory_order_seq_cst)) {
    PA_CHECK(false)
        << "Only one PartitionRoot is allowed to have a thread cache";
  }

#if BUILDFLAG(IS_WIN)
  internal::PartitionTlsSetOnDllProcessDetach(OnDllProcessDetach);
#endif

  SetGlobalLimits(root, kDefaultMultiplier);
}

// static
void ThreadCache::SetGlobalLimits(PartitionRoot<>* root, float multiplier) {
  size_t initial_value =
      static_cast<size_t>(kSmallBucketBaseCount) * multiplier;

  for (int index = 0; index < kBucketCount; index++) {
    const auto& root_bucket = root->buckets[index];
    // Invalid bucket.
    if (!root_bucket.active_slot_spans_head) {
      global_limits_[index] = 0;
      continue;
    }

    // Smaller allocations are more frequent, and more performance-sensitive.
    // Cache more small objects, and fewer larger ones, to save memory.
    size_t slot_size = root_bucket.slot_size;
    size_t value;
    if (slot_size <= 128) {
      value = initial_value;
    } else if (slot_size <= 256) {
      value = initial_value / 2;
    } else if (slot_size <= 512) {
      value = initial_value / 4;
    } else {
      value = initial_value / 8;
    }
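    // For illustration (made-up numbers): if kSmallBucketBaseCount were 64 and
    // |multiplier| were 1.0, buckets with slots up to 128 bytes would cache up
    // to 64 entries each, up to 256 bytes 32 entries, up to 512 bytes 16
    // entries, and anything larger 8 entries, subject to the clamping below.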

    // Bare minimum so that malloc() / free() in a loop will not hit the central
    // allocator each time.
    constexpr size_t kMinLimit = 1;
    // |PutInBucket()| is called on a full bucket, which should not overflow.
    constexpr size_t kMaxLimit = std::numeric_limits<uint8_t>::max() - 1;
    global_limits_[index] = static_cast<uint8_t>(
        internal::base::clamp(value, kMinLimit, kMaxLimit));
    PA_DCHECK(global_limits_[index] >= kMinLimit);
    PA_DCHECK(global_limits_[index] <= kMaxLimit);
  }
}

// static
void ThreadCache::SetLargestCachedSize(size_t size) {
  if (size > ThreadCache::kLargeSizeThreshold) {
    size = ThreadCache::kLargeSizeThreshold;
  }
  largest_active_bucket_index_ =
      PartitionRoot<internal::ThreadSafe>::SizeToBucketIndex(
          size,
          PartitionRoot<internal::ThreadSafe>::BucketDistribution::kDefault);
  PA_CHECK(largest_active_bucket_index_ < kBucketCount);
  ThreadCacheRegistry::Instance().SetLargestActiveBucketIndex(
      largest_active_bucket_index_);
}

// static
ThreadCache* ThreadCache::Create(PartitionRoot<internal::ThreadSafe>* root) {
  PA_CHECK(root);
  // See comment in thread_cache.h, this is used to make sure
  // kThreadCacheNeedleArray is kept in the final binary.
  PA_CHECK(tools::kThreadCacheNeedleArray[0] == tools::kNeedle1);

  // Placement new and RawAlloc() are used, as otherwise when this partition is
  // the malloc() implementation, the memory allocated for the new thread cache
  // would make this code reentrant.
  //
  // This also means that deallocation must use RawFreeStatic(), hence the
  // operator delete() implementation below.
  size_t raw_size = root->AdjustSizeForExtrasAdd(sizeof(ThreadCache));
  size_t usable_size;
  bool already_zeroed;

  auto* bucket =
      root->buckets + PartitionRoot<internal::ThreadSafe>::SizeToBucketIndex(
                          raw_size, root->GetBucketDistribution());
  uintptr_t buffer = root->RawAlloc(bucket, AllocFlags::kZeroFill, raw_size,
                                    internal::PartitionPageSize(), &usable_size,
                                    &already_zeroed);
  ThreadCache* tcache =
      new (internal::SlotStartAddr2Ptr(buffer)) ThreadCache(root);

  // This may allocate.
  internal::PartitionTlsSet(internal::g_thread_cache_key, tcache);
#if PA_CONFIG(THREAD_CACHE_FAST_TLS)
  // |thread_local| variables with destructors cause issues on some platforms.
  // Since we need a destructor (to empty the thread cache), we cannot use it
  // directly. However, TLS accesses with |thread_local| are typically faster,
  // as it can turn into a fixed offset load from a register (GS/FS on Linux
  // x86, for instance). On Windows, saving/restoring the last error increases
  // cost as well.
  //
  // To still get good performance, use |thread_local| to store a raw pointer,
  // and rely on the platform TLS to call the destructor.
  internal::g_thread_cache = tcache;
#endif  // PA_CONFIG(THREAD_CACHE_FAST_TLS)

  return tcache;
}

ThreadCache::ThreadCache(PartitionRoot<>* root)
    : should_purge_(false),
      root_(root),
      thread_id_(internal::base::PlatformThread::CurrentId()),
      next_(nullptr),
      prev_(nullptr) {
  ThreadCacheRegistry::Instance().RegisterThreadCache(this);

  memset(&stats_, 0, sizeof(stats_));

  for (int index = 0; index < kBucketCount; index++) {
    const auto& root_bucket = root->buckets[index];
    Bucket* tcache_bucket = &buckets_[index];
    tcache_bucket->freelist_head = nullptr;
    tcache_bucket->count = 0;
    tcache_bucket->limit.store(global_limits_[index],
                               std::memory_order_relaxed);

    tcache_bucket->slot_size = root_bucket.slot_size;
    // Invalid bucket.
    if (!root_bucket.is_valid()) {
      // Explicitly set this, as size computations iterate over all buckets.
      tcache_bucket->limit.store(0, std::memory_order_relaxed);
    }
  }
}

ThreadCache::~ThreadCache() {
  ThreadCacheRegistry::Instance().UnregisterThreadCache(this);
  Purge();
}

// static
void ThreadCache::Delete(void* tcache_ptr) {
  auto* tcache = static_cast<ThreadCache*>(tcache_ptr);

  if (!IsValid(tcache)) {
    return;
  }

#if PA_CONFIG(THREAD_CACHE_FAST_TLS)
  internal::g_thread_cache = nullptr;
#else
  internal::PartitionTlsSet(internal::g_thread_cache_key, nullptr);
#endif

  auto* root = tcache->root_;
  tcache->~ThreadCache();
  // ThreadCache was allocated using RawAlloc() and SlotStartAddr2Ptr(), so it
  // is shifted by extras, but is MTE-tagged.
  root->RawFree(internal::SlotStartPtr2Addr(tcache_ptr));

#if BUILDFLAG(IS_WIN)
  // On Windows, allocations do occur during thread/process teardown, make sure
  // they don't resurrect the thread cache.
  //
  // Don't MTE-tag, as it'd mess with the sentinel value.
  //
  // TODO(lizeb): Investigate whether this is needed on POSIX as well.
  internal::PartitionTlsSet(internal::g_thread_cache_key,
                            reinterpret_cast<void*>(kTombstone));
#if PA_CONFIG(THREAD_CACHE_FAST_TLS)
  internal::g_thread_cache = reinterpret_cast<ThreadCache*>(kTombstone);
#endif

#endif  // BUILDFLAG(IS_WIN)
}

ThreadCache::Bucket::Bucket() {
  limit.store(0, std::memory_order_relaxed);
}

void ThreadCache::FillBucket(size_t bucket_index) {
  // Filling multiple elements from the central allocator at a time has several
  // advantages:
  // - Amortize lock acquisition
  // - Increase hit rate
  // - Can improve locality, as consecutive allocations from the central
  //   allocator will likely return close addresses, especially early on.
  //
  // However, do not take too many items, to prevent memory bloat.
  //
  // Cache filling / purging policy:
  // We aim at keeping the buckets neither empty nor full, while minimizing
  // requests to the central allocator.
  //
  // For each bucket, there is a |limit| of how many cached objects there are in
  // the bucket, so |count| < |limit| at all times.
  // - Clearing: limit -> limit / 2
  // - Filling: 0 -> limit / kBatchFillRatio
  //
  // These thresholds are somewhat arbitrary, with these considerations:
  // (1) Batched filling should not completely fill the bucket
  // (2) Batched clearing should not completely clear the bucket
  // (3) Batched filling should not be too eager
  //
  // If (1) and (2) do not hold, we risk oscillations of bucket filling /
  // clearing which would greatly increase calls to the central allocator. (3)
  // tries to keep memory usage low. So clearing half of the bucket, and filling
  // a quarter of it are sensible defaults.
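  // For illustration (made-up numbers): with a |limit| of 64 and a
  // kBatchFillRatio of 4, an empty bucket gets refilled with 16 slots at a
  // time, and a full one gets trimmed back down to 32, keeping the steady
  // state away from both "always empty" and "always full".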
  PA_INCREMENT_COUNTER(stats_.batch_fill_count);

  Bucket& bucket = buckets_[bucket_index];
  // Some buckets may have a limit lower than |kBatchFillRatio|, but we still
  // want to at least allocate a single slot, otherwise we wrongly return
  // nullptr, which ends up deactivating the bucket.
  //
  // In these cases, we do not really batch bucket filling, but this is expected
  // to be used for the largest buckets, where over-allocating is not advised.
  int count = std::max(
      1, bucket.limit.load(std::memory_order_relaxed) / kBatchFillRatio);

  size_t usable_size;
  bool is_already_zeroed;

  PA_DCHECK(!root_->buckets[bucket_index].CanStoreRawSize());
  PA_DCHECK(!root_->buckets[bucket_index].is_direct_mapped());

  size_t allocated_slots = 0;
  // Same as calling RawAlloc() |count| times, but acquires the lock only once.
  internal::ScopedGuard guard(root_->lock_);
  for (int i = 0; i < count; i++) {
    // Thread cache fill should not trigger expensive operations, to not grab
    // the lock for a long time needlessly, but also to not inflate memory
    // usage. Indeed, without AllocFlags::kFastPathOrReturnNull, cache
    // fill may activate a new PartitionPage, or even a new SuperPage, which is
    // clearly not desirable.
    //
    // |raw_size| is set to the slot size, as we don't know it. However, it is
    // only used for direct-mapped allocations and single-slot ones anyway,
    // which are not handled here.
    uintptr_t slot_start = root_->AllocFromBucket(
        &root_->buckets[bucket_index],
        AllocFlags::kFastPathOrReturnNull | AllocFlags::kReturnNull,
        root_->buckets[bucket_index].slot_size /* raw_size */,
        internal::PartitionPageSize(), &usable_size, &is_already_zeroed);

    // Either the previous allocation would require a slow path allocation, or
    // the central allocator is out of memory. If the bucket was filled with
    // some objects, then the allocation will be handled normally. Otherwise,
    // this goes to the central allocator, which will service the allocation,
    // return nullptr or crash.
    if (!slot_start) {
      break;
    }

    allocated_slots++;
    PutInBucket(bucket, slot_start);
  }

  cached_memory_ += allocated_slots * bucket.slot_size;
}

void ThreadCache::ClearBucket(Bucket& bucket, size_t limit) {
  ClearBucketHelper<true>(bucket, limit);
}

template <bool crash_on_corruption>
void ThreadCache::ClearBucketHelper(Bucket& bucket, size_t limit) {
  // Avoids acquiring the lock needlessly.
  if (!bucket.count || bucket.count <= limit) {
    return;
  }

  // This serves two purposes: error checking and avoiding stalls when grabbing
  // the lock:
  // 1. Error checking: this is pretty clear. Since this path is taken
  //    infrequently, and is going to walk the entire freelist anyway, its
  //    incremental cost should be very small. Indeed, we free from the tail of
  //    the list, so all calls here will end up walking the entire freelist, and
  //    incurring the same amount of cache misses.
  // 2. Avoiding stalls: If one of the freelist accesses in |FreeAfter()|
  //    triggers a major page fault, and we are running on a low-priority
  //    thread, we don't want the thread to be blocked while holding the lock,
  //    causing a priority inversion.
  if constexpr (crash_on_corruption) {
    bucket.freelist_head->CheckFreeListForThreadCache(bucket.slot_size);
  }

  uint8_t count_before = bucket.count;
  if (limit == 0) {
    FreeAfter<crash_on_corruption>(bucket.freelist_head, bucket.slot_size);
    bucket.freelist_head = nullptr;
  } else {
    // Free the *end* of the list, not the head, since the head contains the
    // most recently touched memory.
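    // For illustration: with |count| == 8 and |limit| == 3, the walk below
    // stops at the third entry from the head, everything after it is freed,
    // and the three most recently used entries stay cached.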
    auto* head = bucket.freelist_head;
    size_t items = 1;  // Cannot free the freelist head.
    while (items < limit) {
      head = head->GetNextForThreadCache<crash_on_corruption>(bucket.slot_size);
      items++;
    }
    FreeAfter<crash_on_corruption>(
        head->GetNextForThreadCache<crash_on_corruption>(bucket.slot_size),
        bucket.slot_size);
    head->SetNext(nullptr);
  }
  bucket.count = limit;
  uint8_t count_after = bucket.count;
  size_t freed_memory = (count_before - count_after) * bucket.slot_size;
  PA_DCHECK(cached_memory_ >= freed_memory);
  cached_memory_ -= freed_memory;

  PA_DCHECK(cached_memory_ == CachedMemory());
}

template <bool crash_on_corruption>
void ThreadCache::FreeAfter(internal::PartitionFreelistEntry* head,
                            size_t slot_size) {
  // Acquire the lock once. Deallocations from the same bucket are likely to
  // hit the same cache lines in the central allocator, and lock acquisitions
  // can be expensive.
  internal::ScopedGuard guard(root_->lock_);
  while (head) {
    uintptr_t slot_start = internal::SlotStartPtr2Addr(head);
    head = head->GetNextForThreadCache<crash_on_corruption>(slot_size);
    root_->RawFreeLocked(slot_start);
  }
}

void ThreadCache::ResetForTesting() {
  stats_.alloc_count = 0;
  stats_.alloc_hits = 0;
  stats_.alloc_misses = 0;

  stats_.alloc_miss_empty = 0;
  stats_.alloc_miss_too_large = 0;

  stats_.cache_fill_count = 0;
  stats_.cache_fill_hits = 0;
  stats_.cache_fill_misses = 0;

  stats_.batch_fill_count = 0;

  stats_.bucket_total_memory = 0;
  stats_.metadata_overhead = 0;

  Purge();
  PA_CHECK(cached_memory_ == 0u);
  should_purge_.store(false, std::memory_order_relaxed);
}

size_t ThreadCache::CachedMemory() const {
  size_t total = 0;
  for (const Bucket& bucket : buckets_) {
    total += bucket.count * static_cast<size_t>(bucket.slot_size);
  }

  return total;
}

void ThreadCache::AccumulateStats(ThreadCacheStats* stats) const {
  stats->alloc_count += stats_.alloc_count;
  stats->alloc_hits += stats_.alloc_hits;
  stats->alloc_misses += stats_.alloc_misses;

  stats->alloc_miss_empty += stats_.alloc_miss_empty;
  stats->alloc_miss_too_large += stats_.alloc_miss_too_large;

  stats->cache_fill_count += stats_.cache_fill_count;
  stats->cache_fill_hits += stats_.cache_fill_hits;
  stats->cache_fill_misses += stats_.cache_fill_misses;

  stats->batch_fill_count += stats_.batch_fill_count;

#if PA_CONFIG(THREAD_CACHE_ALLOC_STATS)
  for (size_t i = 0; i < internal::kNumBuckets + 1; i++) {
    stats->allocs_per_bucket_[i] += stats_.allocs_per_bucket_[i];
  }
#endif  // PA_CONFIG(THREAD_CACHE_ALLOC_STATS)

  // cached_memory_ is not necessarily equal to |CachedMemory()| here, since
  // this function can be called racily from another thread, to collect
  // statistics. Hence no DCHECK_EQ(CachedMemory(), cached_memory_).
  stats->bucket_total_memory += cached_memory_;

  stats->metadata_overhead += sizeof(*this);
}

void ThreadCache::SetShouldPurge() {
  should_purge_.store(true, std::memory_order_relaxed);
}

void ThreadCache::Purge() {
  PA_REENTRANCY_GUARD(is_in_thread_cache_);
  PurgeInternal();
}

void ThreadCache::TryPurge() {
  PA_REENTRANCY_GUARD(is_in_thread_cache_);
  PurgeInternalHelper<false>();
}

// static
void ThreadCache::PurgeCurrentThread() {
  auto* tcache = Get();
  if (IsValid(tcache)) {
    tcache->Purge();
  }
}

void ThreadCache::PurgeInternal() {
  PurgeInternalHelper<true>();
}

void ThreadCache::ResetPerThreadAllocationStatsForTesting() {
  thread_alloc_stats_ = {};
}

template <bool crash_on_corruption>
void ThreadCache::PurgeInternalHelper() {
  should_purge_.store(false, std::memory_order_relaxed);
  // TODO(lizeb): Investigate whether lock acquisition should be less
  // frequent.
  //
  // Note: iterate over all buckets, even the inactive ones. Since
  // |largest_active_bucket_index_| can be lowered at runtime, there may be
  // memory already cached in the inactive buckets. They should still be
  // purged.
  for (auto& bucket : buckets_) {
    ClearBucketHelper<crash_on_corruption>(bucket, 0);
  }
}

}  // namespace partition_alloc