// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "partition_alloc/thread_cache.h"

#include <sys/types.h>

#include <algorithm>
#include <atomic>
#include <cstdint>

#include "build/build_config.h"
#include "partition_alloc/partition_alloc-inl.h"
#include "partition_alloc/partition_alloc_base/component_export.h"
#include "partition_alloc/partition_alloc_base/debug/debugging_buildflags.h"
#include "partition_alloc/partition_alloc_base/immediate_crash.h"
#include "partition_alloc/partition_alloc_base/time/time.h"
#include "partition_alloc/partition_alloc_buildflags.h"
#include "partition_alloc/partition_alloc_check.h"
#include "partition_alloc/partition_alloc_config.h"
#include "partition_alloc/partition_alloc_constants.h"
#include "partition_alloc/partition_root.h"

namespace partition_alloc {

namespace {
ThreadCacheRegistry g_instance;
}  // namespace

namespace tools {
uintptr_t kThreadCacheNeedleArray[kThreadCacheNeedleArraySize] = {
    kNeedle1, reinterpret_cast<uintptr_t>(&g_instance),
#if BUILDFLAG(RECORD_ALLOC_INFO)
    reinterpret_cast<uintptr_t>(&internal::g_allocs),
#else
    0,
#endif
    kNeedle2};
}  // namespace tools
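
// A sketch of the intended consumer (illustrative; the actual tooling lives
// outside this file): an out-of-process inspection tool scans the binary's
// data segment for the kNeedle1 / kNeedle2 sentinels, then reads the pointers
// between them to locate |g_instance| (and |g_allocs| when RECORD_ALLOC_INFO
// is on) without relying on symbols.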

namespace internal {

PA_COMPONENT_EXPORT(PARTITION_ALLOC) PartitionTlsKey g_thread_cache_key;
#if PA_CONFIG(THREAD_CACHE_FAST_TLS)
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
thread_local ThreadCache* g_thread_cache;
#endif

}  // namespace internal

namespace {
// Since |g_thread_cache_key| is shared, make sure that no more than one
// PartitionRoot can use it.
static std::atomic<PartitionRoot*> g_thread_cache_root;

#if BUILDFLAG(IS_WIN)
void OnDllProcessDetach() {
  // Very late allocations do occur (see crbug.com/1159411#c7 for instance),
  // including during CRT teardown. This is problematic for the thread cache,
  // which relies on the CRT for TLS access, for instance. This cannot be
  // mitigated inside the thread cache (since getting to it requires querying
  // TLS), but the PartitionRoot associated with the thread cache can be made
  // to stop using the thread cache.
  g_thread_cache_root.load(std::memory_order_relaxed)
      ->settings.with_thread_cache = false;
}
#endif

static bool g_thread_cache_key_created = false;
}  // namespace

uint8_t ThreadCache::global_limits_[ThreadCache::kBucketCount];

// Start with the normal size, not the maximum one.
uint16_t ThreadCache::largest_active_bucket_index_ =
    internal::BucketIndexLookup::GetIndex(ThreadCache::kDefaultSizeThreshold);

// static
ThreadCacheRegistry& ThreadCacheRegistry::Instance() {
  return g_instance;
}

void ThreadCacheRegistry::RegisterThreadCache(ThreadCache* cache) {
  internal::ScopedGuard scoped_locker(GetLock());
  cache->next_ = nullptr;
  cache->prev_ = nullptr;

  ThreadCache* previous_head = list_head_;
  list_head_ = cache;
  cache->next_ = previous_head;
  if (previous_head) {
    previous_head->prev_ = cache;
  }
}
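
// Note: registration is plain head insertion, so registering caches A, then
// B, then C leaves the list as list_head_ -> C <-> B <-> A.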

void ThreadCacheRegistry::UnregisterThreadCache(ThreadCache* cache) {
  internal::ScopedGuard scoped_locker(GetLock());
  if (cache->prev_) {
    cache->prev_->next_ = cache->next_;
  }
  if (cache->next_) {
    cache->next_->prev_ = cache->prev_;
  }
  if (cache == list_head_) {
    list_head_ = cache->next_;
  }
}

void ThreadCacheRegistry::DumpStats(bool my_thread_only,
                                    ThreadCacheStats* stats) {
  ThreadCache::EnsureThreadSpecificDataInitialized();
  memset(reinterpret_cast<void*>(stats), 0, sizeof(ThreadCacheStats));

  internal::ScopedGuard scoped_locker(GetLock());
  if (my_thread_only) {
    auto* tcache = ThreadCache::Get();
    if (!ThreadCache::IsValid(tcache)) {
      return;
    }
    tcache->AccumulateStats(stats);
  } else {
    ThreadCache* tcache = list_head_;
    while (tcache) {
      // Racy, as other threads are still allocating. This is not an issue,
      // since we are only interested in statistics. However, this means that
      // count is not necessarily equal to hits + misses for the various types
      // of events.
      tcache->AccumulateStats(stats);
      tcache = tcache->next_;
    }
  }
}
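
// Illustrative use, assuming a caller that wants per-thread numbers (the
// pattern is not prescribed by this file):
//   ThreadCacheStats stats;
//   ThreadCacheRegistry::Instance().DumpStats(/*my_thread_only=*/true,
//                                             &stats);
// DumpStats() zeroes |stats| itself, so no initialization is needed.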

void ThreadCacheRegistry::PurgeAll() {
  auto* current_thread_tcache = ThreadCache::Get();

  // May take a while, don't hold the lock while purging.
  //
  // In most cases, the current thread is more important than other ones. For
  // instance in renderers, it is the main thread. It is also the only thread
  // that we can synchronously purge.
  //
  // We trigger the purge for this thread first because, assuming that all
  // threads are allocating memory, they will start purging concurrently in
  // the loop below. This will then make them all contend with the main thread
  // for the partition lock, since it is acquired/released once per bucket. By
  // purging the main thread first, we avoid these interferences for this
  // thread at least.
  if (ThreadCache::IsValid(current_thread_tcache)) {
    current_thread_tcache->Purge();
  }

  {
    internal::ScopedGuard scoped_locker(GetLock());
    ThreadCache* tcache = list_head_;
    while (tcache) {
      PA_DCHECK(ThreadCache::IsValid(tcache));
      // Cannot purge directly, need to ask the other thread to purge "at some
      // point".
      // Note that this will not work if the other thread is sleeping forever.
      // TODO(lizeb): Handle sleeping threads.
      if (tcache != current_thread_tcache) {
        tcache->SetShouldPurge();
      }
      tcache = tcache->next_;
    }
  }
}

void ThreadCacheRegistry::ForcePurgeAllThreadAfterForkUnsafe() {
  internal::ScopedGuard scoped_locker(GetLock());
  ThreadCache* tcache = list_head_;
  while (tcache) {
#if BUILDFLAG(PA_DCHECK_IS_ON)
    // Before fork(), locks are acquired in the parent process. This means
    // that a concurrent allocation in the parent which must be filled by the
    // central allocator (i.e. the thread cache bucket is empty) will block
    // inside the thread cache waiting for the lock to be released.
    //
    // In the child process, this allocation will never complete since this
    // thread will not be resumed. However, calling |Purge()| triggers the
    // reentrancy guard since the parent process thread was suspended from
    // within the thread cache.
    // Clear the guard to prevent this from crashing.
    tcache->is_in_thread_cache_ = false;
#endif
    // There is a PA_DCHECK() in code called from |Purge()| checking that
    // thread cache memory accounting is correct. Since we are after fork()
    // and the other threads got interrupted mid-flight, this guarantee does
    // not hold, and we get inconsistent results. Rather than giving up on
    // checking this invariant in regular code, reset it here so that the
    // PA_DCHECK() passes. See crbug.com/1216964.
    tcache->cached_memory_ = tcache->CachedMemory();

    // At this point, we should call |TryPurge()|. However, due to the thread
    // cache being possibly inconsistent at this point, this may crash. Rather
    // than crash, we'd prefer to simply not purge, even though this may leak
    // memory in some cases.
    //
    // See crbug.com/1289092 for details of the crashes.

    tcache = tcache->next_;
  }
}

void ThreadCacheRegistry::SetLargestActiveBucketIndex(
    uint8_t largest_active_bucket_index) {
  largest_active_bucket_index_ = largest_active_bucket_index;
}

void ThreadCacheRegistry::SetThreadCacheMultiplier(float multiplier) {
  // Two steps:
  // - Set the global limits, which will affect newly created threads.
  // - Enumerate all thread caches and set the limit to the global one.
  {
    internal::ScopedGuard scoped_locker(GetLock());
    ThreadCache* tcache = list_head_;

    // This may be called before *any* thread cache has serviced *any*
    // allocation, which can happen in tests, and in theory in non-test code
    // as well.
    if (!tcache) {
      return;
    }

    // Setting the global limit while locked, because we need |tcache->root_|.
    ThreadCache::SetGlobalLimits(tcache->root_, multiplier);

    while (tcache) {
      PA_DCHECK(ThreadCache::IsValid(tcache));
      for (int index = 0; index < ThreadCache::kBucketCount; index++) {
        // This is racy, but we don't care if the limit is enforced later, and
        // we really want to avoid atomic instructions on the fast path.
        tcache->buckets_[index].limit.store(ThreadCache::global_limits_[index],
                                            std::memory_order_relaxed);
      }

      tcache = tcache->next_;
    }
  }
}

void ThreadCacheRegistry::SetPurgingConfiguration(
    const internal::base::TimeDelta min_purge_interval,
    const internal::base::TimeDelta max_purge_interval,
    const internal::base::TimeDelta default_purge_interval,
    size_t min_cached_memory_for_purging_bytes) {
  PA_CHECK(min_purge_interval <= default_purge_interval);
  PA_CHECK(default_purge_interval <= max_purge_interval);
  min_purge_interval_ = min_purge_interval;
  max_purge_interval_ = max_purge_interval;
  default_purge_interval_ = default_purge_interval;
  min_cached_memory_for_purging_bytes_ = min_cached_memory_for_purging_bytes;
  is_purging_configured_ = true;
}
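
// Illustrative configuration (example values only, not the production
// defaults, which are chosen by the embedder):
//   ThreadCacheRegistry::Instance().SetPurgingConfiguration(
//       internal::base::Seconds(1),   // Minimum interval between purges.
//       internal::base::Minutes(1),   // Maximum interval.
//       internal::base::Seconds(10),  // Starting/default interval.
//       512 * 1024);                  // Don't bother purging under 512 KiB.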

void ThreadCacheRegistry::RunPeriodicPurge() {
  if (!periodic_purge_is_initialized_) {
    ThreadCache::EnsureThreadSpecificDataInitialized();
    periodic_purge_is_initialized_ = true;
  }

  PA_CHECK(is_purging_configured_);

  // Summing across all threads can be slow, but is necessary. Otherwise we
  // rely on the assumption that the current thread is a good proxy for
  // overall allocation activity. This is not the case for all process types.
  //
  // Since there is no synchronization with other threads, the value is stale,
  // which is fine.
  size_t cached_memory_approx = 0;
  {
    internal::ScopedGuard scoped_locker(GetLock());
    ThreadCache* tcache = list_head_;
    // Can run when there is no thread cache, in which case there is nothing
    // to do, and the task should not be rescheduled. This would typically
    // indicate a case where the thread cache was never enabled, or got
    // disabled.
    if (!tcache) {
      return;
    }

    while (tcache) {
      cached_memory_approx += tcache->cached_memory_;
      tcache = tcache->next_;
    }
  }

  // If cached memory is low, this means that either the memory footprint is
  // fine, or the process is mostly idle, and not allocating much since the
  // last purge. In this case, back off. On the other hand, if there is a lot
  // of cached memory, make purging more frequent, but always within a set
  // frequency range.
  //
  // There is a potential drawback: a process that was idle for a long time
  // and suddenly becomes very active will take some time to go back to a
  // regularly scheduled purge with a small enough interval. This is the case,
  // for instance, for a renderer moving to the foreground. To mitigate that,
  // if the jump in cached memory is very large, make a greater leap towards
  // faster purging.
  if (cached_memory_approx > 10 * min_cached_memory_for_purging_bytes_) {
    periodic_purge_next_interval_ =
        std::min(default_purge_interval_, periodic_purge_next_interval_ / 2);
  } else if (cached_memory_approx > 2 * min_cached_memory_for_purging_bytes_) {
    periodic_purge_next_interval_ =
        std::max(min_purge_interval_, periodic_purge_next_interval_ / 2);
  } else if (cached_memory_approx < min_cached_memory_for_purging_bytes_) {
    periodic_purge_next_interval_ =
        std::min(max_purge_interval_, periodic_purge_next_interval_ * 2);
  }
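
  // Worked example with illustrative values (min = 1s, default = 10s,
  // max = 1min, threshold = 512 KiB), starting from a 20s interval:
  // observing 10 MiB of cached memory (> 10x the threshold) moves to
  // min(10s, 10s) = 10s; observing 2 MiB halves to 10s (floored at 1s);
  // observing 256 KiB doubles to 40s (capped at 1min).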

  // Make sure that the next interval is within the right bounds. Even though
  // the logic above should eventually converge to a reasonable interval, if a
  // sleeping background thread holds onto a large amount of cached memory,
  // then |PurgeAll()| will not free any memory from it, and the first branch
  // above can be taken repeatedly until the interval gets very small, as the
  // amount of cached memory cannot change between calls (since we do not
  // purge background threads, but only ask them to purge their own cache at
  // the next allocation).
  periodic_purge_next_interval_ = std::clamp(
      periodic_purge_next_interval_, min_purge_interval_, max_purge_interval_);

  PurgeAll();
}

int64_t ThreadCacheRegistry::GetPeriodicPurgeNextIntervalInMicroseconds()
    const {
  return periodic_purge_next_interval_.InMicroseconds();
}

void ThreadCacheRegistry::ResetForTesting() {
  periodic_purge_next_interval_ = default_purge_interval_;
}

// static
void ThreadCache::EnsureThreadSpecificDataInitialized() {
  // Using the registry lock to protect from concurrent initialization without
  // adding a special-purpose lock.
  internal::ScopedGuard scoped_locker(
      ThreadCacheRegistry::Instance().GetLock());
  if (g_thread_cache_key_created) {
    return;
  }

  bool ok =
      internal::PartitionTlsCreate(&internal::g_thread_cache_key, Delete);
  PA_CHECK(ok);
  g_thread_cache_key_created = true;
}

// static
void ThreadCache::DeleteForTesting(ThreadCache* tcache) {
  ThreadCache::Delete(tcache);
}

// static
void ThreadCache::SwapForTesting(PartitionRoot* root) {
  auto* old_tcache = ThreadCache::Get();
  g_thread_cache_root.store(nullptr, std::memory_order_relaxed);
  if (old_tcache) {
    ThreadCache::DeleteForTesting(old_tcache);
  }
  if (root) {
    Init(root);
    Create(root);
  } else {
#if BUILDFLAG(IS_WIN)
    // OnDllProcessDetach accesses g_thread_cache_root, which is nullptr now.
    internal::PartitionTlsSetOnDllProcessDetach(nullptr);
#endif
  }
}

// static
void ThreadCache::RemoveTombstoneForTesting() {
  PA_CHECK(IsTombstone(Get()));
  internal::PartitionTlsSet(internal::g_thread_cache_key, nullptr);
}

// static
void ThreadCache::Init(PartitionRoot* root) {
#if BUILDFLAG(IS_NACL)
  static_assert(false, "PartitionAlloc isn't supported for NaCl");
#endif
  PA_CHECK(root->buckets[kBucketCount - 1].slot_size ==
           ThreadCache::kLargeSizeThreshold);
  PA_CHECK(root->buckets[largest_active_bucket_index_].slot_size ==
           ThreadCache::kDefaultSizeThreshold);

  EnsureThreadSpecificDataInitialized();

  // Make sure that only one PartitionRoot wants a thread cache.
  PartitionRoot* expected = nullptr;
  if (!g_thread_cache_root.compare_exchange_strong(expected, root,
                                                   std::memory_order_seq_cst,
                                                   std::memory_order_seq_cst)) {
    PA_CHECK(false)
        << "Only one PartitionRoot is allowed to have a thread cache";
  }

#if BUILDFLAG(IS_WIN)
  internal::PartitionTlsSetOnDllProcessDetach(OnDllProcessDetach);
#endif

  SetGlobalLimits(root, kDefaultMultiplier);
}
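
// Illustrative bring-up order, mirroring SwapForTesting() above: Init(root)
// runs once to claim the thread cache for |root| (the CHECK above enforces a
// single root per process), then Create(root) runs on each thread that needs
// a cache.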

// static
void ThreadCache::SetGlobalLimits(PartitionRoot* root, float multiplier) {
  size_t initial_value =
      static_cast<size_t>(kSmallBucketBaseCount) * multiplier;

  for (int index = 0; index < kBucketCount; index++) {
    const auto& root_bucket = root->buckets[index];
    // Invalid bucket.
    if (!root_bucket.active_slot_spans_head) {
      global_limits_[index] = 0;
      continue;
    }

    // Smaller allocations are more frequent, and more performance-sensitive.
    // Cache more small objects, and fewer larger ones, to save memory.
    size_t slot_size = root_bucket.slot_size;
    size_t value;
    if (slot_size <= 128) {
      value = initial_value;
    } else if (slot_size <= 256) {
      value = initial_value / 2;
    } else if (slot_size <= 512) {
      value = initial_value / 4;
    } else {
      value = initial_value / 8;
    }
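
    // Illustration, assuming kSmallBucketBaseCount == 64 and multiplier == 1
    // (example values): slot_size 96 gets a limit of 64 entries, 192 gets 32,
    // 448 gets 16, and anything above 512 gets 8.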

    // Bare minimum so that malloc() / free() in a loop will not hit the
    // central allocator each time.
    constexpr size_t kMinLimit = 1;
    // |PutInBucket()| is called on a full bucket, which should not overflow.
    constexpr size_t kMaxLimit = std::numeric_limits<uint8_t>::max() - 1;
    global_limits_[index] =
        static_cast<uint8_t>(std::clamp(value, kMinLimit, kMaxLimit));
    PA_DCHECK(global_limits_[index] >= kMinLimit);
    PA_DCHECK(global_limits_[index] <= kMaxLimit);
  }
}

// static
void ThreadCache::SetLargestCachedSize(size_t size) {
  if (size > ThreadCache::kLargeSizeThreshold) {
    size = ThreadCache::kLargeSizeThreshold;
  }
  largest_active_bucket_index_ = PartitionRoot::SizeToBucketIndex(
      size, PartitionRoot::BucketDistribution::kNeutral);
  PA_CHECK(largest_active_bucket_index_ < kBucketCount);
  ThreadCacheRegistry::Instance().SetLargestActiveBucketIndex(
      largest_active_bucket_index_);
}

// static
ThreadCache* ThreadCache::Create(PartitionRoot* root) {
  PA_CHECK(root);
  // See comment in thread_cache.h, this is used to make sure
  // kThreadCacheNeedleArray is kept in the final binary.
  PA_CHECK(tools::kThreadCacheNeedleArray[0] == tools::kNeedle1);

  // Placement new and RawAlloc() are used, as otherwise when this partition
  // is the malloc() implementation, the memory allocated for the new thread
  // cache would make this code reentrant.
  //
  // This also means that deallocation must use RawFreeStatic(), hence the
  // operator delete() implementation below.
  size_t raw_size = root->AdjustSizeForExtrasAdd(sizeof(ThreadCache));
  size_t usable_size;
  bool already_zeroed;

  auto* bucket = root->buckets + PartitionRoot::SizeToBucketIndex(
                                     raw_size, root->GetBucketDistribution());
  uintptr_t buffer = root->RawAlloc<AllocFlags::kZeroFill>(
      bucket, raw_size, internal::PartitionPageSize(), &usable_size,
      &already_zeroed);
  ThreadCache* tcache =
      new (internal::SlotStartAddr2Ptr(buffer)) ThreadCache(root);

  // This may allocate.
  internal::PartitionTlsSet(internal::g_thread_cache_key, tcache);
#if PA_CONFIG(THREAD_CACHE_FAST_TLS)
  // |thread_local| variables with destructors cause issues on some platforms.
  // Since we need a destructor (to empty the thread cache), we cannot use it
  // directly. However, TLS accesses with |thread_local| are typically faster,
  // as they can turn into a fixed offset load from a register (GS/FS on Linux
  // x86, for instance). On Windows, saving/restoring the last error increases
  // the cost as well.
  //
  // To still get good performance, use |thread_local| to store a raw pointer,
  // and rely on the platform TLS to call the destructor.
  internal::g_thread_cache = tcache;
#endif  // PA_CONFIG(THREAD_CACHE_FAST_TLS)

  return tcache;
}

ThreadCache::ThreadCache(PartitionRoot* root)
    : should_purge_(false),
      root_(root),
      thread_id_(internal::base::PlatformThread::CurrentId()),
      next_(nullptr),
      prev_(nullptr) {
  ThreadCacheRegistry::Instance().RegisterThreadCache(this);

  memset(&stats_, 0, sizeof(stats_));

  for (int index = 0; index < kBucketCount; index++) {
    const auto& root_bucket = root->buckets[index];
    Bucket* tcache_bucket = &buckets_[index];
    tcache_bucket->freelist_head = nullptr;
    tcache_bucket->count = 0;
    tcache_bucket->limit.store(global_limits_[index],
                               std::memory_order_relaxed);

    tcache_bucket->slot_size = root_bucket.slot_size;
    // Invalid bucket.
    if (!root_bucket.is_valid()) {
      // Explicitly set this, as size computations iterate over all buckets.
      tcache_bucket->limit.store(0, std::memory_order_relaxed);
    }
  }
}

ThreadCache::~ThreadCache() {
  ThreadCacheRegistry::Instance().UnregisterThreadCache(this);
  Purge();
}

// static
void ThreadCache::Delete(void* tcache_ptr) {
  auto* tcache = static_cast<ThreadCache*>(tcache_ptr);

  if (!IsValid(tcache)) {
    return;
  }

#if PA_CONFIG(THREAD_CACHE_FAST_TLS)
  internal::g_thread_cache = nullptr;
#else
  internal::PartitionTlsSet(internal::g_thread_cache_key, nullptr);
#endif

  auto* root = tcache->root_;
  tcache->~ThreadCache();
  // The ThreadCache was allocated using RawAlloc() and SlotStartAddr2Ptr(),
  // so the pointer is shifted by extras, but is MTE-tagged.
  root->RawFree(internal::SlotStartPtr2Addr(tcache_ptr));

#if BUILDFLAG(IS_WIN)
  // On Windows, allocations do occur during thread/process teardown; make
  // sure they don't resurrect the thread cache.
  //
  // Don't MTE-tag, as it'd mess with the sentinel value.
  //
  // TODO(lizeb): Investigate whether this is needed on POSIX as well.
  internal::PartitionTlsSet(internal::g_thread_cache_key,
                            reinterpret_cast<void*>(kTombstone));
#if PA_CONFIG(THREAD_CACHE_FAST_TLS)
  internal::g_thread_cache = reinterpret_cast<ThreadCache*>(kTombstone);
#endif

#endif  // BUILDFLAG(IS_WIN)
}

ThreadCache::Bucket::Bucket() {
  limit.store(0, std::memory_order_relaxed);
}

void ThreadCache::FillBucket(size_t bucket_index) {
  // Filling multiple elements from the central allocator at a time has
  // several advantages:
  // - Amortize lock acquisition
  // - Increase hit rate
  // - Can improve locality, as consecutive allocations from the central
  //   allocator will likely return close addresses, especially early on.
  //
  // However, do not take too many items, to prevent memory bloat.
  //
  // Cache filling / purging policy:
  // We aim at keeping the buckets neither empty nor full, while minimizing
  // requests to the central allocator.
  //
  // For each bucket, there is a |limit| of how many cached objects there are
  // in the bucket, so |count| < |limit| at all times.
  // - Clearing: limit -> limit / 2
  // - Filling: 0 -> limit / kBatchFillRatio
  //
  // These thresholds are somewhat arbitrary, with these considerations:
  // (1) Batched filling should not completely fill the bucket
  // (2) Batched clearing should not completely clear the bucket
  // (3) Batched filling should not be too eager
  //
  // If (1) and (2) do not hold, we risk oscillations of bucket filling /
  // clearing which would greatly increase calls to the central allocator. (3)
  // tries to keep memory usage low. So clearing half of the bucket, and
  // filling a quarter of it are sensible defaults.
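  //
  // Worked example (assuming kBatchFillRatio == 4, matching "a quarter"
  // above): with limit == 64, an empty bucket is refilled with 16 slots,
  // while a full one is trimmed back down to 32.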
  PA_INCREMENT_COUNTER(stats_.batch_fill_count);

  Bucket& bucket = buckets_[bucket_index];
  // Some buckets may have a limit lower than |kBatchFillRatio|, but we still
  // want to at least allocate a single slot, otherwise we wrongly return
  // nullptr, which ends up deactivating the bucket.
  //
  // In these cases, we do not really batch bucket filling, but this is
  // expected to be used for the largest buckets, where over-allocating is not
  // advised.
  int count = std::max(
      1, bucket.limit.load(std::memory_order_relaxed) / kBatchFillRatio);

  size_t usable_size;
  bool is_already_zeroed;

  PA_DCHECK(!root_->buckets[bucket_index].CanStoreRawSize());
  PA_DCHECK(!root_->buckets[bucket_index].is_direct_mapped());

  size_t allocated_slots = 0;
  // Same as calling RawAlloc() |count| times, but acquires the lock only
  // once.
  internal::ScopedGuard guard(internal::PartitionRootLock(root_));
  for (int i = 0; i < count; i++) {
    // Thread cache fill should not trigger expensive operations, to avoid
    // grabbing the lock for a long time needlessly, but also to avoid
    // inflating memory usage. Indeed, without
    // AllocFlags::kFastPathOrReturnNull, cache fill may activate a new
    // PartitionPage, or even a new SuperPage, which is clearly not desirable.
    //
    // |raw_size| is set to the slot size, as we don't know it. However, it is
    // only used for direct-mapped allocations and single-slot ones anyway,
    // which are not handled here.
    uintptr_t slot_start =
        root_->AllocFromBucket<AllocFlags::kFastPathOrReturnNull |
                               AllocFlags::kReturnNull>(
            &root_->buckets[bucket_index],
            root_->buckets[bucket_index].slot_size /* raw_size */,
            internal::PartitionPageSize(), &usable_size, &is_already_zeroed);

    // Either the previous allocation would require a slow path allocation, or
    // the central allocator is out of memory. If the bucket was filled with
    // some objects, then the allocation will be handled normally. Otherwise,
    // this goes to the central allocator, which will service the allocation,
    // return nullptr or crash.
    if (!slot_start) {
      break;
    }

    allocated_slots++;
    PutInBucket(bucket, slot_start);
  }

  cached_memory_ += allocated_slots * bucket.slot_size;
}

void ThreadCache::ClearBucket(Bucket& bucket, size_t limit) {
  ClearBucketHelper<true>(bucket, limit);
}

template <bool crash_on_corruption>
void ThreadCache::ClearBucketHelper(Bucket& bucket, size_t limit) {
  // Avoids acquiring the lock needlessly.
  if (!bucket.count || bucket.count <= limit) {
    return;
  }

  // This serves two purposes: error checking and avoiding stalls when
  // grabbing the lock:
  // 1. Error checking: this is pretty clear. Since this path is taken
  //    infrequently, and is going to walk the entire freelist anyway, its
  //    incremental cost should be very small. Indeed, we free from the tail
  //    of the list, so all calls here will end up walking the entire
  //    freelist, and incurring the same amount of cache misses.
  // 2. Avoiding stalls: if one of the freelist accesses in |FreeAfter()|
  //    triggers a major page fault, and we are running on a low-priority
  //    thread, we don't want the thread to be blocked while holding the lock,
  //    causing a priority inversion.
  if constexpr (crash_on_corruption) {
    bucket.freelist_head->CheckFreeListForThreadCache(bucket.slot_size);
  }

  uint8_t count_before = bucket.count;
  if (limit == 0) {
    FreeAfter<crash_on_corruption>(bucket.freelist_head, bucket.slot_size);
    bucket.freelist_head = nullptr;
  } else {
    // Free the *end* of the list, not the head, since the head contains the
    // most recently touched memory.
    auto* head = bucket.freelist_head;
    size_t items = 1;  // Cannot free the freelist head.
    while (items < limit) {
      head =
          head->GetNextForThreadCache<crash_on_corruption>(bucket.slot_size);
      items++;
    }
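    // |head| now points at the |limit|-th entry; e.g. with limit == 3, the
    // three most recently touched entries survive and everything after them
    // is freed.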
    FreeAfter<crash_on_corruption>(
        head->GetNextForThreadCache<crash_on_corruption>(bucket.slot_size),
        bucket.slot_size);
    head->SetNext(nullptr);
  }
  bucket.count = limit;
  uint8_t count_after = bucket.count;
  size_t freed_memory = (count_before - count_after) * bucket.slot_size;
  PA_DCHECK(cached_memory_ >= freed_memory);
  cached_memory_ -= freed_memory;

  PA_DCHECK(cached_memory_ == CachedMemory());
}

template <bool crash_on_corruption>
void ThreadCache::FreeAfter(internal::EncodedNextFreelistEntry* head,
                            size_t slot_size) {
  // Acquire the lock once. Deallocations from the same bucket are likely to
  // be hitting the same cache lines in the central allocator, and lock
  // acquisitions can be expensive.
  internal::ScopedGuard guard(internal::PartitionRootLock(root_));
  while (head) {
    uintptr_t slot_start = internal::SlotStartPtr2Addr(head);
    head = head->GetNextForThreadCache<crash_on_corruption>(slot_size);
    root_->RawFreeLocked(slot_start);
  }
}

void ThreadCache::ResetForTesting() {
  stats_.alloc_count = 0;
  stats_.alloc_hits = 0;
  stats_.alloc_misses = 0;

  stats_.alloc_miss_empty = 0;
  stats_.alloc_miss_too_large = 0;

  stats_.cache_fill_count = 0;
  stats_.cache_fill_hits = 0;
  stats_.cache_fill_misses = 0;

  stats_.batch_fill_count = 0;

  stats_.bucket_total_memory = 0;
  stats_.metadata_overhead = 0;

  Purge();
  PA_CHECK(cached_memory_ == 0u);
  should_purge_.store(false, std::memory_order_relaxed);
}

size_t ThreadCache::CachedMemory() const {
  size_t total = 0;
  for (const Bucket& bucket : buckets_) {
    total += bucket.count * static_cast<size_t>(bucket.slot_size);
  }

  return total;
}

void ThreadCache::AccumulateStats(ThreadCacheStats* stats) const {
  stats->alloc_count += stats_.alloc_count;
  stats->alloc_hits += stats_.alloc_hits;
  stats->alloc_misses += stats_.alloc_misses;

  stats->alloc_miss_empty += stats_.alloc_miss_empty;
  stats->alloc_miss_too_large += stats_.alloc_miss_too_large;

  stats->cache_fill_count += stats_.cache_fill_count;
  stats->cache_fill_hits += stats_.cache_fill_hits;
  stats->cache_fill_misses += stats_.cache_fill_misses;

  stats->batch_fill_count += stats_.batch_fill_count;

#if PA_CONFIG(THREAD_CACHE_ALLOC_STATS)
  for (size_t i = 0; i < internal::kNumBuckets + 1; i++) {
    stats->allocs_per_bucket_[i] += stats_.allocs_per_bucket_[i];
  }
#endif  // PA_CONFIG(THREAD_CACHE_ALLOC_STATS)

  // |cached_memory_| is not necessarily equal to |CachedMemory()| here, since
  // this function can be called racily from another thread, to collect
  // statistics. Hence no DCHECK_EQ(CachedMemory(), cached_memory_).
  stats->bucket_total_memory += cached_memory_;

  stats->metadata_overhead += sizeof(*this);
}

void ThreadCache::SetShouldPurge() {
  should_purge_.store(true, std::memory_order_relaxed);
}

void ThreadCache::Purge() {
  PA_REENTRANCY_GUARD(is_in_thread_cache_);
  PurgeInternal();
}

void ThreadCache::TryPurge() {
  PA_REENTRANCY_GUARD(is_in_thread_cache_);
  PurgeInternalHelper<false>();
}

// static
void ThreadCache::PurgeCurrentThread() {
  auto* tcache = Get();
  if (IsValid(tcache)) {
    tcache->Purge();
  }
}

void ThreadCache::PurgeInternal() {
  PurgeInternalHelper<true>();
}

void ThreadCache::ResetPerThreadAllocationStatsForTesting() {
  thread_alloc_stats_ = {};
}

template <bool crash_on_corruption>
void ThreadCache::PurgeInternalHelper() {
  should_purge_.store(false, std::memory_order_relaxed);
  // TODO(lizeb): Investigate whether lock acquisition should be less
  // frequent.
  //
  // Note: iterate over all buckets, even the inactive ones. Since
  // |largest_active_bucket_index_| can be lowered at runtime, there may be
  // memory already cached in the inactive buckets. They should still be
  // purged.
  for (auto& bucket : buckets_) {
    ClearBucketHelper<crash_on_corruption>(bucket, 0);
  }
}

}  // namespace partition_alloc