// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_ALLOCATOR_PARTITION_ALLOCATOR_SRC_PARTITION_ALLOC_THREAD_CACHE_H_
#define BASE_ALLOCATOR_PARTITION_ALLOCATOR_SRC_PARTITION_ALLOC_THREAD_CACHE_H_

#include <atomic>
#include <cstdint>
#include <limits>
#include <memory>

#include "build/build_config.h"
#include "partition_alloc/partition_alloc-inl.h"
#include "partition_alloc/partition_alloc_base/compiler_specific.h"
#include "partition_alloc/partition_alloc_base/component_export.h"
#include "partition_alloc/partition_alloc_base/debug/debugging_buildflags.h"
#include "partition_alloc/partition_alloc_base/thread_annotations.h"
#include "partition_alloc/partition_alloc_base/time/time.h"
#include "partition_alloc/partition_alloc_buildflags.h"
#include "partition_alloc/partition_alloc_config.h"
#include "partition_alloc/partition_alloc_forward.h"
#include "partition_alloc/partition_bucket_lookup.h"
#include "partition_alloc/partition_freelist_entry.h"
#include "partition_alloc/partition_lock.h"
#include "partition_alloc/partition_stats.h"
#include "partition_alloc/partition_tls.h"

#if defined(ARCH_CPU_X86_64) && BUILDFLAG(HAS_64_BIT_POINTERS)
#include <algorithm>
#include <cstring>  // For memcpy() in the poisoning code below.
#endif

namespace partition_alloc {

class ThreadCache;

namespace tools {

// This is used from ThreadCacheInspector, which runs in a different process.
// It scans the process memory looking for the two needles, to locate the
// thread cache registry instance.
//
// These two values were chosen randomly, and in particular neither is a valid
// pointer on most 64-bit architectures.
#if BUILDFLAG(HAS_64_BIT_POINTERS)
constexpr uintptr_t kNeedle1 = 0xe69e32f3ad9ea63;
constexpr uintptr_t kNeedle2 = 0x9615ee1c5eb14caf;
#else
constexpr uintptr_t kNeedle1 = 0xe69e32f3;
constexpr uintptr_t kNeedle2 = 0x9615ee1c;
#endif  // BUILDFLAG(HAS_64_BIT_POINTERS)

// This array contains, in order:
// - kNeedle1
// - &ThreadCacheRegistry::Instance()
// - kNeedle2
//
// It is referenced in the thread cache constructor to make sure it is not
// removed by the compiler. It is also not const to make sure it ends up in
// .data.
constexpr size_t kThreadCacheNeedleArraySize = 4;
extern uintptr_t kThreadCacheNeedleArray[kThreadCacheNeedleArraySize];
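
// An out-of-process scan might locate the registry along these lines
// (illustrative sketch only; |ReadRemoteWord| and the address iteration are
// hypothetical helpers of the inspecting process):
//
//   for (uintptr_t addr : AddressesOf(remote_data_section)) {
//     if (ReadRemoteWord(addr) == kNeedle1) {
//       // kNeedle2 lives a fixed number of words later; the word(s) in
//       // between include &ThreadCacheRegistry::Instance().
//     }
//   }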

class HeapDumper;
class ThreadCacheInspector;

}  // namespace tools

namespace internal {

extern PA_COMPONENT_EXPORT(PARTITION_ALLOC) PartitionTlsKey g_thread_cache_key;

#if PA_CONFIG(THREAD_CACHE_FAST_TLS)
extern PA_COMPONENT_EXPORT(
    PARTITION_ALLOC) thread_local ThreadCache* g_thread_cache;
#endif

}  // namespace internal

struct ThreadCacheLimits {
  // When trying to conserve memory, set the thread cache limit to this.
  static constexpr size_t kDefaultSizeThreshold = 512;
  // 32kiB is chosen here because, in local experiments, "zone" allocation in
  // V8 is performance-sensitive, and zones can (and do) grow up to 32kiB for
  // each individual allocation.
  static constexpr size_t kLargeSizeThreshold = 1 << 15;
  static_assert(kLargeSizeThreshold <= std::numeric_limits<uint16_t>::max(),
                "");
};

constexpr internal::base::TimeDelta kMinPurgeInterval =
    internal::base::Seconds(1);
constexpr internal::base::TimeDelta kMaxPurgeInterval =
    internal::base::Minutes(1);
constexpr internal::base::TimeDelta kDefaultPurgeInterval =
    2 * kMinPurgeInterval;
constexpr size_t kMinCachedMemoryForPurgingBytes = 500 * 1024;

// Global registry of all ThreadCache instances.
//
// This class cannot allocate in the (Un)registerThreadCache() functions, as
// they are called from the ThreadCache constructor, which is called from
// within the allocator. However the other members can allocate.
class PA_COMPONENT_EXPORT(PARTITION_ALLOC) ThreadCacheRegistry {
 public:
  static ThreadCacheRegistry& Instance();
  // Do not instantiate.
  //
  // Several things are surprising here:
  // - The constructor is public even though this is intended to be a
  //   singleton: we cannot use a "static local" variable in |Instance()| as
  //   this is reached too early during CRT initialization on Windows, meaning
  //   that static local variables don't work (as they call into the
  //   uninitialized runtime). To sidestep that, we use a regular global
  //   variable in the .cc, which is fine as this object's constructor is
  //   constexpr.
  // - Marked inline so that the chromium style plugin doesn't complain that a
  //   "complex constructor" has an inline body. This warning is disabled when
  //   the constructor is explicitly marked "inline". Note that this is a
  //   false positive of the plugin, since constexpr implies inline.
  inline constexpr ThreadCacheRegistry();

  void RegisterThreadCache(ThreadCache* cache);
  void UnregisterThreadCache(ThreadCache* cache);
  // Prints statistics for all thread caches, or only this thread's.
  void DumpStats(bool my_thread_only, ThreadCacheStats* stats);
  // Purges this thread's cache, and asks the other ones to trigger Purge() at
  // a later point (during a deallocation).
  void PurgeAll();

  // Runs `PurgeAll` and updates the next interval which
  // `GetPeriodicPurgeNextIntervalInMicroseconds` returns.
  //
  // Note that it is the caller's responsibility to invoke this member
  // function periodically with an appropriate interval. This function does
  // not schedule any task nor timer; see the sketch below.
  void RunPeriodicPurge();
  // Returns the interval to wait before invoking `RunPeriodicPurge` next
  // time.
  int64_t GetPeriodicPurgeNextIntervalInMicroseconds() const;
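
  // A sketch of the expected calling pattern (illustrative only; the timer
  // and |ScheduleTimer| below are hypothetical, as this class schedules
  // neither tasks nor timers itself):
  //
  //   void OnPurgeTimer() {
  //     auto& registry = ThreadCacheRegistry::Instance();
  //     registry.RunPeriodicPurge();
  //     ScheduleTimer(registry.GetPeriodicPurgeNextIntervalInMicroseconds(),
  //                   &OnPurgeTimer);
  //   }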

  // Controls the thread cache size, by setting the multiplier to a value above
  // or below |ThreadCache::kDefaultMultiplier|.
  void SetThreadCacheMultiplier(float multiplier);
  void SetLargestActiveBucketIndex(uint8_t largest_active_bucket_index);

  // Controls the thread cache purging configuration.
  void SetPurgingConfiguration(
      const internal::base::TimeDelta min_purge_interval,
      const internal::base::TimeDelta max_purge_interval,
      const internal::base::TimeDelta default_purge_interval,
      size_t min_cached_memory_for_purging_bytes);
  internal::base::TimeDelta min_purge_interval() const {
    return min_purge_interval_;
  }
  internal::base::TimeDelta max_purge_interval() const {
    return max_purge_interval_;
  }
  internal::base::TimeDelta default_purge_interval() const {
    return default_purge_interval_;
  }
  size_t min_cached_memory_for_purging_bytes() const {
    return min_cached_memory_for_purging_bytes_;
  }
  bool is_purging_configured() const { return is_purging_configured_; }

  static internal::Lock& GetLock() { return Instance().lock_; }
  // Purges all thread caches *now*. This is completely thread-unsafe, and
  // should only be called in a post-fork() handler.
  void ForcePurgeAllThreadAfterForkUnsafe();

  void ResetForTesting();

 private:
  friend class tools::ThreadCacheInspector;
  friend class tools::HeapDumper;

  // Not using base::Lock as the object's constructor must be constexpr.
  internal::Lock lock_;
  ThreadCache* list_head_ PA_GUARDED_BY(GetLock()) = nullptr;
  bool periodic_purge_is_initialized_ = false;
  internal::base::TimeDelta min_purge_interval_;
  internal::base::TimeDelta max_purge_interval_;
  internal::base::TimeDelta default_purge_interval_;
  size_t min_cached_memory_for_purging_bytes_ = 0u;
  internal::base::TimeDelta periodic_purge_next_interval_;
  bool is_purging_configured_ = false;

  uint8_t largest_active_bucket_index_ = internal::BucketIndexLookup::GetIndex(
      ThreadCacheLimits::kDefaultSizeThreshold);
};

constexpr ThreadCacheRegistry::ThreadCacheRegistry() = default;

#if PA_CONFIG(THREAD_CACHE_ENABLE_STATISTICS)
#define PA_INCREMENT_COUNTER(counter) ++counter
#else
#define PA_INCREMENT_COUNTER(counter) \
  do {                                \
  } while (0)
#endif  // PA_CONFIG(THREAD_CACHE_ENABLE_STATISTICS)

#if BUILDFLAG(PA_DCHECK_IS_ON)

namespace internal {

class ReentrancyGuard {
 public:
  explicit ReentrancyGuard(bool& flag) : flag_(flag) {
    PA_CHECK(!flag_);
    flag_ = true;
  }

  ~ReentrancyGuard() { flag_ = false; }

 private:
  bool& flag_;
};

}  // namespace internal

#define PA_REENTRANCY_GUARD(x)      \
  internal::ReentrancyGuard guard { \
    x                               \
  }

#else  // BUILDFLAG(PA_DCHECK_IS_ON)

#define PA_REENTRANCY_GUARD(x) \
  do {                         \
  } while (0)

#endif  // BUILDFLAG(PA_DCHECK_IS_ON)
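
// With the guard in place, a re-entrant call into the thread cache on the
// same thread crashes in DCHECK builds instead of corrupting state. An
// illustrative sketch (|ThreadCacheOperation| is hypothetical):
//
//   void ThreadCacheOperation() {
//     PA_REENTRANCY_GUARD(is_in_thread_cache_);  // PA_CHECKs the flag, then
//                                                // sets it for this scope.
//     // Any nested call that reaches another PA_REENTRANCY_GUARD on the
//     // same flag hits PA_CHECK(!flag_) and crashes immediately.
//   }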

// Per-thread cache. *Not* threadsafe, must only be accessed from a single
// thread.
//
// In practice, this is easily enforced as long as only |instance| is
// manipulated, as it is a thread_local member. As such, any
// |ThreadCache::instance->*()| call will necessarily be done from a single
// thread.
class PA_COMPONENT_EXPORT(PARTITION_ALLOC) ThreadCache {
 public:
  struct Bucket {
    internal::EncodedNextFreelistEntry* freelist_head = nullptr;
    // Want to keep sizeof(Bucket) small, using small types.
    uint8_t count = 0;
    std::atomic<uint8_t> limit{};  // Can be changed from another thread.
    uint16_t slot_size = 0;

    Bucket();
  };

  // Initializes the thread cache for |root|. May allocate, so should be
  // called with the thread cache disabled on the partition side, and without
  // the partition lock held.
  //
  // May only be called by a single PartitionRoot.
  static void Init(PartitionRoot* root);

  static void DeleteForTesting(ThreadCache* tcache);

  // Deletes the existing thread cache and creates a new one for |root|.
  static void SwapForTesting(PartitionRoot* root);

  // Removes the tombstone marker that would otherwise be returned by Get().
  static void RemoveTombstoneForTesting();

  // Can be called several times; must be called before any ThreadCache
  // interactions.
  static void EnsureThreadSpecificDataInitialized();

  static ThreadCache* Get() {
#if PA_CONFIG(THREAD_CACHE_FAST_TLS)
    return internal::g_thread_cache;
#else
    // This region isn't MTE-tagged.
    return reinterpret_cast<ThreadCache*>(
        internal::PartitionTlsGet(internal::g_thread_cache_key));
#endif
  }

  static bool IsValid(ThreadCache* tcache) {
    // Do not MTE-untag, as it'd mess up the sentinel value.
    return reinterpret_cast<uintptr_t>(tcache) & kTombstoneMask;
  }

  static bool IsTombstone(ThreadCache* tcache) {
    // Do not MTE-untag, as it'd mess up the sentinel value.
    return reinterpret_cast<uintptr_t>(tcache) == kTombstone;
  }
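
  // Typical use of the accessors above (illustrative sketch; the real call
  // sites live in the allocator's fast paths):
  //
  //   ThreadCache* tcache = ThreadCache::Get();
  //   if (PA_LIKELY(ThreadCache::IsValid(tcache))) {
  //     // Safe to use |tcache|: it is neither nullptr nor kTombstone.
  //   } else if (ThreadCache::IsTombstone(tcache)) {
  //     // The thread is being terminated; do not create a new cache.
  //   }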

  // Creates a new ThreadCache associated with |root|.
  // Must be called without the partition locked, as this may allocate.
  static ThreadCache* Create(PartitionRoot* root);

  ~ThreadCache();

  // Force placement new.
  void* operator new(size_t) = delete;
  void* operator new(size_t, void* buffer) { return buffer; }
  void operator delete(void* ptr) = delete;
  ThreadCache(const ThreadCache&) = delete;
  ThreadCache(const ThreadCache&&) = delete;
  ThreadCache& operator=(const ThreadCache&) = delete;

  // Tries to put a slot at |slot_start| into the cache.
  // The slot comes from the bucket at index |bucket_index| from the partition
  // this cache is for.
  //
  // Returns true if the slot was put in the cache, and false otherwise. This
  // can happen either because the cache is full or the allocation was too
  // large.
  PA_ALWAYS_INLINE bool MaybePutInCache(uintptr_t slot_start,
                                        size_t bucket_index,
                                        size_t* slot_size);

  // Tries to allocate a memory slot from the cache.
  // Returns 0 on failure.
  //
  // Has the same behavior as RawAlloc(), that is: no cookie or ref-count
  // handling. Sets |slot_size| to the allocated size upon success.
  PA_ALWAYS_INLINE uintptr_t GetFromCache(size_t bucket_index,
                                          size_t* slot_size);
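
  // A sketch of how a PartitionRoot might drive the two functions above on
  // its fast paths (illustrative only; |tcache|, |slot_start| and
  // |bucket_index| are assumed to come from the surrounding code):
  //
  //   size_t slot_size;
  //   // Free path: try to stash the slot in the cache; on failure, return
  //   // the slot to the central allocator instead.
  //   bool cached = tcache->MaybePutInCache(slot_start, bucket_index,
  //                                         &slot_size);
  //   // Allocation path: 0 means a cache miss; fall back to the central
  //   // allocator.
  //   uintptr_t address = tcache->GetFromCache(bucket_index, &slot_size);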

  // Asks this cache to trigger |Purge()| at a later point. Can be called from
  // any thread.
  void SetShouldPurge();
  // Empties the cache.
  // The Partition lock must *not* be held when calling this.
  // Must be called from the thread this cache is for.
  void Purge();
  // |TryPurge| is the same as |Purge|, except that |TryPurge| will
  // not crash if the thread cache is inconsistent. Normally inconsistency
  // is a sign of a bug somewhere, so |Purge| should be preferred in most
  // cases.
  void TryPurge();
  // Amount of cached memory for this thread's cache, in bytes.
  size_t CachedMemory() const;
  void AccumulateStats(ThreadCacheStats* stats) const;

  // Purges the thread cache of the current thread, if one exists.
  static void PurgeCurrentThread();

  const ThreadAllocStats& thread_alloc_stats() const {
    return thread_alloc_stats_;
  }
  size_t bucket_count_for_testing(size_t index) const {
    return buckets_[index].count;
  }

  internal::base::PlatformThreadId thread_id() const { return thread_id_; }

  // Sets the maximum size of allocations that may be cached by the thread
  // cache. This applies to all threads. However, the maximum size is bounded
  // by |kLargeSizeThreshold|.
  static void SetLargestCachedSize(size_t size);

  // Cumulative stats about *all* allocations made on the `root_` partition on
  // this thread, that is, not only the allocations serviced by the thread
  // cache, but all allocations, including large and direct-mapped ones. This
  // should in theory be split into a separate PerThread data structure, but
  // the thread cache is the only per-thread data we have as of now.
  //
  // TODO(lizeb): Investigate adding a proper per-thread data structure.
  PA_ALWAYS_INLINE void RecordAllocation(size_t size);
  PA_ALWAYS_INLINE void RecordDeallocation(size_t size);
  void ResetPerThreadAllocationStatsForTesting();

  // Fill 1 / kBatchFillRatio * bucket.limit slots at a time.
  static constexpr uint16_t kBatchFillRatio = 8;
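
  // For example, with bucket.limit == 64 and kBatchFillRatio == 8,
  // FillBucket() grabs 8 slots from the central allocator at a time.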

  // Limit for the smallest bucket will be kDefaultMultiplier *
  // kSmallBucketBaseCount by default.
  static constexpr float kDefaultMultiplier = 2.;
  static constexpr uint8_t kSmallBucketBaseCount = 64;

  static constexpr size_t kDefaultSizeThreshold =
      ThreadCacheLimits::kDefaultSizeThreshold;
  static constexpr size_t kLargeSizeThreshold =
      ThreadCacheLimits::kLargeSizeThreshold;
  static constexpr uint16_t kBucketCount =
      internal::BucketIndexLookup::GetIndex(ThreadCache::kLargeSizeThreshold) +
      1;
  static_assert(
      kBucketCount < internal::kNumBuckets,
      "Cannot have more cached buckets than what the allocator supports");

  const ThreadCache* prev_for_testing() const
      PA_EXCLUSIVE_LOCKS_REQUIRED(ThreadCacheRegistry::GetLock()) {
    return prev_;
  }
  const ThreadCache* next_for_testing() const
      PA_EXCLUSIVE_LOCKS_REQUIRED(ThreadCacheRegistry::GetLock()) {
    return next_;
  }

  ThreadCacheStats& stats_for_testing() { return stats_; }

  Bucket& bucket_for_testing(size_t index) { return buckets_[index]; }
  void ClearBucketForTesting(Bucket& bucket, size_t limit) {
    ClearBucket(bucket, limit);
  }

 private:
  friend class tools::HeapDumper;
  friend class tools::ThreadCacheInspector;

  static_assert(sizeof(Bucket) <= 2 * sizeof(void*), "Keep Bucket small.");

  explicit ThreadCache(PartitionRoot* root);
  static void Delete(void* thread_cache_ptr);

  void PurgeInternal();
  template <bool crash_on_corruption>
  void PurgeInternalHelper();

  // Fills a bucket from the central allocator.
  void FillBucket(size_t bucket_index);
  // Empties the |bucket| until there are at most |limit| objects in it.
  template <bool crash_on_corruption>
  void ClearBucketHelper(Bucket& bucket, size_t limit);
  void ClearBucket(Bucket& bucket, size_t limit);
  PA_ALWAYS_INLINE void PutInBucket(Bucket& bucket, uintptr_t slot_start);
  void ResetForTesting();
  // Releases the entire freelist starting at |head| to the root.
  template <bool crash_on_corruption>
  void FreeAfter(internal::EncodedNextFreelistEntry* head, size_t slot_size);
  static void SetGlobalLimits(PartitionRoot* root, float multiplier);

  // On some architectures, ThreadCache::Get() can be called and return
  // something after the thread cache has been destroyed. In this case, we set
  // it to this value, to signal that the thread is being terminated, and that
  // the thread cache should not be used.
  //
  // This happens in particular on Windows, during program termination.
  //
  // We choose 0x1 as the value as it is an invalid pointer value: it is not
  // aligned, and points too low. Also, checking !(ptr & kTombstoneMask)
  // checks for nullptr and kTombstone at the same time.
  static constexpr uintptr_t kTombstone = 0x1;
  static constexpr uintptr_t kTombstoneMask = ~kTombstone;
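
  // Indeed, 0x0 & kTombstoneMask == 0 (nullptr) and 0x1 & kTombstoneMask == 0
  // (kTombstone), while any valid, aligned pointer has a bit set above bit 0,
  // so (ptr & kTombstoneMask) != 0 for it.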

  static uint8_t global_limits_[kBucketCount];
  // Index of the largest active bucket. Not all processes/platforms will use
  // all buckets, as using larger buckets increases the memory footprint.
  //
  // TODO(lizeb): Investigate making this per-thread rather than static, to
  // improve locality, and open the door to per-thread settings.
  static uint16_t largest_active_bucket_index_;

  // These are at the beginning as they're accessed for each allocation.
  uint32_t cached_memory_ = 0;
  std::atomic<bool> should_purge_;
  ThreadCacheStats stats_;
  ThreadAllocStats thread_alloc_stats_;

  // Buckets are quite big, though each is only 2 pointers.
  Bucket buckets_[kBucketCount];

  // Cold data below.
  PartitionRoot* const root_;

  const internal::base::PlatformThreadId thread_id_;
#if BUILDFLAG(PA_DCHECK_IS_ON)
  bool is_in_thread_cache_ = false;
#endif

  // Intrusive list since ThreadCacheRegistry::RegisterThreadCache() cannot
  // allocate.
  ThreadCache* next_ PA_GUARDED_BY(ThreadCacheRegistry::GetLock());
  ThreadCache* prev_ PA_GUARDED_BY(ThreadCacheRegistry::GetLock());

  friend class ThreadCacheRegistry;
  friend class PartitionAllocThreadCacheTest;
  friend class tools::ThreadCacheInspector;
};

PA_ALWAYS_INLINE bool ThreadCache::MaybePutInCache(uintptr_t slot_start,
                                                   size_t bucket_index,
                                                   size_t* slot_size) {
  PA_REENTRANCY_GUARD(is_in_thread_cache_);
  PA_INCREMENT_COUNTER(stats_.cache_fill_count);

  if (PA_UNLIKELY(bucket_index > largest_active_bucket_index_)) {
    PA_INCREMENT_COUNTER(stats_.cache_fill_misses);
    return false;
  }

  auto& bucket = buckets_[bucket_index];

  PA_DCHECK(bucket.count != 0 || bucket.freelist_head == nullptr);

  PutInBucket(bucket, slot_start);
  cached_memory_ += bucket.slot_size;
  PA_INCREMENT_COUNTER(stats_.cache_fill_hits);

  // Relaxed ordering: we don't care about having an up-to-date or consistent
  // value, just want it to not change while we are using it, hence using
  // relaxed ordering, and loading into a local variable. Without it, we are
  // gambling that the compiler would not issue multiple loads.
  uint8_t limit = bucket.limit.load(std::memory_order_relaxed);
  // Batched deallocation, amortizing lock acquisitions.
  if (PA_UNLIKELY(bucket.count > limit)) {
    ClearBucket(bucket, limit / 2);
  }
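
  // For example, with limit == 64 the bucket is cleared down to 32 entries
  // once the count exceeds 64, so (absent intervening allocations) a batched
  // release to the central allocator happens only once every ~32 frees.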

  if (PA_UNLIKELY(should_purge_.load(std::memory_order_relaxed))) {
    PurgeInternal();
  }

  *slot_size = bucket.slot_size;
  return true;
}

PA_ALWAYS_INLINE uintptr_t ThreadCache::GetFromCache(size_t bucket_index,
                                                     size_t* slot_size) {
#if PA_CONFIG(THREAD_CACHE_ALLOC_STATS)
  stats_.allocs_per_bucket_[bucket_index]++;
#endif

  PA_REENTRANCY_GUARD(is_in_thread_cache_);
  PA_INCREMENT_COUNTER(stats_.alloc_count);
  // Only handle "small" allocations.
  if (PA_UNLIKELY(bucket_index > largest_active_bucket_index_)) {
    PA_INCREMENT_COUNTER(stats_.alloc_miss_too_large);
    PA_INCREMENT_COUNTER(stats_.alloc_misses);
    return 0;
  }

  auto& bucket = buckets_[bucket_index];
  if (PA_LIKELY(bucket.freelist_head)) {
    PA_INCREMENT_COUNTER(stats_.alloc_hits);
  } else {
    PA_DCHECK(bucket.count == 0);
    PA_INCREMENT_COUNTER(stats_.alloc_miss_empty);
    PA_INCREMENT_COUNTER(stats_.alloc_misses);

    FillBucket(bucket_index);

    // Very unlikely, means that the central allocator is out of memory. Let
    // it deal with it (may return 0, may crash).
    if (PA_UNLIKELY(!bucket.freelist_head)) {
      return 0;
    }
  }

  PA_DCHECK(bucket.count != 0);
  internal::EncodedNextFreelistEntry* entry = bucket.freelist_head;
  // TODO(lizeb): Consider removing once crbug.com/1382658 is fixed.
#if BUILDFLAG(IS_CHROMEOS) && defined(ARCH_CPU_X86_64) && \
    BUILDFLAG(HAS_64_BIT_POINTERS)
  // The x86_64 architecture now supports 57 bits of address space, as of Ice
  // Lake for Intel. However, Chrome OS systems do not ship with kernel
  // support for it, but with 48 bits, so all canonical addresses have the
  // upper 16 bits zeroed (17 in practice, since the upper half of address
  // space is reserved by the kernel).
  constexpr uintptr_t kCanonicalPointerMask = (1ULL << 48) - 1;
  PA_CHECK(!(reinterpret_cast<uintptr_t>(entry) & ~kCanonicalPointerMask));
#endif  // BUILDFLAG(IS_CHROMEOS) && defined(ARCH_CPU_X86_64) &&
        // BUILDFLAG(HAS_64_BIT_POINTERS)

  // Passes the bucket size to |GetNext()|, so that in case of freelist
  // corruption, we know the bucket size that led to the crash, helping to
  // narrow down the search for the culprit. |bucket| was touched just now, so
  // this does not introduce another cache miss.
  internal::EncodedNextFreelistEntry* next =
      entry->GetNextForThreadCache<true>(bucket.slot_size);
  PA_DCHECK(entry != next);
  bucket.count--;
  PA_DCHECK(bucket.count != 0 || !next);
  bucket.freelist_head = next;
  *slot_size = bucket.slot_size;

  PA_DCHECK(cached_memory_ >= bucket.slot_size);
  cached_memory_ -= bucket.slot_size;

  return internal::SlotStartPtr2Addr(entry);
}

PA_ALWAYS_INLINE void ThreadCache::PutInBucket(Bucket& bucket,
                                               uintptr_t slot_start) {
#if PA_CONFIG(HAS_FREELIST_SHADOW_ENTRY) && defined(ARCH_CPU_X86_64) && \
    BUILDFLAG(HAS_64_BIT_POINTERS)
  // We see freelist corruption crashes happening in the wild. These are
  // likely due to out-of-bounds accesses in the previous slot, or to a
  // Use-After-Free somewhere in the code.
  //
  // The issue is that we detect the UaF far away from the place where it
  // happens. As a consequence, we should try to make incorrect code crash as
  // early as possible. Poisoning memory at free() time works for UaF, but it
  // was seen in the past to incur a high performance cost.
  //
  // Here, only poison the current cacheline, which we are touching anyway.
  // TODO(lizeb): Make sure this does not hurt performance.

  // Everything below requires this alignment.
  static_assert(internal::kAlignment == 16, "");

  // The pointer is always 16-byte aligned, so slot_start % 16 == 0. Its
  // distance to the next cacheline, in bytes, is
  // `64 - ((slot_start & 63) / 16) * 16`, computed below in 16-byte units.
  static_assert(
      internal::kPartitionCachelineSize == 64,
      "The computation below assumes that cache lines are 64 bytes long.");
  int distance_to_next_cacheline_in_16_bytes = 4 - ((slot_start >> 4) & 3);
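
  // For instance, if slot_start % 64 == 32, then ((slot_start >> 4) & 3) == 2
  // and the distance is 4 - 2 = 2 units, i.e. 32 bytes to the next cacheline.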
  int slot_size_remaining_in_16_bytes =
#if BUILDFLAG(PUT_REF_COUNT_IN_PREVIOUS_SLOT)
      // When BRP is on in the "previous slot" mode, this slot may have a BRP
      // ref-count of the next, potentially allocated slot. Make sure we don't
      // overwrite it.
      (bucket.slot_size - sizeof(PartitionRefCount)) / 16;
#else
      bucket.slot_size / 16;
#endif  // BUILDFLAG(PUT_REF_COUNT_IN_PREVIOUS_SLOT)

  slot_size_remaining_in_16_bytes = std::min(
      slot_size_remaining_in_16_bytes, distance_to_next_cacheline_in_16_bytes);

  static const uint32_t poison_16_bytes[4] = {0xbadbad00, 0xbadbad00,
                                              0xbadbad00, 0xbadbad00};
  // Give a hint to the compiler in hope it'll vectorize the loop.
#if PA_HAS_BUILTIN(__builtin_assume_aligned)
  void* slot_start_tagged = __builtin_assume_aligned(
      internal::SlotStartAddr2Ptr(slot_start), internal::kAlignment);
#else
  void* slot_start_tagged = internal::SlotStartAddr2Ptr(slot_start);
#endif
  uint32_t* address_aligned = static_cast<uint32_t*>(slot_start_tagged);
  for (int i = 0; i < slot_size_remaining_in_16_bytes; i++) {
    // Clang will expand the memcpy to a 16-byte write (movups on x86).
    memcpy(address_aligned, poison_16_bytes, sizeof(poison_16_bytes));
    address_aligned += 4;
  }
#endif  // PA_CONFIG(HAS_FREELIST_SHADOW_ENTRY) && defined(ARCH_CPU_X86_64) &&
        // BUILDFLAG(HAS_64_BIT_POINTERS)

  auto* entry =
      internal::EncodedNextFreelistEntry::EmplaceAndInitForThreadCache(
          slot_start, bucket.freelist_head);
  bucket.freelist_head = entry;
  bucket.count++;
}

PA_ALWAYS_INLINE void ThreadCache::RecordAllocation(size_t size) {
  thread_alloc_stats_.alloc_count++;
  thread_alloc_stats_.alloc_total_size += size;
}

PA_ALWAYS_INLINE void ThreadCache::RecordDeallocation(size_t size) {
  thread_alloc_stats_.dealloc_count++;
  thread_alloc_stats_.dealloc_total_size += size;
}

}  // namespace partition_alloc

#endif  // BASE_ALLOCATOR_PARTITION_ALLOCATOR_SRC_PARTITION_ALLOC_THREAD_CACHE_H_