1 // Copyright 2018 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/allocator/partition_allocator/partition_bucket.h"
6 
7 #include <algorithm>
8 #include <cstdint>
9 #include <tuple>
10 
11 #include "base/allocator/partition_allocator/address_pool_manager.h"
12 #include "base/allocator/partition_allocator/freeslot_bitmap.h"
13 #include "base/allocator/partition_allocator/freeslot_bitmap_constants.h"
14 #include "base/allocator/partition_allocator/oom.h"
15 #include "base/allocator/partition_allocator/page_allocator.h"
16 #include "base/allocator/partition_allocator/page_allocator_constants.h"
17 #include "base/allocator/partition_allocator/partition_address_space.h"
18 #include "base/allocator/partition_allocator/partition_alloc.h"
19 #include "base/allocator/partition_allocator/partition_alloc_base/bits.h"
20 #include "base/allocator/partition_allocator/partition_alloc_base/compiler_specific.h"
21 #include "base/allocator/partition_allocator/partition_alloc_base/component_export.h"
22 #include "base/allocator/partition_allocator/partition_alloc_base/debug/alias.h"
23 #include "base/allocator/partition_allocator/partition_alloc_base/debug/debugging_buildflags.h"
24 #include "base/allocator/partition_allocator/partition_alloc_base/immediate_crash.h"
25 #include "base/allocator/partition_allocator/partition_alloc_base/thread_annotations.h"
26 #include "base/allocator/partition_allocator/partition_alloc_buildflags.h"
27 #include "base/allocator/partition_allocator/partition_alloc_check.h"
28 #include "base/allocator/partition_allocator/partition_alloc_config.h"
29 #include "base/allocator/partition_allocator/partition_alloc_constants.h"
30 #include "base/allocator/partition_allocator/partition_alloc_forward.h"
31 #include "base/allocator/partition_allocator/partition_direct_map_extent.h"
32 #include "base/allocator/partition_allocator/partition_oom.h"
33 #include "base/allocator/partition_allocator/partition_page.h"
34 #include "base/allocator/partition_allocator/reservation_offset_table.h"
35 #include "base/allocator/partition_allocator/tagging.h"
36 #include "build/build_config.h"
37 
38 #if BUILDFLAG(USE_STARSCAN)
39 #include "base/allocator/partition_allocator/starscan/pcscan.h"
40 #endif
41 
42 namespace partition_alloc::internal {
43 
44 namespace {
45 
46 #if PA_CONFIG(ENABLE_SHADOW_METADATA)
47 PA_ALWAYS_INLINE uintptr_t ShadowMetadataStart(uintptr_t super_page,
48                                                pool_handle pool) {
49   uintptr_t shadow_metadata_start =
50       super_page + SystemPageSize() + ShadowPoolOffset(pool);
51   PA_DCHECK(!PartitionAddressSpace::IsInRegularPool(shadow_metadata_start));
52   PA_DCHECK(!PartitionAddressSpace::IsInBRPPool(shadow_metadata_start));
53   return shadow_metadata_start;
54 }
55 #endif
56 
57 template <bool thread_safe>
58 [[noreturn]] PA_NOINLINE void PartitionOutOfMemoryMappingFailure(
59     PartitionRoot<thread_safe>* root,
60     size_t size) PA_LOCKS_EXCLUDED(root->lock_) {
61   PA_NO_CODE_FOLDING();
62   root->OutOfMemory(size);
63   PA_IMMEDIATE_CRASH();  // Not required, kept as documentation.
64 }
65 
66 template <bool thread_safe>
67 [[noreturn]] PA_NOINLINE void PartitionOutOfMemoryCommitFailure(
68     PartitionRoot<thread_safe>* root,
69     size_t size) PA_LOCKS_EXCLUDED(root->lock_) {
70   PA_NO_CODE_FOLDING();
71   root->OutOfMemory(size);
72   PA_IMMEDIATE_CRASH();  // Not required, kept as documentation.
73 }
74 
75 #if !BUILDFLAG(HAS_64_BIT_POINTERS) && BUILDFLAG(ENABLE_BACKUP_REF_PTR_SUPPORT)
76 // |start| has to be aligned to kSuperPageSize, but |end| doesn't. This means
77 // that a partial super page is allowed at the end. Since the block list uses
78 // kSuperPageSize granularity, a partial super page is considered blocked if
79 // there is a raw_ptr<T> pointing anywhere in that super page, even if it
80 // doesn't point to that partially allocated region.
81 bool AreAllowedSuperPagesForBRPPool(uintptr_t start, uintptr_t end) {
82   PA_DCHECK(!(start % kSuperPageSize));
83   for (uintptr_t super_page = start; super_page < end;
84        super_page += kSuperPageSize) {
85     // If any blocked super page is found inside the given memory region,
86     // the memory region is blocked.
87     if (!AddressPoolManagerBitmap::IsAllowedSuperPageForBRPPool(super_page)) {
88       AddressPoolManagerBitmap::IncrementBlocklistHitCount();
89       return false;
90     }
91   }
92   return true;
93 }
94 #endif  // !BUILDFLAG(HAS_64_BIT_POINTERS) &&
95         // BUILDFLAG(ENABLE_BACKUP_REF_PTR_SUPPORT)
96 
97 // Reserves |requested_size| worth of super pages from the specified pool.
98 // If BRP pool is requested this function will honor BRP block list.
99 //
100 // The returned address will be aligned to kSuperPageSize, and so
101 // |requested_address| should be. |requested_size| doesn't have to be, however.
102 //
103 // |requested_address| is merely a hint, which will be attempted, but easily
104 // given up on if it doesn't work the first time.
105 //
106 // The function doesn't need to hold root->lock_ or any other locks, because:
107 // - It (1) reserves memory, (2) then consults AreAllowedSuperPagesForBRPPool
108 //   for that memory, and (3) returns the memory if
109 //   allowed, or unreserves and decommits if not allowed. So no other
110 //   overlapping region can be allocated while executing
111 //   AreAllowedSuperPagesForBRPPool.
112 // - IsAllowedSuperPageForBRPPool (used by AreAllowedSuperPagesForBRPPool) is
113 //   designed to not need locking.
114 uintptr_t ReserveMemoryFromPool(pool_handle pool,
115                                 uintptr_t requested_address,
116                                 size_t requested_size) {
117   PA_DCHECK(!(requested_address % kSuperPageSize));
118 
119   uintptr_t reserved_address = AddressPoolManager::GetInstance().Reserve(
120       pool, requested_address, requested_size);
121 
122   // In 32-bit mode, when allocating from BRP pool, verify that the requested
123   // allocation honors the block list. Find a better address otherwise.
124 #if !BUILDFLAG(HAS_64_BIT_POINTERS) && BUILDFLAG(ENABLE_BACKUP_REF_PTR_SUPPORT)
125   if (pool == kBRPPoolHandle) {
126     constexpr int kMaxRandomAddressTries = 10;
127     for (int i = 0; i < kMaxRandomAddressTries; ++i) {
128       if (!reserved_address ||
129           AreAllowedSuperPagesForBRPPool(reserved_address,
130                                          reserved_address + requested_size)) {
131         break;
132       }
133       AddressPoolManager::GetInstance().UnreserveAndDecommit(
134           pool, reserved_address, requested_size);
135       // No longer try to honor |requested_address|, because it didn't work for
136       // us last time.
137       reserved_address =
138           AddressPoolManager::GetInstance().Reserve(pool, 0, requested_size);
139     }
140 
141     // If the allocation attempt succeeds, we will break out of the following
142     // loop immediately.
143     //
144     // Last resort: sequentially scan the whole 32-bit address space. The number
145     // of blocked super-pages should be very small, so we expect to practically
146     // never need to run the following code. Note that it may fail to find an
147     // available super page, e.g., when it becomes available after the scan
148     // passes through it, but we accept the risk.
149     for (uintptr_t address_to_try = kSuperPageSize; address_to_try != 0;
150          address_to_try += kSuperPageSize) {
151       if (!reserved_address ||
152           AreAllowedSuperPagesForBRPPool(reserved_address,
153                                          reserved_address + requested_size)) {
154         break;
155       }
156       AddressPoolManager::GetInstance().UnreserveAndDecommit(
157           pool, reserved_address, requested_size);
158       // Reserve() can return a different pointer than attempted.
159       reserved_address = AddressPoolManager::GetInstance().Reserve(
160           pool, address_to_try, requested_size);
161     }
162 
163     // If the loop ends naturally, the last allocated region hasn't been
164     // verified. Do it now.
165     if (reserved_address &&
166         !AreAllowedSuperPagesForBRPPool(reserved_address,
167                                         reserved_address + requested_size)) {
168       AddressPoolManager::GetInstance().UnreserveAndDecommit(
169           pool, reserved_address, requested_size);
170       reserved_address = 0;
171     }
172   }
173 #endif  // !BUILDFLAG(HAS_64_BIT_POINTERS) &&
174         // BUILDFLAG(ENABLE_BACKUP_REF_PTR_SUPPORT)
175 
176 #if !BUILDFLAG(HAS_64_BIT_POINTERS)
177   // Only mark the region as belonging to the pool after it has passed the
178   // blocklist check in order to avoid a potential race with destructing a
179   // raw_ptr<T> object that points to non-PA memory in another thread.
180   // If `MarkUsed` was called earlier, the other thread could incorrectly
181   // determine that the allocation had come from PartitionAlloc.
182   if (reserved_address) {
183     AddressPoolManager::GetInstance().MarkUsed(pool, reserved_address,
184                                                requested_size);
185   }
186 #endif
187 
188   PA_DCHECK(!(reserved_address % kSuperPageSize));
189   return reserved_address;
190 }
191 
192 template <bool thread_safe>
193 SlotSpanMetadata<thread_safe>* PartitionDirectMap(
194     PartitionRoot<thread_safe>* root,
195     unsigned int flags,
196     size_t raw_size,
197     size_t slot_span_alignment) {
198   PA_DCHECK((slot_span_alignment >= PartitionPageSize()) &&
199             base::bits::IsPowerOfTwo(slot_span_alignment));
200 
201   // No static EXCLUSIVE_LOCKS_REQUIRED(), as the checker doesn't understand
202   // scoped unlocking.
203   root->lock_.AssertAcquired();
204 
205   const bool return_null = flags & AllocFlags::kReturnNull;
206   if (PA_UNLIKELY(raw_size > MaxDirectMapped())) {
207     if (return_null) {
208       return nullptr;
209     }
210 
211     // The lock is here to protect PA from:
212     // 1. Concurrent calls
213     // 2. Reentrant calls
214     //
215     // This is fine here however, as:
216     // 1. Concurrency: |PartitionRoot::OutOfMemory()| never returns, so the lock
217     //    will not be re-acquired, which would lead to acting on inconsistent
218     //    data that could have been modified in-between releasing and acquiring
219     //    it.
220     // 2. Reentrancy: This is why we release the lock. On some platforms,
221     //    terminating the process may free() memory, or even possibly try to
222     //    allocate some. Calling free() is fine, but will deadlock since
223     //    |PartitionRoot::lock_| is not recursive.
224     //
225     // Supporting reentrant calls properly is hard, and not a requirement for
226     // PA. However up to that point, we've only *read* data, not *written* to
227     // any state. Reentrant calls are then fine, especially as we don't continue
228     // on this path. The only downside is possibly endless recursion if the OOM
229     // handler allocates and fails to use UncheckedMalloc() or equivalent, but
230     // that's violating the contract of base::TerminateBecauseOutOfMemory().
231     ScopedUnlockGuard unlock{root->lock_};
232     PartitionExcessiveAllocationSize(raw_size);
233   }
234 
235   PartitionDirectMapExtent<thread_safe>* map_extent = nullptr;
236   PartitionPage<thread_safe>* page = nullptr;
237 
238   {
239     // Getting memory for direct-mapped allocations doesn't interact with the
240     // rest of the allocator, but takes a long time, as it involves several
241     // system calls. Although no mmap() (or equivalent) calls are made on
242     // 64 bit systems, page permissions are changed with mprotect(), which is
243     // a syscall.
244     //
245     // These calls are almost always slow (at least a couple us per syscall on a
246     // desktop Linux machine), and they also have a very long latency tail,
247     // possibly from getting descheduled. As a consequence, we should not hold
248     // the lock when performing a syscall. This is not the only problematic
249     // location, but since this one doesn't interact with the rest of the
250     // allocator, we can safely drop and then re-acquire the lock.
251     //
252     // Note that this only affects allocations that are not served out of the
253     // thread cache, but as a simple example the buffer partition in blink is
254     // frequently used for large allocations (e.g. ArrayBuffer), and frequent,
255     // small ones (e.g. WTF::String), and does not have a thread cache.
256     ScopedUnlockGuard scoped_unlock{root->lock_};
257 
258     const size_t slot_size =
259         PartitionRoot<thread_safe>::GetDirectMapSlotSize(raw_size);
260     // The super page starts with a partition page worth of metadata and guard
261     // pages, hence alignment requests ==PartitionPageSize() will be
262     // automatically satisfied. Padding is needed for higher-order alignment
263     // requests. Note, |slot_span_alignment| is at least 1 partition page.
264     const size_t padding_for_alignment =
265         slot_span_alignment - PartitionPageSize();
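    // Illustrative example (added for clarity, not from the original source;
    // it assumes the common 16 KiB partition page and 2 MiB super page sizes):
    // for slot_span_alignment = 64 KiB, padding_for_alignment is
    // 64 KiB - 16 KiB = 48 KiB, and the slot start computed below becomes
    // reservation_start + 16 KiB (metadata/guard partition page) + 48 KiB =
    // reservation_start + 64 KiB, which is 64 KiB-aligned because
    // reservation_start is kSuperPageSize-aligned.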
266     const size_t reservation_size =
267         PartitionRoot<thread_safe>::GetDirectMapReservationSize(
268             raw_size + padding_for_alignment);
269 #if BUILDFLAG(PA_DCHECK_IS_ON)
270     const size_t available_reservation_size =
271         reservation_size - padding_for_alignment -
272         PartitionRoot<thread_safe>::GetDirectMapMetadataAndGuardPagesSize();
273     PA_DCHECK(slot_size <= available_reservation_size);
274 #endif
275 
276     pool_handle pool = root->ChoosePool();
277     uintptr_t reservation_start;
278     {
279       // Reserving memory from the pool is actually not a syscall on 64 bit
280       // platforms.
281 #if !BUILDFLAG(HAS_64_BIT_POINTERS)
282       ScopedSyscallTimer timer{root};
283 #endif
284       reservation_start = ReserveMemoryFromPool(pool, 0, reservation_size);
285     }
286     if (PA_UNLIKELY(!reservation_start)) {
287       if (return_null) {
288         return nullptr;
289       }
290 
291       PartitionOutOfMemoryMappingFailure(root, reservation_size);
292     }
293 
294     root->total_size_of_direct_mapped_pages.fetch_add(
295         reservation_size, std::memory_order_relaxed);
296 
297     // Shift by 1 partition page (metadata + guard pages) and alignment padding.
298     const uintptr_t slot_start =
299         reservation_start + PartitionPageSize() + padding_for_alignment;
300 
301     {
302       ScopedSyscallTimer timer{root};
303       RecommitSystemPages(reservation_start + SystemPageSize(),
304                           SystemPageSize(),
305 #if PA_CONFIG(ENABLE_SHADOW_METADATA)
306                           root->PageAccessibilityWithPkeyIfEnabled(
307                               PageAccessibilityConfiguration::kRead),
308 #else
309                           root->PageAccessibilityWithPkeyIfEnabled(
310                               PageAccessibilityConfiguration::kReadWrite),
311 #endif
312                           PageAccessibilityDisposition::kRequireUpdate);
313     }
314 
315 #if BUILDFLAG(PUT_REF_COUNT_IN_PREVIOUS_SLOT)
316     // If PUT_REF_COUNT_IN_PREVIOUS_SLOT is on, and if the BRP pool is
317     // used, allocate a SystemPage for RefCount "bitmap" (only one of its
318     // elements will be used).
319     if (pool == kBRPPoolHandle) {
320       ScopedSyscallTimer timer{root};
321       RecommitSystemPages(reservation_start + SystemPageSize() * 2,
322                           SystemPageSize(),
323                           root->PageAccessibilityWithPkeyIfEnabled(
324                               PageAccessibilityConfiguration::kReadWrite),
325                           PageAccessibilityDisposition::kRequireUpdate);
326     }
327 #endif
328 
329 #if PA_CONFIG(ENABLE_SHADOW_METADATA)
330     {
331       ScopedSyscallTimer timer{root};
332       RecommitSystemPages(ShadowMetadataStart(reservation_start, pool),
333                           SystemPageSize(),
334                           root->PageAccessibilityWithPkeyIfEnabled(
335                               PageAccessibilityConfiguration::kReadWrite),
336                           PageAccessibilityDisposition::kRequireUpdate);
337     }
338 #endif
339 
340     // No need to hold root->lock_. Now that memory is reserved, no other
341     // overlapping region can be allocated (because of how pools work),
342     // so no other thread can update the same offset table entries at the
343     // same time. Furthermore, nobody will be reading these offsets until this
344     // function returns.
345     uintptr_t address_start = reservation_start;
346     uintptr_t address_end = address_start + reservation_size;
347     auto* offset_ptr = ReservationOffsetPointer(address_start);
348     uint16_t offset = 0;
349     while (address_start < address_end) {
350       PA_DCHECK(offset_ptr < GetReservationOffsetTableEnd(address_start));
351       PA_DCHECK(offset < kOffsetTagNormalBuckets);
352       *offset_ptr++ = offset++;
353       address_start += kSuperPageSize;
354     }
355 
356     auto* super_page_extent =
357         PartitionSuperPageToExtent<thread_safe>(reservation_start);
358     super_page_extent->root = root;
359     // The new structures are all located inside a fresh system page so they
360     // will all be zeroed out. These DCHECKs are for documentation and to assert
361     // our expectations of the kernel.
362     PA_DCHECK(!super_page_extent->number_of_consecutive_super_pages);
363     PA_DCHECK(!super_page_extent->next);
364 
365     PartitionPage<thread_safe>* first_page =
366         reinterpret_cast<PartitionPage<thread_safe>*>(super_page_extent) + 1;
367     page = PartitionPage<thread_safe>::FromAddr(slot_start);
368     // |first_page| and |page| may be equal, if there is no alignment padding.
369     if (page != first_page) {
370       PA_DCHECK(page > first_page);
371       PA_DCHECK(page - first_page <=
372                 PartitionPage<thread_safe>::kMaxSlotSpanMetadataOffset);
373       PA_CHECK(!first_page->is_valid);
374       first_page->has_valid_span_after_this = true;
375       first_page->slot_span_metadata_offset = page - first_page;
376     }
377     auto* metadata =
378         reinterpret_cast<PartitionDirectMapMetadata<thread_safe>*>(page);
379     // Since direct map metadata is larger than PartitionPage, make sure the
380     // first and the last bytes are on the same system page, i.e. within the
381     // super page metadata region.
382     PA_DCHECK(base::bits::AlignDown(reinterpret_cast<uintptr_t>(metadata),
383                                     SystemPageSize()) ==
384               base::bits::AlignDown(
385                   reinterpret_cast<uintptr_t>(metadata) +
386                       sizeof(PartitionDirectMapMetadata<thread_safe>) - 1,
387                   SystemPageSize()));
388     PA_DCHECK(page == &metadata->page);
389     page->is_valid = true;
390     PA_DCHECK(!page->has_valid_span_after_this);
391     PA_DCHECK(!page->slot_span_metadata_offset);
392     PA_DCHECK(!page->slot_span_metadata.next_slot_span);
393     PA_DCHECK(!page->slot_span_metadata.marked_full);
394     PA_DCHECK(!page->slot_span_metadata.num_allocated_slots);
395     PA_DCHECK(!page->slot_span_metadata.num_unprovisioned_slots);
396     PA_DCHECK(!page->slot_span_metadata.in_empty_cache());
397 
398     PA_DCHECK(!metadata->subsequent_page.subsequent_page_metadata.raw_size);
399     // Raw size is set later, by the caller.
400     metadata->subsequent_page.slot_span_metadata_offset = 1;
401 
402     PA_DCHECK(!metadata->bucket.active_slot_spans_head);
403     PA_DCHECK(!metadata->bucket.empty_slot_spans_head);
404     PA_DCHECK(!metadata->bucket.decommitted_slot_spans_head);
405     PA_DCHECK(!metadata->bucket.num_system_pages_per_slot_span);
406     PA_DCHECK(!metadata->bucket.num_full_slot_spans);
407     metadata->bucket.slot_size = slot_size;
408 
409     new (&page->slot_span_metadata)
410         SlotSpanMetadata<thread_safe>(&metadata->bucket);
411 
412     // It is typically possible to map a large range of inaccessible pages, and
413     // this is leveraged in multiple places, including the pools. However,
414     // this doesn't mean that we can commit all this memory.  For the vast
415     // majority of allocations, this just means that we crash in a slightly
416     // different place, but for callers ready to handle failures, we have to
417     // return nullptr. See crbug.com/1187404.
418     //
419     // Note that we didn't check above, because if we cannot even commit a
420     // single page, then this is likely hopeless anyway, and we will crash very
421     // soon.
422     const bool ok = root->TryRecommitSystemPagesForData(
423         slot_start, slot_size, PageAccessibilityDisposition::kRequireUpdate);
424     if (!ok) {
425       if (!return_null) {
426         PartitionOutOfMemoryCommitFailure(root, slot_size);
427       }
428 
429       {
430         ScopedSyscallTimer timer{root};
431 #if !BUILDFLAG(HAS_64_BIT_POINTERS)
432         AddressPoolManager::GetInstance().MarkUnused(pool, reservation_start,
433                                                      reservation_size);
434 #endif
435         AddressPoolManager::GetInstance().UnreserveAndDecommit(
436             pool, reservation_start, reservation_size);
437       }
438 
439       root->total_size_of_direct_mapped_pages.fetch_sub(
440           reservation_size, std::memory_order_relaxed);
441 
442       return nullptr;
443     }
444 
445     auto* next_entry = PartitionFreelistEntry::EmplaceAndInitNull(slot_start);
446     page->slot_span_metadata.SetFreelistHead(next_entry);
447 
448     map_extent = &metadata->direct_map_extent;
449     map_extent->reservation_size = reservation_size;
450     map_extent->padding_for_alignment = padding_for_alignment;
451     map_extent->bucket = &metadata->bucket;
452   }
453 
454   root->lock_.AssertAcquired();
455 
456   // Maintain the doubly-linked list of all direct mappings.
457   map_extent->next_extent = root->direct_map_list;
458   if (map_extent->next_extent) {
459     map_extent->next_extent->prev_extent = map_extent;
460   }
461   map_extent->prev_extent = nullptr;
462   root->direct_map_list = map_extent;
463 
464   return &page->slot_span_metadata;
465 }
466 
467 uint8_t ComputeSystemPagesPerSlotSpanPreferSmall(size_t slot_size) {
468   if (slot_size > MaxRegularSlotSpanSize()) {
469     // This is technically not needed, as for now all the larger slot sizes are
470     // multiples of the system page size.
471     return base::bits::AlignUp(slot_size, SystemPageSize()) / SystemPageSize();
472   }
473 
474   // Smaller slot spans waste less address space and can potentially lower
475   // fragmentation:
476   // - Address space: This comes from fuller SuperPages (since the tail end of a
477   //   SuperPage is more likely to be used when the slot span is smaller). Also,
478   //   if a slot span is partially used, a smaller slot span will use less
479   //   address space.
480   // - In-slot fragmentation: Slot span management code will prioritize
481   //   almost-full slot spans, as well as trying to keep empty slot spans
482   //   empty. The more granular this logic can work, the better.
483   //
484   // Since metadata space overhead is constant per-PartitionPage, keeping
485   // smaller slot spans makes sense.
486   //
487   // Underlying memory allocation is done per-PartitionPage, but memory commit
488   // is done per system page. This means that we prefer to fill the entirety of
489   // a PartitionPage with a slot span, but we can tolerate some system pages
490   // being empty at the end, as these will not cost committed or dirty memory.
491   //
492   // The choice below is, for multi-slot slot spans:
493   // - If a full PartitionPage slot span is possible with less than 2% of a
494   //   *single* system page wasted, use it. The smallest possible size wins.
495   // - Otherwise, select the size with the smallest virtual address space
496   //   loss. Allow a SlotSpan to leave some slack in its PartitionPage, up to
497   //   1/4 of the total.
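  // Worked example (illustrative only; assumes 4 KiB system pages and 16 KiB
  // partition pages, i.e. 4 system pages per partition page): for
  // slot_size = 352, one partition page wastes 16384 % 352 = 192 bytes, more
  // than 2% of a system page (~82 bytes), so the first loop keeps going; two
  // partition pages waste 32768 % 352 = 32 bytes, which passes the check, so
  // 8 system pages per slot span are returned.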
498   for (size_t partition_page_count = 1;
499        partition_page_count <= kMaxPartitionPagesPerRegularSlotSpan;
500        partition_page_count++) {
501     size_t candidate_size = partition_page_count * PartitionPageSize();
502     size_t waste = candidate_size % slot_size;
503     if (waste <= .02 * SystemPageSize()) {
504       return partition_page_count * NumSystemPagesPerPartitionPage();
505     }
506   }
507 
508   size_t best_count = 0;
509   size_t best_waste = std::numeric_limits<size_t>::max();
510   for (size_t partition_page_count = 1;
511        partition_page_count <= kMaxPartitionPagesPerRegularSlotSpan;
512        partition_page_count++) {
513     // Prefer no slack.
514     for (size_t slack = 0; slack < partition_page_count; slack++) {
515       size_t system_page_count =
516           partition_page_count * NumSystemPagesPerPartitionPage() - slack;
517       size_t candidate_size = system_page_count * SystemPageSize();
518       size_t waste = candidate_size % slot_size;
519       if (waste < best_waste) {
520         best_waste = waste;
521         best_count = system_page_count;
522       }
523     }
524   }
525   return best_count;
526 }
527 
528 uint8_t ComputeSystemPagesPerSlotSpanInternal(size_t slot_size) {
529   // This works out reasonably for the current bucket sizes of the generic
530   // allocator, and the current values of partition page size and constants.
531   // Specifically, we have enough room to always pack the slots perfectly into
532   // some number of system pages. The only waste is the waste associated with
533   // unfaulted pages (i.e. wasted address space).
534   // TODO: we end up using a lot of system pages for very small sizes. For
535   // example, we'll use 12 system pages for slot size 24. The slot size is so
536   // small that the waste would be tiny with just 4, or 1, system pages.  Later,
537   // we can investigate whether there are anti-fragmentation benefits to using
538   // fewer system pages.
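  // Illustration of the TODO above (hedged; assumes 4 KiB system pages, 4
  // system pages per partition page, and an 8-byte PTE estimate): for
  // slot_size = 24, using 3 pages wastes 12288 % 24 = 0 bytes of payload plus
  // 8 bytes for the 1 unfaulted page, whereas 12 pages waste nothing at all,
  // so the loop below settles on 12 system pages even though 3 or 4 would
  // waste very little memory.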
539   double best_waste_ratio = 1.0f;
540   uint16_t best_pages = 0;
541   if (slot_size > MaxRegularSlotSpanSize()) {
542     // TODO(ajwong): Why is there a DCHECK here for this?
543     // http://crbug.com/776537
544     PA_DCHECK(!(slot_size % SystemPageSize()));
545     best_pages = static_cast<uint16_t>(slot_size >> SystemPageShift());
546     PA_CHECK(best_pages <= std::numeric_limits<uint8_t>::max());
547     return static_cast<uint8_t>(best_pages);
548   }
549   PA_DCHECK(slot_size <= MaxRegularSlotSpanSize());
550   for (uint16_t i = NumSystemPagesPerPartitionPage() - 1;
551        i <= MaxSystemPagesPerRegularSlotSpan(); ++i) {
552     size_t page_size = i << SystemPageShift();
553     size_t num_slots = page_size / slot_size;
554     size_t waste = page_size - (num_slots * slot_size);
555     // Leaving a page unfaulted is not free; the page will occupy an empty page
556     // table entry.  Make a simple attempt to account for that.
557     //
558     // TODO(ajwong): This looks wrong. PTEs are allocated for all pages
559     // regardless of whether or not they are wasted. Should it just
560     // be waste += i * sizeof(void*)?
561     // http://crbug.com/776537
562     size_t num_remainder_pages = i & (NumSystemPagesPerPartitionPage() - 1);
563     size_t num_unfaulted_pages =
564         num_remainder_pages
565             ? (NumSystemPagesPerPartitionPage() - num_remainder_pages)
566             : 0;
567     waste += sizeof(void*) * num_unfaulted_pages;
568     double waste_ratio =
569         static_cast<double>(waste) / static_cast<double>(page_size);
570     if (waste_ratio < best_waste_ratio) {
571       best_waste_ratio = waste_ratio;
572       best_pages = i;
573     }
574   }
575   PA_DCHECK(best_pages > 0);
576   PA_CHECK(best_pages <= MaxSystemPagesPerRegularSlotSpan());
577   return static_cast<uint8_t>(best_pages);
578 }
579 
580 }  // namespace
581 
582 uint8_t ComputeSystemPagesPerSlotSpan(size_t slot_size,
583                                       bool prefer_smaller_slot_spans) {
584   if (prefer_smaller_slot_spans) {
585     size_t system_page_count =
586         ComputeSystemPagesPerSlotSpanPreferSmall(slot_size);
587     size_t waste = (system_page_count * SystemPageSize()) % slot_size;
588     // In case the waste is too large (more than 5% of a page), don't try to use
589     // the "small" slot span formula. This happens when we have a lot of
590     // buckets; in some cases the formula doesn't find a nice, small size.
591     if (waste <= .05 * SystemPageSize()) {
592       return system_page_count;
593     }
594   }
595 
596   return ComputeSystemPagesPerSlotSpanInternal(slot_size);
597 }
598 
599 template <bool thread_safe>
600 void PartitionBucket<thread_safe>::Init(uint32_t new_slot_size) {
601   slot_size = new_slot_size;
602   slot_size_reciprocal = kReciprocalMask / new_slot_size + 1;
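  // Hedged note (added; not in the original source): this is a fixed-point
  // approximation of 1 / slot_size, presumably so that offset / slot_size can
  // be computed elsewhere as a multiply followed by a shift, roughly
  // (offset * slot_size_reciprocal) >> kReciprocalShift, where kReciprocalShift
  // is assumed to be the precision matching kReciprocalMask.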
603   active_slot_spans_head =
604       SlotSpanMetadata<thread_safe>::get_sentinel_slot_span_non_const();
605   empty_slot_spans_head = nullptr;
606   decommitted_slot_spans_head = nullptr;
607   num_full_slot_spans = 0;
608   bool prefer_smaller_slot_spans =
609 #if PA_CONFIG(PREFER_SMALLER_SLOT_SPANS)
610       true
611 #else
612       false
613 #endif
614       ;
615   num_system_pages_per_slot_span =
616       ComputeSystemPagesPerSlotSpan(slot_size, prefer_smaller_slot_spans);
617 }
618 
619 template <bool thread_safe>
620 PA_ALWAYS_INLINE SlotSpanMetadata<thread_safe>*
621 PartitionBucket<thread_safe>::AllocNewSlotSpan(PartitionRoot<thread_safe>* root,
622                                                unsigned int flags,
623                                                size_t slot_span_alignment) {
624   PA_DCHECK(!(root->next_partition_page % PartitionPageSize()));
625   PA_DCHECK(!(root->next_partition_page_end % PartitionPageSize()));
626 
627   size_t num_partition_pages = get_pages_per_slot_span();
628   size_t slot_span_reservation_size = num_partition_pages
629                                       << PartitionPageShift();
630   size_t slot_span_committed_size = get_bytes_per_span();
631   PA_DCHECK(num_partition_pages <= NumPartitionPagesPerSuperPage());
632   PA_DCHECK(slot_span_committed_size % SystemPageSize() == 0);
633   PA_DCHECK(slot_span_committed_size <= slot_span_reservation_size);
634 
635   uintptr_t adjusted_next_partition_page =
636       base::bits::AlignUp(root->next_partition_page, slot_span_alignment);
637   if (PA_UNLIKELY(adjusted_next_partition_page + slot_span_reservation_size >
638                   root->next_partition_page_end)) {
639     // AllocNewSuperPage() may crash (e.g. address space exhaustion), put data
640     // on stack.
641     PA_DEBUG_DATA_ON_STACK("slotsize", slot_size);
642     PA_DEBUG_DATA_ON_STACK("spansize", slot_span_reservation_size);
643 
644     // In this case, we can no longer hand out pages from the current super page
645     // allocation. Get a new super page.
646     if (!AllocNewSuperPage(root, flags)) {
647       return nullptr;
648     }
649     // AllocNewSuperPage() updates root->next_partition_page, re-query.
650     adjusted_next_partition_page =
651         base::bits::AlignUp(root->next_partition_page, slot_span_alignment);
652     PA_CHECK(adjusted_next_partition_page + slot_span_reservation_size <=
653              root->next_partition_page_end);
654   }
655 
656   auto* gap_start_page =
657       PartitionPage<thread_safe>::FromAddr(root->next_partition_page);
658   auto* gap_end_page =
659       PartitionPage<thread_safe>::FromAddr(adjusted_next_partition_page);
660   for (auto* page = gap_start_page; page < gap_end_page; ++page) {
661     PA_DCHECK(!page->is_valid);
662     page->has_valid_span_after_this = 1;
663   }
664   root->next_partition_page =
665       adjusted_next_partition_page + slot_span_reservation_size;
666 
667   uintptr_t slot_span_start = adjusted_next_partition_page;
668   auto* slot_span = &gap_end_page->slot_span_metadata;
669   InitializeSlotSpan(slot_span);
670   // Now that slot span is initialized, it's safe to call FromSlotStart.
671   PA_DCHECK(slot_span ==
672             SlotSpanMetadata<thread_safe>::FromSlotStart(slot_span_start));
673 
674   // System pages in the super page come in a decommitted state. Commit them
675   // before vending them back.
676   // If lazy commit is enabled, pages will be committed when provisioning slots,
677   // in ProvisionMoreSlotsAndAllocOne(), not here.
678   if (!kUseLazyCommit) {
679     PA_DEBUG_DATA_ON_STACK("slotsize", slot_size);
680     PA_DEBUG_DATA_ON_STACK("spansize", slot_span_reservation_size);
681     PA_DEBUG_DATA_ON_STACK("spancmt", slot_span_committed_size);
682 
683     root->RecommitSystemPagesForData(
684         slot_span_start, slot_span_committed_size,
685         PageAccessibilityDisposition::kRequireUpdate);
686   }
687 
688   PA_CHECK(get_slots_per_span() <=
689            SlotSpanMetadata<ThreadSafe>::kMaxSlotsPerSlotSpan);
690 
691   // Double check that we had enough space in the super page for the new slot
692   // span.
693   PA_DCHECK(root->next_partition_page <= root->next_partition_page_end);
694 
695   return slot_span;
696 }
697 
698 template <bool thread_safe>
699 uintptr_t PartitionBucket<thread_safe>::AllocNewSuperPageSpan(
700     PartitionRoot<thread_safe>* root,
701     size_t super_page_count,
702     unsigned int flags) {
703   PA_CHECK(super_page_count > 0);
704   PA_CHECK(super_page_count <=
705            std::numeric_limits<size_t>::max() / kSuperPageSize);
706   // Need a new super page. We want to allocate super pages in a contiguous
707   // address region as much as possible. This is important for not causing
708   // page table bloat and not fragmenting address spaces in 32 bit
709   // architectures.
710   uintptr_t requested_address = root->next_super_page;
711   pool_handle pool = root->ChoosePool();
712   uintptr_t super_page_span_start = ReserveMemoryFromPool(
713       pool, requested_address, super_page_count * kSuperPageSize);
714   if (PA_UNLIKELY(!super_page_span_start)) {
715     if (flags & AllocFlags::kReturnNull) {
716       return 0;
717     }
718 
719     // Didn't manage to get a new uncommitted super page -> address space issue.
720     ::partition_alloc::internal::ScopedUnlockGuard unlock{root->lock_};
721     PartitionOutOfMemoryMappingFailure(root, kSuperPageSize);
722   }
723 
724   uintptr_t super_page_span_end =
725       super_page_span_start + super_page_count * kSuperPageSize;
726   for (uintptr_t super_page = super_page_span_start;
727        super_page < super_page_span_end; super_page += kSuperPageSize) {
728     InitializeSuperPage(root, super_page, 0);
729   }
730   return super_page_span_start;
731 }
732 
733 template <bool thread_safe>
734 PA_ALWAYS_INLINE uintptr_t PartitionBucket<thread_safe>::AllocNewSuperPage(
735     PartitionRoot<thread_safe>* root,
736     unsigned int flags) {
737   auto super_page = AllocNewSuperPageSpan(root, 1, flags);
738   if (PA_UNLIKELY(!super_page)) {
739     // If the `kReturnNull` flag isn't set and the allocation attempt fails,
740     // `AllocNewSuperPageSpan` should've failed with an OOM crash.
741     PA_DCHECK(flags & AllocFlags::kReturnNull);
742     return 0;
743   }
744   return SuperPagePayloadBegin(super_page, root->IsQuarantineAllowed());
745 }
746 
747 template <bool thread_safe>
748 PA_ALWAYS_INLINE uintptr_t PartitionBucket<thread_safe>::InitializeSuperPage(
749     PartitionRoot<thread_safe>* root,
750     uintptr_t super_page,
751     uintptr_t requested_address) {
752   *ReservationOffsetPointer(super_page) = kOffsetTagNormalBuckets;
753 
754   root->total_size_of_super_pages.fetch_add(kSuperPageSize,
755                                             std::memory_order_relaxed);
756 
757   root->next_super_page = super_page + kSuperPageSize;
758   uintptr_t state_bitmap =
759       super_page + PartitionPageSize() +
760       (is_direct_mapped() ? 0 : ReservedFreeSlotBitmapSize());
761 #if BUILDFLAG(USE_STARSCAN)
762   PA_DCHECK(SuperPageStateBitmapAddr(super_page) == state_bitmap);
763   const size_t state_bitmap_reservation_size =
764       root->IsQuarantineAllowed() ? ReservedStateBitmapSize() : 0;
765   const size_t state_bitmap_size_to_commit =
766       root->IsQuarantineAllowed() ? CommittedStateBitmapSize() : 0;
767   PA_DCHECK(state_bitmap_reservation_size % PartitionPageSize() == 0);
768   PA_DCHECK(state_bitmap_size_to_commit % SystemPageSize() == 0);
769   PA_DCHECK(state_bitmap_size_to_commit <= state_bitmap_reservation_size);
770   uintptr_t payload = state_bitmap + state_bitmap_reservation_size;
771 #else
772   uintptr_t payload = state_bitmap;
773 #endif  // BUILDFLAG(USE_STARSCAN)
774 
775   root->next_partition_page = payload;
776   root->next_partition_page_end = root->next_super_page - PartitionPageSize();
777   PA_DCHECK(payload ==
778             SuperPagePayloadBegin(super_page, root->IsQuarantineAllowed()));
779   PA_DCHECK(root->next_partition_page_end == SuperPagePayloadEnd(super_page));
780 
781   // Keep the first partition page in the super page inaccessible to serve as a
782   // guard page, except an "island" in the middle where we put page metadata and
783   // also a tiny amount of extent metadata.
784   {
785     ScopedSyscallTimer timer{root};
786     RecommitSystemPages(super_page + SystemPageSize(), SystemPageSize(),
787 #if PA_CONFIG(ENABLE_SHADOW_METADATA)
788                         root->PageAccessibilityWithPkeyIfEnabled(
789                             PageAccessibilityConfiguration::kRead),
790 #else
791                         root->PageAccessibilityWithPkeyIfEnabled(
792                             PageAccessibilityConfiguration::kReadWrite),
793 #endif
794                         PageAccessibilityDisposition::kRequireUpdate);
795   }
796 
797 #if BUILDFLAG(PUT_REF_COUNT_IN_PREVIOUS_SLOT)
798   // If PUT_REF_COUNT_IN_PREVIOUS_SLOT is on, and if the BRP pool is
799   // used, allocate a SystemPage for RefCount bitmap.
800   if (root->ChoosePool() == kBRPPoolHandle) {
801     ScopedSyscallTimer timer{root};
802     RecommitSystemPages(super_page + SystemPageSize() * 2, SystemPageSize(),
803                         root->PageAccessibilityWithPkeyIfEnabled(
804                             PageAccessibilityConfiguration::kReadWrite),
805                         PageAccessibilityDisposition::kRequireUpdate);
806   }
807 #endif
808 
809 #if PA_CONFIG(ENABLE_SHADOW_METADATA)
810   {
811     ScopedSyscallTimer timer{root};
812     RecommitSystemPages(ShadowMetadataStart(super_page, root->ChoosePool()),
813                         SystemPageSize(),
814                         root->PageAccessibilityWithPkeyIfEnabled(
815                             PageAccessibilityConfiguration::kReadWrite),
816                         PageAccessibilityDisposition::kRequireUpdate);
817   }
818 #endif
819 
820   // If we were after a specific address, but didn't get it, assume that
821   // the system chose a lousy address. Here most OS'es have a default
822   // algorithm that isn't randomized. For example, most Linux
823   // distributions will allocate the mapping directly before the last
824   // successful mapping, which is far from random. So we just get fresh
825   // randomness for the next mapping attempt.
826   if (requested_address && requested_address != super_page) {
827     root->next_super_page = 0;
828   }
829 
830   // We allocated a new super page so update super page metadata.
831   // First check if this is a new extent or not.
832   auto* latest_extent = PartitionSuperPageToExtent<thread_safe>(super_page);
833   // By storing the root in every extent metadata object, we have a fast way
834   // to go from a pointer within the partition to the root object.
835   latest_extent->root = root;
836   // Most new extents will be part of a larger extent, and these two fields
837   // are unused, but we initialize them to 0 so that we get a clear signal
838   // in case they are accidentally used.
839   latest_extent->number_of_consecutive_super_pages = 0;
840   latest_extent->next = nullptr;
841   latest_extent->number_of_nonempty_slot_spans = 0;
842 
843   PartitionSuperPageExtentEntry<thread_safe>* current_extent =
844       root->current_extent;
845   const bool is_new_extent = super_page != requested_address;
846   if (PA_UNLIKELY(is_new_extent)) {
847     if (PA_UNLIKELY(!current_extent)) {
848       PA_DCHECK(!root->first_extent);
849       root->first_extent = latest_extent;
850     } else {
851       PA_DCHECK(current_extent->number_of_consecutive_super_pages);
852       current_extent->next = latest_extent;
853     }
854     root->current_extent = latest_extent;
855     latest_extent->number_of_consecutive_super_pages = 1;
856   } else {
857     // We allocated next to an existing extent so just nudge the size up a
858     // little.
859     PA_DCHECK(current_extent->number_of_consecutive_super_pages);
860     ++current_extent->number_of_consecutive_super_pages;
861     PA_DCHECK(payload > SuperPagesBeginFromExtent(current_extent) &&
862               payload < SuperPagesEndFromExtent(current_extent));
863   }
864 
865   // If PCScan is used, commit the state bitmap. Otherwise, leave it uncommitted
866   // and let PartitionRoot::RegisterScannableRoot() commit it when needed. Make
867   // sure to register the super-page after it has been fully initialized.
868   // Otherwise, the concurrent scanner may try to access |extent->root| which
869   // might not be initialized yet.
870 #if BUILDFLAG(USE_STARSCAN)
871   if (root->IsQuarantineEnabled()) {
872     {
873       ScopedSyscallTimer timer{root};
874       RecommitSystemPages(state_bitmap, state_bitmap_size_to_commit,
875                           root->PageAccessibilityWithPkeyIfEnabled(
876                               PageAccessibilityConfiguration::kReadWrite),
877                           PageAccessibilityDisposition::kRequireUpdate);
878     }
879     PCScan::RegisterNewSuperPage(root, super_page);
880   }
881 #endif  // BUILDFLAG(USE_STARSCAN)
882 
883 #if BUILDFLAG(USE_FREESLOT_BITMAP)
884   // Commit the pages for freeslot bitmap.
885   if (!is_direct_mapped()) {
886     uintptr_t freeslot_bitmap_addr = super_page + PartitionPageSize();
887     PA_DCHECK(SuperPageFreeSlotBitmapAddr(super_page) == freeslot_bitmap_addr);
888     ScopedSyscallTimer timer{root};
889     RecommitSystemPages(freeslot_bitmap_addr, CommittedFreeSlotBitmapSize(),
890                         root->PageAccessibilityWithPkeyIfEnabled(
891                             PageAccessibilityConfiguration::kReadWrite),
892                         PageAccessibilityDisposition::kRequireUpdate);
893   }
894 #endif
895 
896   return payload;
897 }
898 
899 template <bool thread_safe>
900 PA_ALWAYS_INLINE void PartitionBucket<thread_safe>::InitializeSlotSpan(
901     SlotSpanMetadata<thread_safe>* slot_span) {
902   new (slot_span) SlotSpanMetadata<thread_safe>(this);
903 
904   slot_span->Reset();
905 
906   uint16_t num_partition_pages = get_pages_per_slot_span();
907   auto* page = reinterpret_cast<PartitionPage<thread_safe>*>(slot_span);
908   for (uint16_t i = 0; i < num_partition_pages; ++i, ++page) {
909     PA_DCHECK(i <= PartitionPage<thread_safe>::kMaxSlotSpanMetadataOffset);
910     page->slot_span_metadata_offset = i;
911     page->is_valid = true;
912   }
913 }
914 
915 template <bool thread_safe>
916 PA_ALWAYS_INLINE uintptr_t
917 PartitionBucket<thread_safe>::ProvisionMoreSlotsAndAllocOne(
918     PartitionRoot<thread_safe>* root,
919     SlotSpanMetadata<thread_safe>* slot_span) {
920   PA_DCHECK(slot_span !=
921             SlotSpanMetadata<thread_safe>::get_sentinel_slot_span());
922   size_t num_slots = slot_span->num_unprovisioned_slots;
923   PA_DCHECK(num_slots);
924   PA_DCHECK(num_slots <= get_slots_per_span());
925   // We should only get here when _every_ slot is either used or unprovisioned.
926   // (The third possible state is "on the freelist". If we have a non-empty
927   // freelist, we should not get here.)
928   PA_DCHECK(num_slots + slot_span->num_allocated_slots == get_slots_per_span());
929   // Similarly, explicitly check that the freelist is empty.
930   PA_DCHECK(!slot_span->get_freelist_head());
931   PA_DCHECK(!slot_span->is_full());
932 
933   uintptr_t slot_span_start =
934       SlotSpanMetadata<thread_safe>::ToSlotSpanStart(slot_span);
935   // If we got here, the first unallocated slot is either partially or fully on
936   // an uncommitted page. If the latter, it must be at the start of that page.
937   uintptr_t return_slot =
938       slot_span_start + (slot_size * slot_span->num_allocated_slots);
939   uintptr_t next_slot = return_slot + slot_size;
940   uintptr_t commit_start = base::bits::AlignUp(return_slot, SystemPageSize());
941   PA_DCHECK(next_slot > commit_start);
942   uintptr_t commit_end = base::bits::AlignUp(next_slot, SystemPageSize());
943   // If the slot was partially committed, |return_slot| and |next_slot| fall
944   // in different pages. If the slot was fully uncommitted, |return_slot| points
945   // to the page start and |next_slot| doesn't, thus only the latter gets
946   // rounded up.
947   PA_DCHECK(commit_end > commit_start);
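  // Illustrative numbers (added; assume 4 KiB system pages and
  // slot_size = 3072): with 5 slots already allocated, return_slot is at
  // offset 15360 and next_slot at 18432, so commit_start rounds up to 16384
  // and commit_end to 20480. Only the tail of the returned slot lands on the
  // newly committed pages, and slots_to_provision below evaluates to
  // (20480 - 15360) / 3072 = 1, i.e. just the slot being returned.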
948 
949   // The slot being returned is considered allocated.
950   slot_span->num_allocated_slots++;
951   // Round down, because a slot that doesn't fully fit in the new page(s) isn't
952   // provisioned.
953   size_t slots_to_provision = (commit_end - return_slot) / slot_size;
954   slot_span->num_unprovisioned_slots -= slots_to_provision;
955   PA_DCHECK(slot_span->num_allocated_slots +
956                 slot_span->num_unprovisioned_slots <=
957             get_slots_per_span());
958 
959   // If lazy commit is enabled, meaning system pages in the slot span come
960   // in an initially decommitted state, commit them here.
961   // Note, we can't use PageAccessibilityDisposition::kAllowKeepForPerf, because
962   // we have no knowledge of which pages have been committed before (it doesn't
963   // matter on Windows anyway).
964   if (kUseLazyCommit) {
965     // TODO(lizeb): Handle commit failure.
966     root->RecommitSystemPagesForData(
967         commit_start, commit_end - commit_start,
968         PageAccessibilityDisposition::kRequireUpdate);
969   }
970 
971   if (PA_LIKELY(slot_size <= kMaxMemoryTaggingSize &&
972                 root->IsMemoryTaggingEnabled())) {
973     // Ensure the MTE-tag of the memory pointed by |return_slot| is unguessable.
974     TagMemoryRangeRandomly(return_slot, slot_size);
975   }
976 
977   // Add all slots that fit within so far committed pages to the free list.
978   PartitionFreelistEntry* prev_entry = nullptr;
979   uintptr_t next_slot_end = next_slot + slot_size;
980   size_t free_list_entries_added = 0;
981   while (next_slot_end <= commit_end) {
982     void* next_slot_ptr;
983     if (PA_LIKELY(slot_size <= kMaxMemoryTaggingSize)) {
984       // Ensure the MTE-tags of the memory pointed to by the other provisioned
985       // slots are unguessable. They will be returned to the app as is, and the
986       // MTE-tag will only change upon calling Free().
987       next_slot_ptr = TagMemoryRangeRandomly(next_slot, slot_size);
988     } else {
989       // No MTE-tagging for larger slots, just cast.
990       next_slot_ptr = reinterpret_cast<void*>(next_slot);
991     }
992     auto* entry = PartitionFreelistEntry::EmplaceAndInitNull(next_slot_ptr);
993     if (!slot_span->get_freelist_head()) {
994       PA_DCHECK(!prev_entry);
995       PA_DCHECK(!free_list_entries_added);
996       slot_span->SetFreelistHead(entry);
997     } else {
998       PA_DCHECK(free_list_entries_added);
999       prev_entry->SetNext(entry);
1000     }
1001 #if BUILDFLAG(USE_FREESLOT_BITMAP)
1002     FreeSlotBitmapMarkSlotAsFree(next_slot);
1003 #endif
1004     next_slot = next_slot_end;
1005     next_slot_end = next_slot + slot_size;
1006     prev_entry = entry;
1007 #if BUILDFLAG(PA_DCHECK_IS_ON)
1008     free_list_entries_added++;
1009 #endif
1010   }
1011 
1012 #if BUILDFLAG(USE_FREESLOT_BITMAP)
1013   FreeSlotBitmapMarkSlotAsFree(return_slot);
1014 #endif
1015 
1016 #if BUILDFLAG(PA_DCHECK_IS_ON)
1017   // The only provisioned slot not added to the free list is the one being
1018   // returned.
1019   PA_DCHECK(slots_to_provision == free_list_entries_added + 1);
1020   // We didn't necessarily provision more than one slot (e.g. if |slot_size|
1021   // is large), meaning that |slot_span->freelist_head| can be nullptr.
1022   if (slot_span->get_freelist_head()) {
1023     PA_DCHECK(free_list_entries_added);
1024     slot_span->get_freelist_head()->CheckFreeList(slot_size);
1025   }
1026 #endif
1027 
1028   // We had no free slots, and created some (potentially 0) in sorted order.
1029   slot_span->set_freelist_sorted();
1030 
1031   return return_slot;
1032 }
1033 
1034 template <bool thread_safe>
1035 bool PartitionBucket<thread_safe>::SetNewActiveSlotSpan() {
1036   SlotSpanMetadata<thread_safe>* slot_span = active_slot_spans_head;
1037   if (slot_span == SlotSpanMetadata<thread_safe>::get_sentinel_slot_span()) {
1038     return false;
1039   }
1040 
1041   SlotSpanMetadata<thread_safe>* next_slot_span;
1042 
1043   // The goal here is to find a suitable slot span in the active list. Suitable
1044   // slot spans are |is_active()|, i.e. they either have (a) freelist entries,
1045   // or (b) unprovisioned free space. The first case is preferable, since it
1046   // doesn't cost a system call, and doesn't cause new memory to become dirty.
1047   //
1048   // While looking for a new slot span, active list maintenance is performed,
1049   // that is:
1050   // - Empty and decommitted slot spans are moved to their respective lists.
1051   // - Full slot spans are removed from the active list but are not moved
1052   //   anywhere. They could be tracked in a separate list, but this would
1053   //   increase cost non trivially. Indeed, a full slot span is likely to become
1054   //   non-full at some point (due to a free() hitting it). Since we only have
1055   //   space in the metadata for a single linked list pointer, removing the
1056   //   newly-non-full slot span from the "full" list would require walking it
1057   //   (to know what's before it in the full list).
1058   //
1059   // Since we prefer slot spans with provisioned freelist entries, maintenance
1060   // happens in two stages:
1061   // 1. Walk the list to find candidates. Each of the skipped slot spans is
1062   //    moved to either:
1063   //   - one of the long-lived lists: empty, decommitted
1064   //   - the temporary "active slot spans with no freelist entry" list
1065   //   - Nowhere for full slot spans.
1066   // 2. Once we have a candidate:
1067   //   - Set it as the new active list head
1068   //   - Reattach the temporary list
1069   //
1070   // Note that in most cases, the whole list will not be walked and maintained
1071   // at this stage.
1072 
1073   SlotSpanMetadata<thread_safe>* to_provision_head = nullptr;
1074   SlotSpanMetadata<thread_safe>* to_provision_tail = nullptr;
1075 
1076   for (; slot_span; slot_span = next_slot_span) {
1077     next_slot_span = slot_span->next_slot_span;
1078     PA_DCHECK(slot_span->bucket == this);
1079     PA_DCHECK(slot_span != empty_slot_spans_head);
1080     PA_DCHECK(slot_span != decommitted_slot_spans_head);
1081 
1082     if (slot_span->is_active()) {
1083       // Has provisioned slots.
1084       if (slot_span->get_freelist_head()) {
1085         // Will use this slot span, no need to go further.
1086         break;
1087       } else {
1088         // Keeping head and tail because we don't want to reverse the list.
1089         if (!to_provision_head) {
1090           to_provision_head = slot_span;
1091         }
1092         if (to_provision_tail) {
1093           to_provision_tail->next_slot_span = slot_span;
1094         }
1095         to_provision_tail = slot_span;
1096         slot_span->next_slot_span = nullptr;
1097       }
1098     } else if (slot_span->is_empty()) {
1099       slot_span->next_slot_span = empty_slot_spans_head;
1100       empty_slot_spans_head = slot_span;
1101     } else if (PA_LIKELY(slot_span->is_decommitted())) {
1102       slot_span->next_slot_span = decommitted_slot_spans_head;
1103       decommitted_slot_spans_head = slot_span;
1104     } else {
1105       PA_DCHECK(slot_span->is_full());
1106       // Move this slot span... nowhere, and also mark it as full. We need it
1107       // marked so that free'ing can tell, and move it back into the active
1108       // list.
1109       slot_span->marked_full = 1;
1110       ++num_full_slot_spans;
1111       // Overflow. Most likely a correctness issue in the code.  It is in theory
1112       // possible that the number of full slot spans really reaches (1 << 24),
1113       // but this is very unlikely (and not possible with most pool settings).
1114       PA_CHECK(num_full_slot_spans);
1115       // Not necessary but might help stop accidents.
1116       slot_span->next_slot_span = nullptr;
1117     }
1118   }
1119 
1120   bool usable_active_list_head = false;
1121   // Found an active slot span with provisioned entries on the freelist.
1122   if (slot_span) {
1123     usable_active_list_head = true;
1124     // We have active slot spans with unprovisioned entries. Re-attach them into
1125     // the active list, past the span with freelist entries.
1126     if (to_provision_head) {
1127       auto* next = slot_span->next_slot_span;
1128       slot_span->next_slot_span = to_provision_head;
1129       to_provision_tail->next_slot_span = next;
1130     }
1131     active_slot_spans_head = slot_span;
1132   } else if (to_provision_head) {
1133     usable_active_list_head = true;
1134     // Need to provision new slots.
1135     active_slot_spans_head = to_provision_head;
1136   } else {
1137     // Active list is now empty.
1138     active_slot_spans_head =
1139         SlotSpanMetadata<thread_safe>::get_sentinel_slot_span_non_const();
1140   }
1141 
1142   return usable_active_list_head;
1143 }
1144 
1145 template <bool thread_safe>
1146 void PartitionBucket<thread_safe>::MaintainActiveList() {
1147   SlotSpanMetadata<thread_safe>* slot_span = active_slot_spans_head;
1148   if (slot_span == SlotSpanMetadata<thread_safe>::get_sentinel_slot_span()) {
1149     return;
1150   }
1151 
1152   SlotSpanMetadata<thread_safe>* new_active_slot_spans_head = nullptr;
1153   SlotSpanMetadata<thread_safe>* new_active_slot_spans_tail = nullptr;
1154 
1155   SlotSpanMetadata<thread_safe>* next_slot_span;
1156   for (; slot_span; slot_span = next_slot_span) {
1157     next_slot_span = slot_span->next_slot_span;
1158 
1159     if (slot_span->is_active()) {
1160       // Ordering in the active slot span list matters, don't reverse it.
1161       if (!new_active_slot_spans_head) {
1162         new_active_slot_spans_head = slot_span;
1163       }
1164       if (new_active_slot_spans_tail) {
1165         new_active_slot_spans_tail->next_slot_span = slot_span;
1166       }
1167       new_active_slot_spans_tail = slot_span;
1168       slot_span->next_slot_span = nullptr;
1169     } else if (slot_span->is_empty()) {
1170       // For the empty and decommitted lists, LIFO ordering makes sense (since
1171       // it would lead to reusing memory which has been touched relatively
1172       // recently, which only matters for committed spans though).
1173       slot_span->next_slot_span = empty_slot_spans_head;
1174       empty_slot_spans_head = slot_span;
1175     } else if (slot_span->is_decommitted()) {
1176       slot_span->next_slot_span = decommitted_slot_spans_head;
1177       decommitted_slot_spans_head = slot_span;
1178     } else {
1179       // Full slot spans are not tracked, just accounted for.
1180       PA_DCHECK(slot_span->is_full());
1181       slot_span->marked_full = 1;
1182       ++num_full_slot_spans;
1183       PA_CHECK(num_full_slot_spans);  // Overflow.
1184       slot_span->next_slot_span = nullptr;
1185     }
1186   }
1187 
1188   if (!new_active_slot_spans_head) {
1189     new_active_slot_spans_head =
1190         SlotSpanMetadata<thread_safe>::get_sentinel_slot_span_non_const();
1191   }
1192   active_slot_spans_head = new_active_slot_spans_head;
1193 }
1194 
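// Sorts the freelist of each active slot span that has allocated slots and an
// unsorted freelist. See the comment inside the loop for why already-sorted
// freelists are skipped.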
template <bool thread_safe>
void PartitionBucket<thread_safe>::SortSlotSpanFreelists() {
  for (auto* slot_span = active_slot_spans_head; slot_span;
       slot_span = slot_span->next_slot_span) {
    // No need to sort the freelist if it's already sorted. Note that if the
    // freelist is sorted, this means that it didn't change at all since the
    // last call. This may be a good signal to shrink it if possible (if an
    // entire OS page is free, we can decommit it).
    //
    // Besides saving CPU, this also avoids touching memory of fully idle slot
    // spans, which may require paging.
    if (slot_span->num_allocated_slots > 0 &&
        !slot_span->freelist_is_sorted()) {
      slot_span->SortFreelist();
    }
  }
}

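// Comparator implementing the lexicographic criteria documented below. As an
// illustration, given hypothetical spans A{freelist=0, unprovisioned=4},
// B{freelist=2, unprovisioned=0} and C{freelist=2, unprovisioned=6}, the
// resulting order is B, C, A: spans with a non-empty freelist come first, ties
// are broken by fewer freelist entries, then by fewer unprovisioned slots.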
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
bool CompareSlotSpans(SlotSpanMetadata<ThreadSafe>* a,
                      SlotSpanMetadata<ThreadSafe>* b) {
  auto criteria_tuple = [](SlotSpanMetadata<ThreadSafe> const* a) {
    size_t freelist_length = a->GetFreelistLength();
    // The criteria are, in order (hence the lexicographic comparison below):
    // 1. Prefer slot spans with freelist entries. The ones without freelist
    //    entries would be skipped in SetNewActiveSlotSpan() anyway.
    // 2. Then the ones with the fewest freelist entries. They are either close
    //    to being full (for the provisioned memory), or close to being pushed
    //    at the end of the list (since they would not have freelist entries
    //    anymore, and would either fall into the first case, or be skipped by
    //    SetNewActiveSlotSpan()).
    // 3. The ones with the fewest unprovisioned slots, meaning that they are
    //    close to being completely full.
    //
    // Note that this sorting order is not necessarily the best one when slot
    // spans are partially provisioned. From local testing, in steady-state,
    // most slot spans are entirely provisioned (or decommitted), which may be a
    // consequence of the lack of partial slot span decommit, or of fairly
    // effective fragmentation avoidance heuristics. Make sure to evaluate
    // whether an alternative sorting order (sorting according to freelist size
    // + unprovisioned slots) makes more sense.
    return std::tuple<bool, size_t, size_t>{
        freelist_length == 0, freelist_length, a->num_unprovisioned_slots};
  };

  return criteria_tuple(a) < criteria_tuple(b);
}

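// Sorts at most |kMaxSlotSpansToSort| active slot spans using
// CompareSlotSpans(); any remaining spans are left, unsorted, at the tail of
// the list.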
template <bool thread_safe>
void PartitionBucket<thread_safe>::SortActiveSlotSpans() {
  // Sorting up to |kMaxSlotSpansToSort| slot spans. This is capped for two
  // reasons:
  // - Limiting execution time.
  // - The code below cannot allocate memory, hence the fixed-size array.
  //
  // In practice though, it's rare to have that many active slot spans.
  SlotSpanMetadata<thread_safe>* active_spans_array[kMaxSlotSpansToSort];
  size_t index = 0;
  SlotSpanMetadata<thread_safe>* overflow_spans_start = nullptr;

  for (auto* slot_span = active_slot_spans_head; slot_span;
       slot_span = slot_span->next_slot_span) {
    if (index < kMaxSlotSpansToSort) {
      active_spans_array[index++] = slot_span;
    } else {
      // Starting from this one, the slot spans are not sorted.
      overflow_spans_start = slot_span;
      break;
    }
  }

  // We sort the active slot spans so that allocations are preferably serviced
  // from the fullest ones. This way we hope to reduce fragmentation by
  // concentrating allocations in as few slot spans as possible.
  //
  // With perfect information on allocation lifespan, we would be able to pack
  // allocations and get almost no fragmentation. This is obviously not the
  // case, so we have partially full SlotSpans. Nevertheless, as a heuristic we
  // want to:
  // - Keep almost-empty slot spans as empty as possible
  // - Keep mostly-full slot spans as full as possible
  //
  // The first part is done in the hope that future free()s will make these
  // slot spans completely empty, allowing us to reclaim them. To that end, sort
  // SlotSpans periodically so that the fullest ones are preferred.
  //
  // std::sort() is not completely guaranteed to never allocate memory. However,
  // it is not allowed to throw std::bad_alloc, which constrains the
  // implementation. In addition, this is protected by the reentrancy guard, so
  // we would detect such an allocation.
  std::sort(active_spans_array, active_spans_array + index, CompareSlotSpans);

  active_slot_spans_head = overflow_spans_start;

  // Reverse order, since we insert at the head of the list.
  for (int i = index - 1; i >= 0; i--) {
    if (active_spans_array[i] ==
        SlotSpanMetadata<thread_safe>::get_sentinel_slot_span()) {
      // The sentinel is const, don't try to write to it.
      PA_DCHECK(active_slot_spans_head == nullptr);
    } else {
      active_spans_array[i]->next_slot_span = active_slot_spans_head;
    }
    active_slot_spans_head = active_spans_array[i];
  }
}

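// Slow path of allocation: tries, in order, (1) direct map when |this| is the
// sentinel bucket, (2) an active slot span found by SetNewActiveSlotSpan(),
// (3) an empty or decommitted slot span, and finally (4) a brand new slot
// span. A slot is then returned from the chosen span, provisioning more slots
// first if its freelist is empty.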
template <bool thread_safe>
uintptr_t PartitionBucket<thread_safe>::SlowPathAlloc(
    PartitionRoot<thread_safe>* root,
    unsigned int flags,
    size_t raw_size,
    size_t slot_span_alignment,
    bool* is_already_zeroed) {
  PA_DCHECK((slot_span_alignment >= PartitionPageSize()) &&
            base::bits::IsPowerOfTwo(slot_span_alignment));

  // The slow path is called when the freelist is empty. The only exception is
  // when a higher-order alignment is requested, in which case the freelist
  // logic is bypassed and we go directly for slot span allocation.
  bool allocate_aligned_slot_span = slot_span_alignment > PartitionPageSize();
  PA_DCHECK(!active_slot_spans_head->get_freelist_head() ||
            allocate_aligned_slot_span);

  SlotSpanMetadata<thread_safe>* new_slot_span = nullptr;
  // |new_slot_span->bucket| will always be |this|, except when |this| is the
  // sentinel bucket, which is used to signal a direct mapped allocation. In
  // this case |new_bucket| will be set properly later. This avoids a read for
  // most allocations.
  PartitionBucket* new_bucket = this;
  *is_already_zeroed = false;

  // For the PartitionRoot::Alloc() API, we have a bunch of buckets
  // marked as special cases. We bounce them through to the slow path so that
  // we can still have a blazing fast hot path due to lack of corner-case
  // branches.
  //
  // Note: The ordering of the conditionals matters! In particular,
  // SetNewActiveSlotSpan() has a side-effect even when returning false: it
  // sweeps the active list and may move things into the empty or decommitted
  // lists, which affects the subsequent conditional.
  if (PA_UNLIKELY(is_direct_mapped())) {
    PA_DCHECK(raw_size > kMaxBucketed);
    PA_DCHECK(this == &root->sentinel_bucket);
    PA_DCHECK(active_slot_spans_head ==
              SlotSpanMetadata<thread_safe>::get_sentinel_slot_span());

    // No fast path for direct-mapped allocations.
    if (flags & AllocFlags::kFastPathOrReturnNull) {
      return 0;
    }

    new_slot_span =
        PartitionDirectMap(root, flags, raw_size, slot_span_alignment);
    if (new_slot_span) {
      new_bucket = new_slot_span->bucket;
    }
    // Memory from PageAllocator is always zeroed.
    *is_already_zeroed = true;
  } else if (PA_LIKELY(!allocate_aligned_slot_span && SetNewActiveSlotSpan())) {
    // First, did we find an active slot span in the active list?
    new_slot_span = active_slot_spans_head;
    PA_DCHECK(new_slot_span->is_active());
  } else if (PA_LIKELY(!allocate_aligned_slot_span &&
                       (empty_slot_spans_head != nullptr ||
                        decommitted_slot_spans_head != nullptr))) {
    // Second, look in our lists of empty and decommitted slot spans.
    // Check empty slot spans first, which are preferred, but beware that an
    // empty slot span might have been decommitted.
    while (PA_LIKELY((new_slot_span = empty_slot_spans_head) != nullptr)) {
      PA_DCHECK(new_slot_span->bucket == this);
      PA_DCHECK(new_slot_span->is_empty() || new_slot_span->is_decommitted());
      empty_slot_spans_head = new_slot_span->next_slot_span;
      // Accept the empty slot span unless it got decommitted.
      if (new_slot_span->get_freelist_head()) {
        new_slot_span->next_slot_span = nullptr;
        new_slot_span->ToSuperPageExtent()
            ->IncrementNumberOfNonemptySlotSpans();

        // Re-activating an empty slot span, update accounting.
        size_t dirty_size = base::bits::AlignUp(
            new_slot_span->GetProvisionedSize(), SystemPageSize());
        PA_DCHECK(root->empty_slot_spans_dirty_bytes >= dirty_size);
        root->empty_slot_spans_dirty_bytes -= dirty_size;

        break;
      }
      PA_DCHECK(new_slot_span->is_decommitted());
      new_slot_span->next_slot_span = decommitted_slot_spans_head;
      decommitted_slot_spans_head = new_slot_span;
    }
    if (PA_UNLIKELY(!new_slot_span) &&
        PA_LIKELY(decommitted_slot_spans_head != nullptr)) {
      // Commit can be expensive, don't do it.
      if (flags & AllocFlags::kFastPathOrReturnNull) {
        return 0;
      }

      new_slot_span = decommitted_slot_spans_head;
      PA_DCHECK(new_slot_span->bucket == this);
      PA_DCHECK(new_slot_span->is_decommitted());
      decommitted_slot_spans_head = new_slot_span->next_slot_span;

      // If lazy commit is enabled, pages will be recommitted when provisioning
      // slots, in ProvisionMoreSlotsAndAllocOne(), not here.
      if (!kUseLazyCommit) {
        uintptr_t slot_span_start =
            SlotSpanMetadata<thread_safe>::ToSlotSpanStart(new_slot_span);
        // Since lazy commit isn't used, we have a guarantee that all slot span
        // pages have been previously committed, and then decommitted using
        // PageAccessibilityDisposition::kAllowKeepForPerf, so use the
        // same option as an optimization.
        // TODO(lizeb): Handle commit failure.
        root->RecommitSystemPagesForData(
            slot_span_start, new_slot_span->bucket->get_bytes_per_span(),
            PageAccessibilityDisposition::kAllowKeepForPerf);
      }

      new_slot_span->Reset();
      *is_already_zeroed = DecommittedMemoryIsAlwaysZeroed();
    }
    PA_DCHECK(new_slot_span);
  } else {
    // Getting a new slot span is expensive, don't do it.
    if (flags & AllocFlags::kFastPathOrReturnNull) {
      return 0;
    }

    // Third. If we get here, we need a brand new slot span.
    // TODO(bartekn): For single-slot slot spans, we can use rounded raw_size
    // as slot_span_committed_size.
    new_slot_span = AllocNewSlotSpan(root, flags, slot_span_alignment);
    // New memory from PageAllocator is always zeroed.
    *is_already_zeroed = true;
  }

  // Bail if we had a memory allocation failure.
  if (PA_UNLIKELY(!new_slot_span)) {
    PA_DCHECK(active_slot_spans_head ==
              SlotSpanMetadata<thread_safe>::get_sentinel_slot_span());
    if (flags & AllocFlags::kReturnNull) {
      return 0;
    }
    // See comment in PartitionDirectMap() for unlocking.
    ScopedUnlockGuard unlock{root->lock_};
    root->OutOfMemory(raw_size);
    PA_IMMEDIATE_CRASH();  // Not required, kept as documentation.
  }

  PA_DCHECK(new_bucket != &root->sentinel_bucket);
  new_bucket->active_slot_spans_head = new_slot_span;
  if (new_slot_span->CanStoreRawSize()) {
    new_slot_span->SetRawSize(raw_size);
  }

  // If we found an active slot span with free slots, or an empty slot span, we
  // have a usable freelist head.
  if (PA_LIKELY(new_slot_span->get_freelist_head() != nullptr)) {
    PartitionFreelistEntry* entry =
        new_slot_span->PopForAlloc(new_bucket->slot_size);

    // We may have set *is_already_zeroed to true above; make sure that the
    // freelist entry doesn't contain data. Either way, it wouldn't be a good
    // idea to let users see our internal data.
    uintptr_t slot_start = entry->ClearForAllocation();
    return slot_start;
  }

  // Otherwise, we need to provision more slots by committing more pages. Build
  // the free list for the newly provisioned slots.
  PA_DCHECK(new_slot_span->num_unprovisioned_slots);
  return ProvisionMoreSlotsAndAllocOne(root, new_slot_span);
}

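// Thin wrapper exposing AllocNewSuperPageSpan() to GWP-ASan support code.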
template <bool thread_safe>
uintptr_t PartitionBucket<thread_safe>::AllocNewSuperPageSpanForGwpAsan(
    PartitionRoot<thread_safe>* root,
    size_t super_page_count,
    unsigned int flags) {
  return AllocNewSuperPageSpan(root, super_page_count, flags);
}

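// Thin wrapper exposing InitializeSlotSpan() to GWP-ASan support code.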
template <bool thread_safe>
void PartitionBucket<thread_safe>::InitializeSlotSpanForGwpAsan(
    SlotSpanMetadata<thread_safe>* slot_span) {
  InitializeSlotSpan(slot_span);
}

template struct PartitionBucket<ThreadSafe>;

}  // namespace partition_alloc::internal