1 // Copyright 2018 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "partition_alloc/partition_bucket.h"
6 
7 #include <algorithm>
8 #include <bit>
9 #include <cstdint>
10 #include <tuple>
11 
12 #include "build/build_config.h"
13 #include "partition_alloc/address_pool_manager.h"
14 #include "partition_alloc/freeslot_bitmap.h"
15 #include "partition_alloc/freeslot_bitmap_constants.h"
16 #include "partition_alloc/oom.h"
17 #include "partition_alloc/page_allocator.h"
18 #include "partition_alloc/page_allocator_constants.h"
19 #include "partition_alloc/partition_address_space.h"
20 #include "partition_alloc/partition_alloc.h"
21 #include "partition_alloc/partition_alloc_base/bits.h"
22 #include "partition_alloc/partition_alloc_base/compiler_specific.h"
23 #include "partition_alloc/partition_alloc_base/component_export.h"
24 #include "partition_alloc/partition_alloc_base/debug/alias.h"
25 #include "partition_alloc/partition_alloc_base/debug/debugging_buildflags.h"
26 #include "partition_alloc/partition_alloc_base/immediate_crash.h"
27 #include "partition_alloc/partition_alloc_base/thread_annotations.h"
28 #include "partition_alloc/partition_alloc_buildflags.h"
29 #include "partition_alloc/partition_alloc_check.h"
30 #include "partition_alloc/partition_alloc_config.h"
31 #include "partition_alloc/partition_alloc_constants.h"
32 #include "partition_alloc/partition_alloc_forward.h"
33 #include "partition_alloc/partition_direct_map_extent.h"
34 #include "partition_alloc/partition_oom.h"
35 #include "partition_alloc/partition_page.h"
36 #include "partition_alloc/reservation_offset_table.h"
37 #include "partition_alloc/tagging.h"
38 
39 #if BUILDFLAG(USE_STARSCAN)
40 #include "partition_alloc/starscan/pcscan.h"
41 #endif
42 
43 namespace partition_alloc::internal {
44 
45 namespace {
46 
47 #if PA_CONFIG(ENABLE_SHADOW_METADATA)
48 PA_ALWAYS_INLINE uintptr_t ShadowMetadataStart(uintptr_t super_page,
49                                                pool_handle pool) {
50   uintptr_t shadow_metadata_start =
51       super_page + SystemPageSize() + ShadowPoolOffset(pool);
52   PA_DCHECK(!PartitionAddressSpace::IsInRegularPool(shadow_metadata_start));
53   PA_DCHECK(!PartitionAddressSpace::IsInBRPPool(shadow_metadata_start));
54   return shadow_metadata_start;
55 }
56 #endif
57 
58 [[noreturn]] PA_NOINLINE void PartitionOutOfMemoryMappingFailure(
59     PartitionRoot* root,
60     size_t size) PA_LOCKS_EXCLUDED(PartitionRootLock(root)) {
61   PA_NO_CODE_FOLDING();
62   root->OutOfMemory(size);
63   PA_IMMEDIATE_CRASH();  // Not required, kept as documentation.
64 }
65 
66 [[noreturn]] PA_NOINLINE void PartitionOutOfMemoryCommitFailure(
67     PartitionRoot* root,
68     size_t size) PA_LOCKS_EXCLUDED(PartitionRootLock(root)) {
69   PA_NO_CODE_FOLDING();
70   root->OutOfMemory(size);
71   PA_IMMEDIATE_CRASH();  // Not required, kept as documentation.
72 }
73 
74 #if !BUILDFLAG(HAS_64_BIT_POINTERS) && BUILDFLAG(ENABLE_BACKUP_REF_PTR_SUPPORT)
75 // |start| has to be aligned to kSuperPageSize, but |end| doesn't. This means
76 // that a partial super page is allowed at the end. Since the block list uses
77 // kSuperPageSize granularity, a partial super page is considered blocked if
78 // there is a raw_ptr<T> pointing anywhere in that super page, even if it
79 // doesn't point to that partially allocated region.
80 bool AreAllowedSuperPagesForBRPPool(uintptr_t start, uintptr_t end) {
81   PA_DCHECK(!(start % kSuperPageSize));
82   for (uintptr_t super_page = start; super_page < end;
83        super_page += kSuperPageSize) {
84     // If any blocked super page is found inside the given memory region,
85     // the memory region is blocked.
86     if (!AddressPoolManagerBitmap::IsAllowedSuperPageForBRPPool(super_page)) {
87       AddressPoolManagerBitmap::IncrementBlocklistHitCount();
88       return false;
89     }
90   }
91   return true;
92 }
93 #endif  // !BUILDFLAG(HAS_64_BIT_POINTERS) &&
94         // BUILDFLAG(ENABLE_BACKUP_REF_PTR_SUPPORT)
95 
96 // Reserves |requested_size| worth of super pages from the specified pool.
97 // If the BRP pool is requested, this function will honor the BRP block list.
98 //
99 // The returned address will be aligned to kSuperPageSize, and so
100 // |requested_address| should be. |requested_size| doesn't have to be, however.
101 //
102 // |requested_address| is merely a hint, which will be attempted, but easily
103 // given up on if it doesn't work the first time.
104 //
105 // The function doesn't need to hold root->lock_ or any other locks, because:
106 // - It (1) reserves memory, (2) then consults AreAllowedSuperPagesForBRPPool
107 //   for that memory, and (3) returns the memory if
108 //   allowed, or unreserves and decommits if not allowed. So no other
109 //   overlapping region can be allocated while executing
110 //   AreAllowedSuperPagesForBRPPool.
111 // - IsAllowedSuperPageForBRPPool (used by AreAllowedSuperPagesForBRPPool) is
112 //   designed to not need locking.
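//
// A hedged illustration, assuming the typical 2 MiB kSuperPageSize: a 3 MiB
// |requested_size| spans one full super page plus part of a second one. The
// blocklist check below still operates at whole-super-page granularity, so
// both super pages must be unblocked for the reservation to be kept.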
113 uintptr_t ReserveMemoryFromPool(pool_handle pool,
114                                 uintptr_t requested_address,
115                                 size_t requested_size) {
116   PA_DCHECK(!(requested_address % kSuperPageSize));
117 
118   uintptr_t reserved_address = AddressPoolManager::GetInstance().Reserve(
119       pool, requested_address, requested_size);
120 
121   // In 32-bit mode, when allocating from BRP pool, verify that the requested
122   // allocation honors the block list. Find a better address otherwise.
123 #if !BUILDFLAG(HAS_64_BIT_POINTERS) && BUILDFLAG(ENABLE_BACKUP_REF_PTR_SUPPORT)
124   if (pool == kBRPPoolHandle) {
125     constexpr int kMaxRandomAddressTries = 10;
126     for (int i = 0; i < kMaxRandomAddressTries; ++i) {
127       if (!reserved_address ||
128           AreAllowedSuperPagesForBRPPool(reserved_address,
129                                          reserved_address + requested_size)) {
130         break;
131       }
132       AddressPoolManager::GetInstance().UnreserveAndDecommit(
133           pool, reserved_address, requested_size);
134       // No longer try to honor |requested_address|, because it didn't work for
135       // us last time.
136       reserved_address =
137           AddressPoolManager::GetInstance().Reserve(pool, 0, requested_size);
138     }
139 
140     // If the allocation attempt succeeds, we will break out of the following
141     // loop immediately.
142     //
143     // Last resort: sequentially scan the whole 32-bit address space. The number
144     // of blocked super-pages should be very small, so we expect to practically
145     // never need to run the following code. Note that it may fail to find an
146     // available super page, e.g., when it becomes available after the scan
147     // passes through it, but we accept the risk.
148     for (uintptr_t address_to_try = kSuperPageSize; address_to_try != 0;
149          address_to_try += kSuperPageSize) {
150       if (!reserved_address ||
151           AreAllowedSuperPagesForBRPPool(reserved_address,
152                                          reserved_address + requested_size)) {
153         break;
154       }
155       AddressPoolManager::GetInstance().UnreserveAndDecommit(
156           pool, reserved_address, requested_size);
157       // Reserve() can return a different pointer than attempted.
158       reserved_address = AddressPoolManager::GetInstance().Reserve(
159           pool, address_to_try, requested_size);
160     }
161 
162     // If the loop ends naturally, the last allocated region hasn't been
163     // verified. Do it now.
164     if (reserved_address &&
165         !AreAllowedSuperPagesForBRPPool(reserved_address,
166                                         reserved_address + requested_size)) {
167       AddressPoolManager::GetInstance().UnreserveAndDecommit(
168           pool, reserved_address, requested_size);
169       reserved_address = 0;
170     }
171   }
172 #endif  // !BUILDFLAG(HAS_64_BIT_POINTERS) &&
173         // BUILDFLAG(ENABLE_BACKUP_REF_PTR_SUPPORT)
174 
175 #if !BUILDFLAG(HAS_64_BIT_POINTERS)
176   // Only mark the region as belonging to the pool after it has passed the
177   // blocklist check in order to avoid a potential race with destructing a
178   // raw_ptr<T> object that points to non-PA memory in another thread.
179   // If `MarkUsed` was called earlier, the other thread could incorrectly
180 // determine that the allocation had come from PartitionAlloc.
181   if (reserved_address) {
182     AddressPoolManager::GetInstance().MarkUsed(pool, reserved_address,
183                                                requested_size);
184   }
185 #endif
186 
187   PA_DCHECK(!(reserved_address % kSuperPageSize));
188   return reserved_address;
189 }
190 
191 SlotSpanMetadata* PartitionDirectMap(PartitionRoot* root,
192                                      AllocFlags flags,
193                                      size_t raw_size,
194                                      size_t slot_span_alignment) {
195   PA_DCHECK((slot_span_alignment >= PartitionPageSize()) &&
196             std::has_single_bit(slot_span_alignment));
197 
198   // No static EXCLUSIVE_LOCKS_REQUIRED(), as the checker doesn't understand
199   // scoped unlocking.
200   PartitionRootLock(root).AssertAcquired();
201 
202   const bool return_null = ContainsFlags(flags, AllocFlags::kReturnNull);
203   if (PA_UNLIKELY(raw_size > MaxDirectMapped())) {
204     if (return_null) {
205       return nullptr;
206     }
207 
208     // The lock is here to protect PA from:
209     // 1. Concurrent calls
210     // 2. Reentrant calls
211     //
212     // This is fine here however, as:
213     // 1. Concurrency: |PartitionRoot::OutOfMemory()| never returns, so the lock
214     //    will not be re-acquired, which would lead to acting on inconsistent
215     //    data that could have been modified in-between releasing and acquiring
216     //    it.
217     // 2. Reentrancy: This is why we release the lock. On some platforms,
218     //    terminating the process may free() memory, or even possibly try to
219     //    allocate some. Calling free() is fine, but will deadlock since
220     //    |PartitionRoot::lock_| is not recursive.
221     //
222     // Supporting reentrant calls properly is hard, and not a requirement for
223     // PA. However up to that point, we've only *read* data, not *written* to
224     // any state. Reentrant calls are then fine, especially as we don't continue
225     // on this path. The only downside is possibly endless recursion if the OOM
226     // handler allocates and fails to use UncheckedMalloc() or equivalent, but
227     // that's violating the contract of base::TerminateBecauseOutOfMemory().
228     ScopedUnlockGuard unlock{PartitionRootLock(root)};
229     PartitionExcessiveAllocationSize(raw_size);
230   }
231 
232   PartitionDirectMapExtent* map_extent = nullptr;
233   PartitionPage* page = nullptr;
234 
235   {
236     // Getting memory for direct-mapped allocations doesn't interact with the
237     // rest of the allocator, but takes a long time, as it involves several
238     // system calls. Although no mmap() (or equivalent) calls are made on
239     // 64 bit systems, page permissions are changed with mprotect(), which is
240     // a syscall.
241     //
242     // These calls are almost always slow (at least a couple us per syscall on a
243     // desktop Linux machine), and they also have a very long latency tail,
244     // possibly from getting descheduled. As a consequence, we should not hold
245     // the lock when performing a syscall. This is not the only problematic
246     // location, but since this one doesn't interact with the rest of the
247     // allocator, we can safely drop and then re-acquire the lock.
248     //
249     // Note that this only affects allocations that are not served out of the
250     // thread cache, but as a simple example the buffer partition in blink is
251     // frequently used for large allocations (e.g. ArrayBuffer), and frequent,
252     // small ones (e.g. WTF::String), and does not have a thread cache.
253     ScopedUnlockGuard scoped_unlock{PartitionRootLock(root)};
254 
255     const size_t slot_size = PartitionRoot::GetDirectMapSlotSize(raw_size);
256     // The super page starts with a partition page worth of metadata and guard
257     // pages, hence alignment requests ==PartitionPageSize() will be
258     // automatically satisfied. Padding is needed for higher-order alignment
259     // requests. Note, |slot_span_alignment| is at least 1 partition page.
260     const size_t padding_for_alignment =
261         slot_span_alignment - PartitionPageSize();
262     const size_t reservation_size = PartitionRoot::GetDirectMapReservationSize(
263         raw_size + padding_for_alignment);
264     PA_DCHECK(reservation_size >= raw_size);
265 #if BUILDFLAG(PA_DCHECK_IS_ON)
266     const size_t available_reservation_size =
267         reservation_size - padding_for_alignment -
268         PartitionRoot::GetDirectMapMetadataAndGuardPagesSize();
269     PA_DCHECK(slot_size <= available_reservation_size);
270 #endif
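    // A hedged illustration, assuming the typical 2 MiB super page and 16 KiB
    // partition page: for a raw_size of ~1 MiB with slot_span_alignment ==
    // PartitionPageSize(), |padding_for_alignment| is 0 and the reservation
    // covers the slot plus the leading metadata/guard partition page and the
    // trailing guard pages, rounded up to super-page granularity, i.e. a
    // single 2 MiB super page in this example.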
271 
272     pool_handle pool = root->ChoosePool();
273     uintptr_t reservation_start;
274     {
275       // Reserving memory from the pool is actually not a syscall on 64 bit
276       // platforms.
277 #if !BUILDFLAG(HAS_64_BIT_POINTERS)
278       ScopedSyscallTimer timer{root};
279 #endif
280       reservation_start = ReserveMemoryFromPool(pool, 0, reservation_size);
281     }
282     if (PA_UNLIKELY(!reservation_start)) {
283       if (return_null) {
284         return nullptr;
285       }
286 
287       PartitionOutOfMemoryMappingFailure(root, reservation_size);
288     }
289 
290     root->total_size_of_direct_mapped_pages.fetch_add(
291         reservation_size, std::memory_order_relaxed);
292 
293     // Shift by 1 partition page (metadata + guard pages) and alignment padding.
294     const uintptr_t slot_start =
295         reservation_start + PartitionPageSize() + padding_for_alignment;
296 
297     {
298       ScopedSyscallTimer timer{root};
299       RecommitSystemPages(reservation_start + SystemPageSize(),
300                           SystemPageSize(),
301 #if PA_CONFIG(ENABLE_SHADOW_METADATA)
302                           root->PageAccessibilityWithThreadIsolationIfEnabled(
303                               PageAccessibilityConfiguration::kRead),
304 #else
305                           root->PageAccessibilityWithThreadIsolationIfEnabled(
306                               PageAccessibilityConfiguration::kReadWrite),
307 #endif
308                           PageAccessibilityDisposition::kRequireUpdate);
309     }
310 
311 #if BUILDFLAG(PUT_REF_COUNT_IN_PREVIOUS_SLOT)
312     // If PUT_REF_COUNT_IN_PREVIOUS_SLOT is on, and if the BRP pool is
313     // used, allocate a SystemPage for RefCount "bitmap" (only one of its
314     // elements will be used).
315     if (pool == kBRPPoolHandle) {
316       ScopedSyscallTimer timer{root};
317       RecommitSystemPages(reservation_start + SystemPageSize() * 2,
318                           SystemPageSize(),
319                           root->PageAccessibilityWithThreadIsolationIfEnabled(
320                               PageAccessibilityConfiguration::kReadWrite),
321                           PageAccessibilityDisposition::kRequireUpdate);
322     }
323 #endif
324 
325 #if PA_CONFIG(ENABLE_SHADOW_METADATA)
326     {
327       ScopedSyscallTimer timer{root};
328       RecommitSystemPages(ShadowMetadataStart(reservation_start, pool),
329                           SystemPageSize(),
330                           root->PageAccessibilityWithThreadIsolationIfEnabled(
331                               PageAccessibilityConfiguration::kReadWrite),
332                           PageAccessibilityDisposition::kRequireUpdate);
333     }
334 #endif
335 
336     // No need to hold root->lock_. Now that memory is reserved, no other
337     // overlapping region can be allocated (because of how pools work),
338     // so no other thread can update the same offset table entries at the
339     // same time. Furthermore, nobody will be reading these offsets until this
340     // function returns.
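    // A hedged worked example, assuming the typical 2 MiB super page
    // (kSuperPageShift == 21): a 6 MiB reservation gives
    // offset_end == (6 MiB - 1) >> 21 == 2, so the loop below writes offsets
    // 0, 1 and 2. Any address inside the reservation can then be traced back
    // to |reservation_start| by subtracting offset * kSuperPageSize from the
    // base of its own super page.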
341     auto* offset_ptr = ReservationOffsetPointer(reservation_start);
342     [[maybe_unused]] const auto* offset_ptr_end =
343         GetReservationOffsetTableEnd(reservation_start);
344 
345     // |raw_size| > MaxBucketed(). So |reservation_size| > 0.
346     PA_DCHECK(reservation_size > 0);
347     const uint16_t offset_end = (reservation_size - 1) >> kSuperPageShift;
348     for (uint16_t offset = 0; offset <= offset_end; ++offset) {
349       PA_DCHECK(offset < kOffsetTagNormalBuckets);
350       PA_DCHECK(offset_ptr < offset_ptr_end);
351       *offset_ptr++ = offset;
352     }
353 
354     auto* super_page_extent = PartitionSuperPageToExtent(reservation_start);
355     super_page_extent->root = root;
356     // The new structures are all located inside a fresh system page so they
357     // will all be zeroed out. These DCHECKs are for documentation and to assert
358     // our expectations of the kernel.
359     PA_DCHECK(!super_page_extent->number_of_consecutive_super_pages);
360     PA_DCHECK(!super_page_extent->next);
361 
362     PartitionPage* first_page =
363         reinterpret_cast<PartitionPage*>(super_page_extent) + 1;
364     page = PartitionPage::FromAddr(slot_start);
365     // |first_page| and |page| may be equal, if there is no alignment padding.
366     if (page != first_page) {
367       PA_DCHECK(page > first_page);
368       PA_DCHECK(page - first_page <= PartitionPage::kMaxSlotSpanMetadataOffset);
369       PA_CHECK(!first_page->is_valid);
370       first_page->has_valid_span_after_this = true;
371       first_page->slot_span_metadata_offset = page - first_page;
372     }
373     auto* metadata = reinterpret_cast<PartitionDirectMapMetadata*>(page);
374     // Since direct map metadata is larger than PartitionPage, make sure the
375     // first and the last bytes are on the same system page, i.e. within the
376     // super page metadata region.
377     PA_DCHECK(base::bits::AlignDown(reinterpret_cast<uintptr_t>(metadata),
378                                     SystemPageSize()) ==
379               base::bits::AlignDown(reinterpret_cast<uintptr_t>(metadata) +
380                                         sizeof(PartitionDirectMapMetadata) - 1,
381                                     SystemPageSize()));
382     PA_DCHECK(page == &metadata->page);
383     page->is_valid = true;
384     PA_DCHECK(!page->has_valid_span_after_this);
385     PA_DCHECK(!page->slot_span_metadata_offset);
386     PA_DCHECK(!page->slot_span_metadata.next_slot_span);
387     PA_DCHECK(!page->slot_span_metadata.marked_full);
388     PA_DCHECK(!page->slot_span_metadata.num_allocated_slots);
389     PA_DCHECK(!page->slot_span_metadata.num_unprovisioned_slots);
390     PA_DCHECK(!page->slot_span_metadata.in_empty_cache());
391 
392     PA_DCHECK(!metadata->subsequent_page.subsequent_page_metadata.raw_size);
393     // Raw size is set later, by the caller.
394     metadata->subsequent_page.slot_span_metadata_offset = 1;
395 
396     PA_DCHECK(!metadata->bucket.active_slot_spans_head);
397     PA_DCHECK(!metadata->bucket.empty_slot_spans_head);
398     PA_DCHECK(!metadata->bucket.decommitted_slot_spans_head);
399     PA_DCHECK(!metadata->bucket.num_system_pages_per_slot_span);
400     PA_DCHECK(!metadata->bucket.num_full_slot_spans);
401     metadata->bucket.slot_size = slot_size;
402 
403     new (&page->slot_span_metadata) SlotSpanMetadata(&metadata->bucket);
404 
405     // It is typically possible to map a large range of inaccessible pages, and
406     // this is leveraged in multiple places, including the pools. However,
407     // this doesn't mean that we can commit all this memory.  For the vast
408     // majority of allocations, this just means that we crash in a slightly
409     // different place, but for callers ready to handle failures, we have to
410     // return nullptr. See crbug.com/1187404.
411     //
412     // Note that we didn't check above, because if we cannot even commit a
413     // single page, then this is likely hopeless anyway, and we will crash very
414     // soon.
415     //
416     // Direct map never uses tagging, as size is always >kMaxMemoryTaggingSize.
417     PA_DCHECK(raw_size > kMaxMemoryTaggingSize);
418     const bool ok = root->TryRecommitSystemPagesForDataWithAcquiringLock(
419         slot_start, slot_size, PageAccessibilityDisposition::kRequireUpdate,
420         false);
421     if (!ok) {
422       if (!return_null) {
423         PartitionOutOfMemoryCommitFailure(root, slot_size);
424       }
425 
426       {
427         ScopedSyscallTimer timer{root};
428 #if !BUILDFLAG(HAS_64_BIT_POINTERS)
429         AddressPoolManager::GetInstance().MarkUnused(pool, reservation_start,
430                                                      reservation_size);
431 #endif
432         AddressPoolManager::GetInstance().UnreserveAndDecommit(
433             pool, reservation_start, reservation_size);
434       }
435 
436       root->total_size_of_direct_mapped_pages.fetch_sub(
437           reservation_size, std::memory_order_relaxed);
438 
439       return nullptr;
440     }
441 
442     auto* next_entry = EncodedNextFreelistEntry::EmplaceAndInitNull(slot_start);
443     page->slot_span_metadata.SetFreelistHead(next_entry);
444 
445     map_extent = &metadata->direct_map_extent;
446     map_extent->reservation_size = reservation_size;
447     map_extent->padding_for_alignment = padding_for_alignment;
448     map_extent->bucket = &metadata->bucket;
449   }
450 
451   PartitionRootLock(root).AssertAcquired();
452 
453   // Maintain the doubly-linked list of all direct mappings.
454   map_extent->next_extent = root->direct_map_list;
455   if (map_extent->next_extent) {
456     map_extent->next_extent->prev_extent = map_extent;
457   }
458   map_extent->prev_extent = nullptr;
459   root->direct_map_list = map_extent;
460 
461   return &page->slot_span_metadata;
462 }
463 
464 uint8_t ComputeSystemPagesPerSlotSpanPreferSmall(size_t slot_size) {
465   if (slot_size > MaxRegularSlotSpanSize()) {
466     // This is technically not needed, as for now all the larger slot sizes are
467     // multiples of the system page size.
468     return base::bits::AlignUp(slot_size, SystemPageSize()) / SystemPageSize();
469   }
470 
471 // Smaller slot spans waste less address space, and can also lower
472   // fragmentation:
473   // - Address space: This comes from fuller SuperPages (since the tail end of a
474 //   SuperPage is more likely to be used when the slot span is smaller). Also,
475   //   if a slot span is partially used, a smaller slot span will use less
476   //   address space.
477   // - In-slot fragmentation: Slot span management code will prioritize
478   //   almost-full slot spans, as well as trying to keep empty slot spans
479   //   empty. The more granular this logic can work, the better.
480   //
481   // Since metadata space overhead is constant per-PartitionPage, keeping
482   // smaller slot spans makes sense.
483   //
484   // Underlying memory allocation is done per-PartitionPage, but memory commit
485   // is done per system page. This means that we prefer to fill the entirety of
486   // a PartitionPage with a slot span, but we can tolerate some system pages
487   // being empty at the end, as these will not cost committed or dirty memory.
488   //
489   // The choice below is, for multi-slot slot spans:
490   // - If a full PartitionPage slot span is possible with less than 2% of a
491   //   *single* system page wasted, use it. The smallest possible size wins.
492   // - Otherwise, select the size with the smallest virtual address space
493   //   loss. Allow a SlotSpan to leave some slack in its PartitionPage, up to
494   //   1/4 of the total.
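  // A hedged worked example, assuming 4 KiB system pages and 16 KiB partition
  // pages (NumSystemPagesPerPartitionPage() == 4): for slot_size == 96, one
  // partition page holds 170 slots and wastes 16384 - 170 * 96 = 64 bytes,
  // which is under the 2%-of-a-system-page threshold (~82 bytes), so the
  // first loop below returns 4 system pages.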
495   for (size_t partition_page_count = 1;
496        partition_page_count <= kMaxPartitionPagesPerRegularSlotSpan;
497        partition_page_count++) {
498     size_t candidate_size = partition_page_count * PartitionPageSize();
499     size_t waste = candidate_size % slot_size;
500     if (waste <= .02 * SystemPageSize()) {
501       return partition_page_count * NumSystemPagesPerPartitionPage();
502     }
503   }
504 
505   size_t best_count = 0;
506   size_t best_waste = std::numeric_limits<size_t>::max();
507   for (size_t partition_page_count = 1;
508        partition_page_count <= kMaxPartitionPagesPerRegularSlotSpan;
509        partition_page_count++) {
510     // Prefer no slack.
511     for (size_t slack = 0; slack < partition_page_count; slack++) {
512       size_t system_page_count =
513           partition_page_count * NumSystemPagesPerPartitionPage() - slack;
514       size_t candidate_size = system_page_count * SystemPageSize();
515       size_t waste = candidate_size % slot_size;
516       if (waste < best_waste) {
517         best_waste = waste;
518         best_count = system_page_count;
519       }
520     }
521   }
522   return best_count;
523 }
524 
525 uint8_t ComputeSystemPagesPerSlotSpanInternal(size_t slot_size) {
526   // This works out reasonably for the current bucket sizes of the generic
527   // allocator, and the current values of partition page size and constants.
528   // Specifically, we have enough room to always pack the slots perfectly into
529   // some number of system pages. The only waste is the waste associated with
530   // unfaulted pages (i.e. wasted address space).
531   // TODO: we end up using a lot of system pages for very small sizes. For
532   // example, we'll use 12 system pages for slot size 24. The slot size is so
533   // small that the waste would be tiny with just 4, or 1, system pages.  Later,
534   // we can investigate whether there are anti-fragmentation benefits to using
535   // fewer system pages.
536   double best_waste_ratio = 1.0f;
537   uint16_t best_pages = 0;
538   if (slot_size > MaxRegularSlotSpanSize()) {
539     // TODO(ajwong): Why is there a DCHECK here for this?
540     // http://crbug.com/776537
541     PA_DCHECK(!(slot_size % SystemPageSize()));
542     best_pages = static_cast<uint16_t>(slot_size >> SystemPageShift());
543     PA_CHECK(best_pages <= std::numeric_limits<uint8_t>::max());
544     return static_cast<uint8_t>(best_pages);
545   }
546   PA_DCHECK(slot_size <= MaxRegularSlotSpanSize());
547   for (uint16_t i = NumSystemPagesPerPartitionPage() - 1;
548        i <= MaxSystemPagesPerRegularSlotSpan(); ++i) {
549     size_t page_size = i << SystemPageShift();
550     size_t num_slots = page_size / slot_size;
551     size_t waste = page_size - (num_slots * slot_size);
552     // Leaving a page unfaulted is not free; the page will occupy an empty page
553     // table entry.  Make a simple attempt to account for that.
554     //
555     // TODO(ajwong): This looks wrong. PTEs are allocated for all pages
556     // regardless of whether or not they are wasted. Should it just
557     // be waste += i * sizeof(void*)?
558     // http://crbug.com/776537
559     size_t num_remainder_pages = i & (NumSystemPagesPerPartitionPage() - 1);
560     size_t num_unfaulted_pages =
561         num_remainder_pages
562             ? (NumSystemPagesPerPartitionPage() - num_remainder_pages)
563             : 0;
564     waste += sizeof(void*) * num_unfaulted_pages;
565     double waste_ratio =
566         static_cast<double>(waste) / static_cast<double>(page_size);
567     if (waste_ratio < best_waste_ratio) {
568       best_waste_ratio = waste_ratio;
569       best_pages = i;
570     }
571   }
572   PA_DCHECK(best_pages > 0);
573   PA_CHECK(best_pages <= MaxSystemPagesPerRegularSlotSpan());
574   return static_cast<uint8_t>(best_pages);
575 }
576 
577 #if PA_CONFIG(HAS_MEMORY_TAGGING)
578 // Returns size that should be tagged. Avoiding the previous slot ref count if
579 // it exists to avoid a race (crbug.com/1445816).
580 PA_ALWAYS_INLINE size_t TagSizeForSlot(PartitionRoot* root, size_t slot_size) {
581 #if PA_CONFIG(INCREASE_REF_COUNT_SIZE_FOR_MTE)
582   return slot_size - root->settings.ref_count_size;
583 #else
584   return slot_size;
585 #endif
586 }
587 #endif  // PA_CONFIG(HAS_MEMORY_TAGGING)
588 
589 }  // namespace
590 
591 uint8_t ComputeSystemPagesPerSlotSpan(size_t slot_size,
592                                       bool prefer_smaller_slot_spans) {
593   if (prefer_smaller_slot_spans) {
594     size_t system_page_count =
595         ComputeSystemPagesPerSlotSpanPreferSmall(slot_size);
596     size_t waste = (system_page_count * SystemPageSize()) % slot_size;
597     // In case the waste is too large (more than 5% of a page), don't try to use
598     // the "small" slot span formula. This happens when we have a lot of
599     // buckets; in some cases the formula doesn't find a nice, small size.
600     if (waste <= .05 * SystemPageSize()) {
601       return system_page_count;
602     }
603   }
604 
605   return ComputeSystemPagesPerSlotSpanInternal(slot_size);
606 }
607 
608 void PartitionBucket::Init(uint32_t new_slot_size) {
609   slot_size = new_slot_size;
610   slot_size_reciprocal = kReciprocalMask / new_slot_size + 1;
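  // The reciprocal turns "offset within a slot span" -> "slot index" into a
  // multiply-and-shift instead of a division. A hedged sketch of the idea
  // (the actual constants and the user of this field live in
  // partition_bucket.h):
  //   index = (offset_in_span * slot_size_reciprocal) >> kReciprocalShift;
  // which matches offset_in_span / slot_size for offsets that can occur
  // within a slot span, because the +1 above rounds the reciprocal up.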
611   active_slot_spans_head = SlotSpanMetadata::get_sentinel_slot_span_non_const();
612   empty_slot_spans_head = nullptr;
613   decommitted_slot_spans_head = nullptr;
614   num_full_slot_spans = 0;
615   bool prefer_smaller_slot_spans =
616 #if PA_CONFIG(PREFER_SMALLER_SLOT_SPANS)
617       true
618 #else
619       false
620 #endif
621       ;
622   num_system_pages_per_slot_span =
623       ComputeSystemPagesPerSlotSpan(slot_size, prefer_smaller_slot_spans);
624 }
625 
626 PA_ALWAYS_INLINE SlotSpanMetadata* PartitionBucket::AllocNewSlotSpan(
627     PartitionRoot* root,
628     AllocFlags flags,
629     size_t slot_span_alignment) {
630   PA_DCHECK(!(root->next_partition_page % PartitionPageSize()));
631   PA_DCHECK(!(root->next_partition_page_end % PartitionPageSize()));
632 
633   size_t num_partition_pages = get_pages_per_slot_span();
634   size_t slot_span_reservation_size = num_partition_pages
635                                       << PartitionPageShift();
636   size_t slot_span_committed_size = get_bytes_per_span();
637   PA_DCHECK(num_partition_pages <= NumPartitionPagesPerSuperPage());
638   PA_DCHECK(slot_span_committed_size % SystemPageSize() == 0);
639   PA_DCHECK(slot_span_committed_size <= slot_span_reservation_size);
640 
641   uintptr_t adjusted_next_partition_page =
642       base::bits::AlignUp(root->next_partition_page, slot_span_alignment);
643   if (PA_UNLIKELY(adjusted_next_partition_page + slot_span_reservation_size >
644                   root->next_partition_page_end)) {
645     // AllocNewSuperPage() may crash (e.g. address space exhaustion); put data
646     // on the stack.
647     PA_DEBUG_DATA_ON_STACK("slotsize", slot_size);
648     PA_DEBUG_DATA_ON_STACK("spansize", slot_span_reservation_size);
649 
650     // In this case, we can no longer hand out pages from the current super page
651     // allocation. Get a new super page.
652     if (!AllocNewSuperPage(root, flags)) {
653       return nullptr;
654     }
655     // AllocNewSuperPage() updates root->next_partition_page, re-query.
656     adjusted_next_partition_page =
657         base::bits::AlignUp(root->next_partition_page, slot_span_alignment);
658     PA_CHECK(adjusted_next_partition_page + slot_span_reservation_size <=
659              root->next_partition_page_end);
660   }
661 
662   auto* gap_start_page = PartitionPage::FromAddr(root->next_partition_page);
663   auto* gap_end_page = PartitionPage::FromAddr(adjusted_next_partition_page);
664   for (auto* page = gap_start_page; page < gap_end_page; ++page) {
665     PA_DCHECK(!page->is_valid);
666     page->has_valid_span_after_this = 1;
667   }
668   root->next_partition_page =
669       adjusted_next_partition_page + slot_span_reservation_size;
670 
671   uintptr_t slot_span_start = adjusted_next_partition_page;
672   auto* slot_span = &gap_end_page->slot_span_metadata;
673   InitializeSlotSpan(slot_span);
674   // Now that slot span is initialized, it's safe to call FromSlotStart.
675   PA_DCHECK(slot_span == SlotSpanMetadata::FromSlotStart(slot_span_start));
676 
677   // System pages in the super page come in a decommitted state. Commit them
678   // before vending them back.
679   // If lazy commit is enabled, pages will be committed when provisioning slots,
680   // in ProvisionMoreSlotsAndAllocOne(), not here.
681   if (!kUseLazyCommit) {
682     PA_DEBUG_DATA_ON_STACK("slotsize", slot_size);
683     PA_DEBUG_DATA_ON_STACK("spansize", slot_span_reservation_size);
684     PA_DEBUG_DATA_ON_STACK("spancmt", slot_span_committed_size);
685 
686     root->RecommitSystemPagesForData(
687         slot_span_start, slot_span_committed_size,
688         PageAccessibilityDisposition::kRequireUpdate,
689         slot_size <= kMaxMemoryTaggingSize);
690   }
691 
692   PA_CHECK(get_slots_per_span() <= kMaxSlotsPerSlotSpan);
693 
694   // Double check that we had enough space in the super page for the new slot
695   // span.
696   PA_DCHECK(root->next_partition_page <= root->next_partition_page_end);
697 
698   return slot_span;
699 }
700 
701 uintptr_t PartitionBucket::AllocNewSuperPageSpan(PartitionRoot* root,
702                                                  size_t super_page_count,
703                                                  AllocFlags flags) {
704   PA_CHECK(super_page_count > 0);
705   PA_CHECK(super_page_count <=
706            std::numeric_limits<size_t>::max() / kSuperPageSize);
707   // Need a new super page. We want to allocate super pages in a contiguous
708   // address region as much as possible. This is important for not causing
709   // page table bloat and not fragmenting address spaces in 32 bit
710   // architectures.
711   uintptr_t requested_address = root->next_super_page;
712   pool_handle pool = root->ChoosePool();
713   uintptr_t super_page_span_start = ReserveMemoryFromPool(
714       pool, requested_address, super_page_count * kSuperPageSize);
715   if (PA_UNLIKELY(!super_page_span_start)) {
716     if (ContainsFlags(flags, AllocFlags::kReturnNull)) {
717       return 0;
718     }
719 
720     // Didn't manage to get a new uncommitted super page -> address space issue.
721     ::partition_alloc::internal::ScopedUnlockGuard unlock{
722         PartitionRootLock(root)};
723     PartitionOutOfMemoryMappingFailure(root, kSuperPageSize);
724   }
725 
726   uintptr_t super_page_span_end =
727       super_page_span_start + super_page_count * kSuperPageSize;
728   for (uintptr_t super_page = super_page_span_start;
729        super_page < super_page_span_end; super_page += kSuperPageSize) {
730     InitializeSuperPage(root, super_page, 0);
731   }
732   return super_page_span_start;
733 }
734 
735 PA_ALWAYS_INLINE uintptr_t
736 PartitionBucket::AllocNewSuperPage(PartitionRoot* root, AllocFlags flags) {
737   auto super_page = AllocNewSuperPageSpan(root, 1, flags);
738   if (PA_UNLIKELY(!super_page)) {
739     // If the `kReturnNull` flag isn't set and the allocation attempt fails,
740     // `AllocNewSuperPageSpan` should've failed with an OOM crash.
741     PA_DCHECK(ContainsFlags(flags, AllocFlags::kReturnNull));
742     return 0;
743   }
744   return SuperPagePayloadBegin(super_page, root->IsQuarantineAllowed());
745 }
746 
747 PA_ALWAYS_INLINE uintptr_t
748 PartitionBucket::InitializeSuperPage(PartitionRoot* root,
749                                      uintptr_t super_page,
750                                      uintptr_t requested_address) {
751   *ReservationOffsetPointer(super_page) = kOffsetTagNormalBuckets;
752 
753   root->total_size_of_super_pages.fetch_add(kSuperPageSize,
754                                             std::memory_order_relaxed);
755 
756   root->next_super_page = super_page + kSuperPageSize;
757   uintptr_t state_bitmap =
758       super_page + PartitionPageSize() +
759       (is_direct_mapped() ? 0 : ReservedFreeSlotBitmapSize());
760 #if BUILDFLAG(USE_STARSCAN)
761   PA_DCHECK(SuperPageStateBitmapAddr(super_page) == state_bitmap);
762   const size_t state_bitmap_reservation_size =
763       root->IsQuarantineAllowed() ? ReservedStateBitmapSize() : 0;
764   const size_t state_bitmap_size_to_commit =
765       root->IsQuarantineAllowed() ? CommittedStateBitmapSize() : 0;
766   PA_DCHECK(state_bitmap_reservation_size % PartitionPageSize() == 0);
767   PA_DCHECK(state_bitmap_size_to_commit % SystemPageSize() == 0);
768   PA_DCHECK(state_bitmap_size_to_commit <= state_bitmap_reservation_size);
769   uintptr_t payload = state_bitmap + state_bitmap_reservation_size;
770 #else
771   uintptr_t payload = state_bitmap;
772 #endif  // BUILDFLAG(USE_STARSCAN)
773 
774   root->next_partition_page = payload;
775   root->next_partition_page_end = root->next_super_page - PartitionPageSize();
776   PA_DCHECK(payload ==
777             SuperPagePayloadBegin(super_page, root->IsQuarantineAllowed()));
778   PA_DCHECK(root->next_partition_page_end == SuperPagePayloadEnd(super_page));
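  // A hedged sketch of the resulting super page layout (normal buckets,
  // quarantine bitmaps and ref-count/shadow pages omitted):
  //
  //   | guard/metadata partition page | payload partition pages ... | guard |
  //
  // Within the first partition page, only the metadata "island" (the system
  // page recommitted just below) becomes accessible; the rest stays
  // inaccessible and acts as a guard region.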
779 
780   // Keep the first partition page in the super page inaccessible to serve as a
781   // guard page, except an "island" in the middle where we put page metadata and
782   // also a tiny amount of extent metadata.
783   {
784     ScopedSyscallTimer timer{root};
785     RecommitSystemPages(super_page + SystemPageSize(), SystemPageSize(),
786 #if PA_CONFIG(ENABLE_SHADOW_METADATA)
787                         root->PageAccessibilityWithThreadIsolationIfEnabled(
788                             PageAccessibilityConfiguration::kRead),
789 #else
790                         root->PageAccessibilityWithThreadIsolationIfEnabled(
791                             PageAccessibilityConfiguration::kReadWrite),
792 #endif
793                         PageAccessibilityDisposition::kRequireUpdate);
794   }
795 
796 #if BUILDFLAG(PUT_REF_COUNT_IN_PREVIOUS_SLOT)
797   // If PUT_REF_COUNT_IN_PREVIOUS_SLOT is on, and if the BRP pool is
798   // used, allocate a SystemPage for RefCount bitmap.
799   if (root->ChoosePool() == kBRPPoolHandle) {
800     ScopedSyscallTimer timer{root};
801     RecommitSystemPages(super_page + SystemPageSize() * 2, SystemPageSize(),
802                         root->PageAccessibilityWithThreadIsolationIfEnabled(
803                             PageAccessibilityConfiguration::kReadWrite),
804                         PageAccessibilityDisposition::kRequireUpdate);
805   }
806 #endif
807 
808 #if PA_CONFIG(ENABLE_SHADOW_METADATA)
809   {
810     ScopedSyscallTimer timer{root};
811     RecommitSystemPages(ShadowMetadataStart(super_page, root->ChoosePool()),
812                         SystemPageSize(),
813                         root->PageAccessibilityWithThreadIsolationIfEnabled(
814                             PageAccessibilityConfiguration::kReadWrite),
815                         PageAccessibilityDisposition::kRequireUpdate);
816   }
817 #endif
818 
819   // If we were after a specific address, but didn't get it, assume that
820   // the system chose a lousy address. Here most OS'es have a default
821   // algorithm that isn't randomized. For example, most Linux
822   // distributions will allocate the mapping directly before the last
823   // successful mapping, which is far from random. So we just get fresh
824   // randomness for the next mapping attempt.
825   if (requested_address && requested_address != super_page) {
826     root->next_super_page = 0;
827   }
828 
829   // We allocated a new super page so update super page metadata.
830   // First check if this is a new extent or not.
831   auto* latest_extent = PartitionSuperPageToExtent(super_page);
832   // By storing the root in every extent metadata object, we have a fast way
833   // to go from a pointer within the partition to the root object.
834   latest_extent->root = root;
835   // Most new extents will be part of a larger extent, and these two fields
836   // are unused, but we initialize them to 0 so that we get a clear signal
837   // in case they are accidentally used.
838   latest_extent->number_of_consecutive_super_pages = 0;
839   latest_extent->next = nullptr;
840   latest_extent->number_of_nonempty_slot_spans = 0;
841 
842   PartitionSuperPageExtentEntry* current_extent = root->current_extent;
843   const bool is_new_extent = super_page != requested_address;
844   if (PA_UNLIKELY(is_new_extent)) {
845     if (PA_UNLIKELY(!current_extent)) {
846       PA_DCHECK(!root->first_extent);
847       root->first_extent = latest_extent;
848     } else {
849       PA_DCHECK(current_extent->number_of_consecutive_super_pages);
850       current_extent->next = latest_extent;
851     }
852     root->current_extent = latest_extent;
853     latest_extent->number_of_consecutive_super_pages = 1;
854   } else {
855     // We allocated next to an existing extent so just nudge the size up a
856     // little.
857     PA_DCHECK(current_extent->number_of_consecutive_super_pages);
858     ++current_extent->number_of_consecutive_super_pages;
859     PA_DCHECK(payload > SuperPagesBeginFromExtent(current_extent) &&
860               payload < SuperPagesEndFromExtent(current_extent));
861   }
862 
863   // If PCScan is used, commit the state bitmap. Otherwise, leave it uncommitted
864   // and let PartitionRoot::RegisterScannableRoot() commit it when needed. Make
865   // sure to register the super-page after it has been fully initialized.
866   // Otherwise, the concurrent scanner may try to access |extent->root| which
867   // could be not initialized yet.
868 #if BUILDFLAG(USE_STARSCAN)
869   if (root->IsQuarantineEnabled()) {
870     {
871       ScopedSyscallTimer timer{root};
872       RecommitSystemPages(state_bitmap, state_bitmap_size_to_commit,
873                           root->PageAccessibilityWithThreadIsolationIfEnabled(
874                               PageAccessibilityConfiguration::kReadWrite),
875                           PageAccessibilityDisposition::kRequireUpdate);
876     }
877     PCScan::RegisterNewSuperPage(root, super_page);
878   }
879 #endif  // BUILDFLAG(USE_STARSCAN)
880 
881 #if BUILDFLAG(USE_FREESLOT_BITMAP)
882   // Commit the pages for freeslot bitmap.
883   if (!is_direct_mapped()) {
884     uintptr_t freeslot_bitmap_addr = super_page + PartitionPageSize();
885     PA_DCHECK(SuperPageFreeSlotBitmapAddr(super_page) == freeslot_bitmap_addr);
886     ScopedSyscallTimer timer{root};
887     RecommitSystemPages(freeslot_bitmap_addr, CommittedFreeSlotBitmapSize(),
888                         root->PageAccessibilityWithThreadIsolationIfEnabled(
889                             PageAccessibilityConfiguration::kReadWrite),
890                         PageAccessibilityDisposition::kRequireUpdate);
891   }
892 #endif
893 
894   return payload;
895 }
896 
897 PA_ALWAYS_INLINE void PartitionBucket::InitializeSlotSpan(
898     SlotSpanMetadata* slot_span) {
899   new (slot_span) SlotSpanMetadata(this);
900 
901   slot_span->Reset();
902 
903   uint16_t num_partition_pages = get_pages_per_slot_span();
904   auto* page = reinterpret_cast<PartitionPage*>(slot_span);
905   for (uint16_t i = 0; i < num_partition_pages; ++i, ++page) {
906     PA_DCHECK(i <= PartitionPage::kMaxSlotSpanMetadataOffset);
907     page->slot_span_metadata_offset = i;
908     page->is_valid = true;
909   }
910 }
911 
912 PA_ALWAYS_INLINE uintptr_t
913 PartitionBucket::ProvisionMoreSlotsAndAllocOne(PartitionRoot* root,
914                                                AllocFlags flags,
915                                                SlotSpanMetadata* slot_span) {
916   PA_DCHECK(slot_span != SlotSpanMetadata::get_sentinel_slot_span());
917   size_t num_slots = slot_span->num_unprovisioned_slots;
918   PA_DCHECK(num_slots);
919   PA_DCHECK(num_slots <= get_slots_per_span());
920   // We should only get here when _every_ slot is either used or unprovisioned.
921   // (The third possible state is "on the freelist". If we have a non-empty
922   // freelist, we should not get here.)
923   PA_DCHECK(num_slots + slot_span->num_allocated_slots == get_slots_per_span());
924   // Similarly, explicitly make sure that the freelist is empty.
925   PA_DCHECK(!slot_span->get_freelist_head());
926   PA_DCHECK(!slot_span->is_full());
927 
928   uintptr_t slot_span_start = SlotSpanMetadata::ToSlotSpanStart(slot_span);
929   // If we got here, the first unallocated slot is either partially or fully on
930   // an uncommitted page. If the latter, it must be at the start of that page.
931   uintptr_t return_slot =
932       slot_span_start + (slot_size * slot_span->num_allocated_slots);
933   uintptr_t next_slot = return_slot + slot_size;
934   uintptr_t commit_start = base::bits::AlignUp(return_slot, SystemPageSize());
935   PA_DCHECK(next_slot > commit_start);
936   uintptr_t commit_end = base::bits::AlignUp(next_slot, SystemPageSize());
937   // If the slot was partially committed, |return_slot| and |next_slot| fall
938   // in different pages. If the slot was fully uncommitted, |return_slot| points
939   // to the page start and |next_slot| doesn't, thus only the latter gets
940   // rounded up.
941   PA_DCHECK(commit_end > commit_start);
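  // A hedged worked example, assuming 4 KiB system pages and slot_size ==
  // 3072: with one slot already allocated, return_slot is at offset 3072 and
  // next_slot at 6144, so commit_start == 4096 and commit_end == 8192. One
  // new system page gets committed, and (8192 - 3072) / 3072 == 1, so only
  // the returned slot is provisioned and no freelist entry is added below.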
942 
943   // If lazy commit is enabled, meaning system pages in the slot span come
944   // in an initially decommitted state, commit them here.
945   // Note, we can't use PageAccessibilityDisposition::kAllowKeepForPerf, because
946   // we have no knowledge which pages have been committed before (it doesn't
947   // matter on Windows anyway).
948   if (kUseLazyCommit) {
949     const bool ok = root->TryRecommitSystemPagesForDataLocked(
950         commit_start, commit_end - commit_start,
951         PageAccessibilityDisposition::kRequireUpdate,
952         slot_size <= kMaxMemoryTaggingSize);
953     if (!ok) {
954       if (!ContainsFlags(flags, AllocFlags::kReturnNull)) {
955         ScopedUnlockGuard unlock{PartitionRootLock(root)};
956         PartitionOutOfMemoryCommitFailure(root, slot_size);
957       }
958       return 0;
959     }
960   }
961 
962   // The slot being returned is considered allocated.
963   slot_span->num_allocated_slots++;
964   // Round down, because a slot that doesn't fully fit in the new page(s) isn't
965   // provisioned.
966   size_t slots_to_provision = (commit_end - return_slot) / slot_size;
967   slot_span->num_unprovisioned_slots -= slots_to_provision;
968   PA_DCHECK(slot_span->num_allocated_slots +
969                 slot_span->num_unprovisioned_slots <=
970             get_slots_per_span());
971 
972 #if PA_CONFIG(HAS_MEMORY_TAGGING)
973   const bool use_tagging =
974       root->IsMemoryTaggingEnabled() && slot_size <= kMaxMemoryTaggingSize;
975   if (PA_LIKELY(use_tagging)) {
976     // Ensure the MTE-tag of the memory pointed by |return_slot| is unguessable.
977     TagMemoryRangeRandomly(return_slot, TagSizeForSlot(root, slot_size));
978   }
979 #endif  // PA_CONFIG(HAS_MEMORY_TAGGING)
980   // Add all slots that fit within so far committed pages to the free list.
981   EncodedNextFreelistEntry* prev_entry = nullptr;
982   uintptr_t next_slot_end = next_slot + slot_size;
983   size_t free_list_entries_added = 0;
984   while (next_slot_end <= commit_end) {
985     void* next_slot_ptr;
986 #if PA_CONFIG(HAS_MEMORY_TAGGING)
987     if (PA_LIKELY(use_tagging)) {
988       // Ensure the MTE-tag of the memory pointed by other provisioned slot is
989       // unguessable. They will be returned to the app as is, and the MTE-tag
990       // will only change upon calling Free().
991       next_slot_ptr =
992           TagMemoryRangeRandomly(next_slot, TagSizeForSlot(root, slot_size));
993     } else {
994       // No MTE-tagging for larger slots, just cast.
995       next_slot_ptr = reinterpret_cast<void*>(next_slot);
996     }
997 #else  // PA_CONFIG(HAS_MEMORY_TAGGING)
998     next_slot_ptr = reinterpret_cast<void*>(next_slot);
999 #endif
1000     auto* entry = EncodedNextFreelistEntry::EmplaceAndInitNull(next_slot_ptr);
1001     if (!slot_span->get_freelist_head()) {
1002       PA_DCHECK(!prev_entry);
1003       PA_DCHECK(!free_list_entries_added);
1004       slot_span->SetFreelistHead(entry);
1005     } else {
1006       PA_DCHECK(free_list_entries_added);
1007       prev_entry->SetNext(entry);
1008     }
1009 #if BUILDFLAG(USE_FREESLOT_BITMAP)
1010     FreeSlotBitmapMarkSlotAsFree(next_slot);
1011 #endif
1012     next_slot = next_slot_end;
1013     next_slot_end = next_slot + slot_size;
1014     prev_entry = entry;
1015 #if BUILDFLAG(PA_DCHECK_IS_ON)
1016     free_list_entries_added++;
1017 #endif
1018   }
1019 
1020 #if BUILDFLAG(USE_FREESLOT_BITMAP)
1021   FreeSlotBitmapMarkSlotAsFree(return_slot);
1022 #endif
1023 
1024 #if BUILDFLAG(PA_DCHECK_IS_ON)
1025   // The only provisioned slot not added to the free list is the one being
1026   // returned.
1027   PA_DCHECK(slots_to_provision == free_list_entries_added + 1);
1028   // We didn't necessarily provision more than one slot (e.g. if |slot_size|
1029   // is large), meaning that |slot_span->freelist_head| can be nullptr.
1030   if (slot_span->get_freelist_head()) {
1031     PA_DCHECK(free_list_entries_added);
1032     slot_span->get_freelist_head()->CheckFreeList(slot_size);
1033   }
1034 #endif
1035 
1036   // We had no free slots, and created some (potentially 0) in sorted order.
1037   slot_span->set_freelist_sorted();
1038 
1039   return return_slot;
1040 }
1041 
1042 bool PartitionBucket::SetNewActiveSlotSpan() {
1043   SlotSpanMetadata* slot_span = active_slot_spans_head;
1044   if (slot_span == SlotSpanMetadata::get_sentinel_slot_span()) {
1045     return false;
1046   }
1047 
1048   SlotSpanMetadata* next_slot_span;
1049 
1050   // The goal here is to find a suitable slot span in the active list. Suitable
1051   // slot spans are |is_active()|, i.e. they either have (a) freelist entries,
1052   // or (b) unprovisioned free space. The first case is preferable, since it
1053   // doesn't cost a system call, and doesn't cause new memory to become dirty.
1054   //
1055   // While looking for a new slot span, active list maintenance is performed,
1056   // that is:
1057   // - Empty and decommitted slot spans are moved to their respective lists.
1058   // - Full slot spans are removed from the active list but are not moved
1059   //   anywhere. They could be tracked in a separate list, but this would
1060   //   increase cost non-trivially. Indeed, a full slot span is likely to become
1061   //   non-full at some point (due to a free() hitting it). Since we only have
1062   //   space in the metadata for a single linked list pointer, removing the
1063   //   newly-non-full slot span from the "full" list would require walking it
1064   //   (to know what's before it in the full list).
1065   //
1066   // Since we prefer slot spans with provisioned freelist entries, maintenance
1067   // happens in two stages:
1068   // 1. Walk the list to find candidates. Each skipped slot span is moved
1069   //    to either:
1070   //   - one of the long-lived lists: empty, decommitted
1071   //   - the temporary "active slots spans with no freelist entry" list
1072   //   - Nowhere for full slot spans.
1073   // 2. Once we have a candidate:
1074   //   - Set it as the new active list head
1075   //   - Reattach the temporary list
1076   //
1077   // Note that in most cases, the whole list will not be walked and maintained
1078   // at this stage.
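  // A hedged walk-through: with an active list A(full) -> B(no freelist
  // entries, has unprovisioned slots) -> C(has freelist entries) -> D, the
  // loop below accounts A as full, parks B on the temporary list, and stops
  // at C. The reattachment step then yields an active list of C -> B -> D,
  // with C as the new head.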
1079 
1080   SlotSpanMetadata* to_provision_head = nullptr;
1081   SlotSpanMetadata* to_provision_tail = nullptr;
1082 
1083   for (; slot_span; slot_span = next_slot_span) {
1084     next_slot_span = slot_span->next_slot_span;
1085     PA_DCHECK(slot_span->bucket == this);
1086     PA_DCHECK(slot_span != empty_slot_spans_head);
1087     PA_DCHECK(slot_span != decommitted_slot_spans_head);
1088 
1089     if (slot_span->is_active()) {
1090       // Has provisioned slots.
1091       if (slot_span->get_freelist_head()) {
1092         // Will use this slot span, no need to go further.
1093         break;
1094       } else {
1095         // Keeping head and tail because we don't want to reverse the list.
1096         if (!to_provision_head) {
1097           to_provision_head = slot_span;
1098         }
1099         if (to_provision_tail) {
1100           to_provision_tail->next_slot_span = slot_span;
1101         }
1102         to_provision_tail = slot_span;
1103         slot_span->next_slot_span = nullptr;
1104       }
1105     } else if (slot_span->is_empty()) {
1106       slot_span->next_slot_span = empty_slot_spans_head;
1107       empty_slot_spans_head = slot_span;
1108     } else if (PA_LIKELY(slot_span->is_decommitted())) {
1109       slot_span->next_slot_span = decommitted_slot_spans_head;
1110       decommitted_slot_spans_head = slot_span;
1111     } else {
1112       PA_DCHECK(slot_span->is_full());
1113       // Move this slot span... nowhere, and also mark it as full. We need it
1114       // marked so that free'ing can tell, and move it back into the active
1115       // list.
1116       slot_span->marked_full = 1;
1117       ++num_full_slot_spans;
1118       // Overflow. Most likely a correctness issue in the code.  It is in theory
1119       // possible that the number of full slot spans really reaches (1 << 24),
1120       // but this is very unlikely (and not possible with most pool settings).
1121       PA_CHECK(num_full_slot_spans);
1122       // Not necessary but might help stop accidents.
1123       slot_span->next_slot_span = nullptr;
1124     }
1125   }
1126 
1127   bool usable_active_list_head = false;
1128   // Found an active slot span with provisioned entries on the freelist.
1129   if (slot_span) {
1130     usable_active_list_head = true;
1131     // We have active slot spans with unprovisioned entries. Re-attach them into
1132     // the active list, past the span with freelist entries.
1133     if (to_provision_head) {
1134       auto* next = slot_span->next_slot_span;
1135       slot_span->next_slot_span = to_provision_head;
1136       to_provision_tail->next_slot_span = next;
1137     }
1138     active_slot_spans_head = slot_span;
1139   } else if (to_provision_head) {
1140     usable_active_list_head = true;
1141     // Need to provision new slots.
1142     active_slot_spans_head = to_provision_head;
1143   } else {
1144     // Active list is now empty.
1145     active_slot_spans_head =
1146         SlotSpanMetadata::get_sentinel_slot_span_non_const();
1147   }
1148 
1149   return usable_active_list_head;
1150 }
1151 
1152 void PartitionBucket::MaintainActiveList() {
1153   SlotSpanMetadata* slot_span = active_slot_spans_head;
1154   if (slot_span == SlotSpanMetadata::get_sentinel_slot_span()) {
1155     return;
1156   }
1157 
1158   SlotSpanMetadata* new_active_slot_spans_head = nullptr;
1159   SlotSpanMetadata* new_active_slot_spans_tail = nullptr;
1160 
1161   SlotSpanMetadata* next_slot_span;
1162   for (; slot_span; slot_span = next_slot_span) {
1163     next_slot_span = slot_span->next_slot_span;
1164 
1165     if (slot_span->is_active()) {
1166       // Ordering in the active slot span list matters, don't reverse it.
1167       if (!new_active_slot_spans_head) {
1168         new_active_slot_spans_head = slot_span;
1169       }
1170       if (new_active_slot_spans_tail) {
1171         new_active_slot_spans_tail->next_slot_span = slot_span;
1172       }
1173       new_active_slot_spans_tail = slot_span;
1174       slot_span->next_slot_span = nullptr;
1175     } else if (slot_span->is_empty()) {
      // For the empty and decommitted lists, LIFO ordering makes sense, since
      // it leads to reusing memory which has been touched relatively recently
      // (this only matters for committed spans, though).
      slot_span->next_slot_span = empty_slot_spans_head;
      empty_slot_spans_head = slot_span;
    } else if (slot_span->is_decommitted()) {
      slot_span->next_slot_span = decommitted_slot_spans_head;
      decommitted_slot_spans_head = slot_span;
    } else {
      // Full slot spans are not tracked, just accounted for.
      PA_DCHECK(slot_span->is_full());
      slot_span->marked_full = 1;
      ++num_full_slot_spans;
      PA_CHECK(num_full_slot_spans);  // Overflow.
      slot_span->next_slot_span = nullptr;
    }
  }

  if (!new_active_slot_spans_head) {
    new_active_slot_spans_head =
        SlotSpanMetadata::get_sentinel_slot_span_non_const();
  }
  active_slot_spans_head = new_active_slot_spans_head;
}

void PartitionBucket::SortSmallerSlotSpanFreeLists() {
  for (auto* slot_span = active_slot_spans_head; slot_span;
       slot_span = slot_span->next_slot_span) {
    // No need to sort the freelist if it's already sorted. Note that if the
    // freelist is sorted, this means that it didn't change at all since the
    // last call. This may be a good signal to shrink it if possible (if an
    // entire OS page is free, we can decommit it).
    //
    // Besides saving CPU, this also avoids touching memory of fully idle slot
    // spans, which may require paging.
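    //
    // When a sort does happen, SortFreelist() is expected to rebuild the
    // list in slot address order, so that subsequent allocations are served
    // from the lowest slots first.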
    if (slot_span->num_allocated_slots > 0 &&
        !slot_span->freelist_is_sorted()) {
      slot_span->SortFreelist();
    }
  }
}

PA_COMPONENT_EXPORT(PARTITION_ALLOC)
bool CompareSlotSpans(SlotSpanMetadata* a, SlotSpanMetadata* b) {
  auto criteria_tuple = [](SlotSpanMetadata const* a) {
    size_t freelist_length = a->GetFreelistLength();
    // The criteria are, in order (hence the lexicographic comparison below):
    // 1. Prefer slot spans with freelist entries. The ones without freelist
    //    entries would be skipped in SetNewActiveSlotSpan() anyway.
    // 2. Then the ones with the fewest freelist entries. They are either close
    //    to being full (for the provisioned memory), or close to being pushed
    //    at the end of the list (since they would not have freelist entries
    //    anymore, and would either fall into the first case, or be skipped by
    //    SetNewActiveSlotSpan()).
    // 3. The ones with the fewest unprovisioned slots, meaning that they are
    //    close to being completely full.
    //
    // Note that this sorting order is not necessarily the best one when slot
    // spans are partially provisioned. From local testing, in steady-state,
    // most slot spans are entirely provisioned (or decommitted), which may be a
    // consequence of the lack of partial slot span decommit, or of fairly
    // effective fragmentation avoidance heuristics. Make sure to evaluate
    // whether an alternative sorting order (sorting according to freelist size
    // + unprovisioned slots) makes more sense.
    return std::tuple<bool, size_t, size_t>{
        freelist_length == 0, freelist_length, a->num_unprovisioned_slots};
  };
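  // Resulting lexicographic order: any span with a non-empty freelist sorts
  // before a span with an empty one, regardless of unprovisioned slot counts;
  // among spans with freelist entries, one with 2 entries sorts before one
  // with 5.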

  return criteria_tuple(a) < criteria_tuple(b);
}

void PartitionBucket::SortActiveSlotSpans() {
  // Sorting up to |kMaxSlotSpansToSort| slot spans. This is capped for two
  // reasons:
  // - Limiting execution time.
  // - This code cannot allocate memory, so a fixed-size array is used.
  //
  // In practice though, it's rare to have that many active slot spans.
  SlotSpanMetadata* active_spans_array[kMaxSlotSpansToSort];
  size_t index = 0;
  SlotSpanMetadata* overflow_spans_start = nullptr;

  for (auto* slot_span = active_slot_spans_head; slot_span;
       slot_span = slot_span->next_slot_span) {
    if (index < kMaxSlotSpansToSort) {
      active_spans_array[index++] = slot_span;
    } else {
      // Starting from this one, not sorting the slot spans.
      overflow_spans_start = slot_span;
      break;
    }
  }
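  // Spans beyond the cap keep their existing relative order; they are
  // re-attached after the sorted prefix at the end of this function.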

  // We sort the active slot spans so that allocations are preferably serviced
  // from the fullest ones. This way we hope to reduce fragmentation by packing
  // allocations into as few slot spans as possible.
  //
  // With perfect information on allocation lifespan, we would be able to pack
  // allocations and get almost no fragmentation. This is obviously not the
  // case, so we have partially full SlotSpans. Nevertheless, as a heuristic we
  // want to:
  // - Keep almost-empty slot spans as empty as possible
  // - Keep mostly-full slot spans as full as possible
  //
  // The first part is done in the hope that future free()s will make these
  // slot spans completely empty, allowing us to reclaim them. To that end, sort
  // SlotSpans periodically so that the fullest ones are preferred.
  //
  // std::sort() is not completely guaranteed to never allocate memory. However,
  // it may not throw std::bad_alloc, which constrains the implementation. In
  // addition, this is protected by the reentrancy guard, so we would detect
  // such an allocation.
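  //
  // Illustration (hypothetical freelist lengths, equally provisioned spans):
  // spans with freelist lengths {0, 5, 1, 3} end up ordered as {1, 3, 5, 0},
  // i.e. the fullest usable spans first, spans without freelist entries last.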
  std::sort(active_spans_array, active_spans_array + index, CompareSlotSpans);

  active_slot_spans_head = overflow_spans_start;

  // Reverse order, since we insert at the head of the list.
  for (int i = index - 1; i >= 0; i--) {
    if (active_spans_array[i] == SlotSpanMetadata::get_sentinel_slot_span()) {
      // The sentinel is const, don't try to write to it.
      PA_DCHECK(active_slot_spans_head == nullptr);
    } else {
      active_spans_array[i]->next_slot_span = active_slot_spans_head;
    }
    active_slot_spans_head = active_spans_array[i];
  }
}

uintptr_t PartitionBucket::SlowPathAlloc(PartitionRoot* root,
                                         AllocFlags flags,
                                         size_t raw_size,
                                         size_t slot_span_alignment,
                                         bool* is_already_zeroed) {
  PA_DCHECK((slot_span_alignment >= PartitionPageSize()) &&
            std::has_single_bit(slot_span_alignment));

  // The slow path is called when the freelist is empty. The only exception is
  // when a higher-order alignment is requested, in which case the freelist
  // logic is bypassed and we go directly for slot span allocation.
  bool allocate_aligned_slot_span = slot_span_alignment > PartitionPageSize();
  PA_DCHECK(!active_slot_spans_head->get_freelist_head() ||
            allocate_aligned_slot_span);

  SlotSpanMetadata* new_slot_span = nullptr;
  // |new_slot_span->bucket| will always be |this|, except when |this| is the
  // sentinel bucket, which is used to signal a direct mapped allocation.  In
  // this case |new_bucket| will be set properly later. This avoids a read for
  // most allocations.
  PartitionBucket* new_bucket = this;
  *is_already_zeroed = false;

  // For the PartitionRoot::Alloc() API, we have a bunch of buckets
  // marked as special cases. We bounce them through to the slow path so that
  // we can still have a blazing fast hot path due to lack of corner-case
  // branches.
  //
  // Note: The ordering of the conditionals matters! In particular,
  // SetNewActiveSlotSpan() has a side effect even when returning false: it
  // sweeps the active list and may move things into the empty or decommitted
  // lists, which affects the subsequent conditional.
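  //
  // The fallback order below is: direct map (only reached via the sentinel
  // bucket), then the active list, then empty / decommitted slot spans, and
  // finally a brand new slot span.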
  if (PA_UNLIKELY(is_direct_mapped())) {
    PA_DCHECK(raw_size > kMaxBucketed);
    PA_DCHECK(this == &root->sentinel_bucket);
    PA_DCHECK(active_slot_spans_head ==
              SlotSpanMetadata::get_sentinel_slot_span());

    // No fast path for direct-mapped allocations.
    if (ContainsFlags(flags, AllocFlags::kFastPathOrReturnNull)) {
      return 0;
    }

    new_slot_span =
        PartitionDirectMap(root, flags, raw_size, slot_span_alignment);
    if (new_slot_span) {
      new_bucket = new_slot_span->bucket;
    }
    // Memory from PageAllocator is always zeroed.
    *is_already_zeroed = true;
  } else if (PA_LIKELY(!allocate_aligned_slot_span && SetNewActiveSlotSpan())) {
    // First, did we find an active slot span in the active list?
    new_slot_span = active_slot_spans_head;
    PA_DCHECK(new_slot_span->is_active());
  } else if (PA_LIKELY(!allocate_aligned_slot_span &&
                       (empty_slot_spans_head != nullptr ||
                        decommitted_slot_spans_head != nullptr))) {
    // Second, look in our lists of empty and decommitted slot spans.
    // Check empty slot spans first, which are preferred, but beware that an
    // empty slot span might have been decommitted.
    while (PA_LIKELY((new_slot_span = empty_slot_spans_head) != nullptr)) {
      PA_DCHECK(new_slot_span->bucket == this);
      PA_DCHECK(new_slot_span->is_empty() || new_slot_span->is_decommitted());
      empty_slot_spans_head = new_slot_span->next_slot_span;
      // Accept the empty slot span unless it got decommitted.
      if (new_slot_span->get_freelist_head()) {
        new_slot_span->next_slot_span = nullptr;
        new_slot_span->ToSuperPageExtent()
            ->IncrementNumberOfNonemptySlotSpans();

        // Re-activating an empty slot span, update accounting.
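        // e.g., with 4 KiB system pages, a provisioned size of 5000 bytes
        // accounts for 8192 dirty bytes.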
        size_t dirty_size = base::bits::AlignUp(
            new_slot_span->GetProvisionedSize(), SystemPageSize());
        PA_DCHECK(root->empty_slot_spans_dirty_bytes >= dirty_size);
        root->empty_slot_spans_dirty_bytes -= dirty_size;

        break;
      }
      PA_DCHECK(new_slot_span->is_decommitted());
      new_slot_span->next_slot_span = decommitted_slot_spans_head;
      decommitted_slot_spans_head = new_slot_span;
    }
    if (PA_UNLIKELY(!new_slot_span) &&
        PA_LIKELY(decommitted_slot_spans_head != nullptr)) {
      // Commit can be expensive, don't do it.
      if (ContainsFlags(flags, AllocFlags::kFastPathOrReturnNull)) {
        return 0;
      }

      new_slot_span = decommitted_slot_spans_head;
      PA_DCHECK(new_slot_span->bucket == this);
      PA_DCHECK(new_slot_span->is_decommitted());

      // If lazy commit is enabled, pages will be recommitted when provisioning
      // slots, in ProvisionMoreSlotsAndAllocOne(), not here.
      if (!kUseLazyCommit) {
        uintptr_t slot_span_start =
            SlotSpanMetadata::ToSlotSpanStart(new_slot_span);
        // Since lazy commit isn't used, we have a guarantee that all slot span
        // pages have been previously committed, and then decommitted using
        // PageAccessibilityDisposition::kAllowKeepForPerf, so use the
        // same option as an optimization.
        const bool ok = root->TryRecommitSystemPagesForDataLocked(
            slot_span_start, new_slot_span->bucket->get_bytes_per_span(),
            PageAccessibilityDisposition::kAllowKeepForPerf,
            slot_size <= kMaxMemoryTaggingSize);
        if (!ok) {
          if (!ContainsFlags(flags, AllocFlags::kReturnNull)) {
            ScopedUnlockGuard unlock{PartitionRootLock(root)};
            PartitionOutOfMemoryCommitFailure(
                root, new_slot_span->bucket->get_bytes_per_span());
          }
          return 0;
        }
      }

      decommitted_slot_spans_head = new_slot_span->next_slot_span;
      new_slot_span->Reset();
      *is_already_zeroed = DecommittedMemoryIsAlwaysZeroed();
    }
    PA_DCHECK(new_slot_span);
  } else {
    // Getting a new slot span is expensive, don't do it.
    if (ContainsFlags(flags, AllocFlags::kFastPathOrReturnNull)) {
      return 0;
    }

    // Third. If we get here, we need a brand new slot span.
    // TODO(bartekn): For single-slot slot spans, we can use rounded raw_size
    // as slot_span_committed_size.
    new_slot_span = AllocNewSlotSpan(root, flags, slot_span_alignment);
    // New memory from PageAllocator is always zeroed.
    *is_already_zeroed = true;
  }

  // Bail if we had a memory allocation failure.
  if (PA_UNLIKELY(!new_slot_span)) {
    PA_DCHECK(active_slot_spans_head ==
              SlotSpanMetadata::get_sentinel_slot_span());
    if (ContainsFlags(flags, AllocFlags::kReturnNull)) {
      return 0;
    }
    // See comment in PartitionDirectMap() for unlocking.
    ScopedUnlockGuard unlock{PartitionRootLock(root)};
    root->OutOfMemory(raw_size);
    PA_IMMEDIATE_CRASH();  // Not required, kept as documentation.
  }

  PA_DCHECK(new_bucket != &root->sentinel_bucket);
  new_bucket->active_slot_spans_head = new_slot_span;
  if (new_slot_span->CanStoreRawSize()) {
    new_slot_span->SetRawSize(raw_size);
  }

  // If we found an active slot span with free slots, or an empty slot span, we
  // have a usable freelist head.
  if (PA_LIKELY(new_slot_span->get_freelist_head() != nullptr)) {
    EncodedNextFreelistEntry* entry =
        new_slot_span->PopForAlloc(new_bucket->slot_size);

    // We may have set *is_already_zeroed to true above, make sure that the
    // freelist entry doesn't contain data. Either way, it wouldn't be a good
    // idea to let users see our internal data.
    uintptr_t slot_start = entry->ClearForAllocation();
    return slot_start;
  }

  // Otherwise, we need to provision more slots by committing more pages. Build
  // the free list for the newly provisioned slots.
  PA_DCHECK(new_slot_span->num_unprovisioned_slots);
  return ProvisionMoreSlotsAndAllocOne(root, flags, new_slot_span);
}

uintptr_t PartitionBucket::AllocNewSuperPageSpanForGwpAsan(
    PartitionRoot* root,
    size_t super_page_count,
    AllocFlags flags) {
  return AllocNewSuperPageSpan(root, super_page_count, flags);
}

void PartitionBucket::InitializeSlotSpanForGwpAsan(
    SlotSpanMetadata* slot_span) {
  InitializeSlotSpan(slot_span);
}

}  // namespace partition_alloc::internal