1 // Copyright 2018 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef BASE_ALLOCATOR_PARTITION_ALLOCATOR_PAGE_ALLOCATOR_INTERNALS_POSIX_H_
6 #define BASE_ALLOCATOR_PARTITION_ALLOCATOR_PAGE_ALLOCATOR_INTERNALS_POSIX_H_
7 
8 #include <algorithm>
9 #include <cerrno>
10 #include <cstdint>
11 #include <cstring>
12 
13 #include <sys/mman.h>
14 
15 #include "base/allocator/partition_allocator/oom.h"
16 #include "base/allocator/partition_allocator/page_allocator.h"
17 #include "base/allocator/partition_allocator/partition_alloc_base/debug/debugging_buildflags.h"
18 #include "base/allocator/partition_allocator/partition_alloc_base/posix/eintr_wrapper.h"
19 #include "base/allocator/partition_allocator/partition_alloc_check.h"
20 #include "base/allocator/partition_allocator/pkey.h"
21 #include "build/build_config.h"
22 
23 #if BUILDFLAG(IS_APPLE)
24 #include "base/allocator/partition_allocator/partition_alloc_base/mac/foundation_util.h"
25 #if BUILDFLAG(IS_IOS)
26 #include "base/allocator/partition_allocator/partition_alloc_base/ios/ios_util.h"
27 #elif BUILDFLAG(IS_MAC)
28 #include "base/allocator/partition_allocator/partition_alloc_base/mac/mac_util.h"
29 #else
30 #error "Unknown platform"
31 #endif
32 #include "base/allocator/partition_allocator/partition_alloc_base/mac/scoped_cftyperef.h"
33 
34 #include <Availability.h>
35 #include <Security/Security.h>
36 #include <mach/mach.h>
37 #endif
38 #if BUILDFLAG(IS_ANDROID) || BUILDFLAG(IS_LINUX)
39 #include <sys/prctl.h>
40 #endif
41 #if BUILDFLAG(IS_LINUX) || BUILDFLAG(IS_CHROMEOS)
42 #include <sys/resource.h>
43 #endif
44 
45 #ifndef MAP_ANONYMOUS
46 #define MAP_ANONYMOUS MAP_ANON
47 #endif
48 
49 #if BUILDFLAG(IS_MAC)
50 
51 // SecTaskGetCodeSignStatus is marked as unavailable on macOS, although it’s
52 // available on iOS and other Apple operating systems. It is, in fact, present
53 // on the system since macOS 10.12.
54 #pragma clang diagnostic push
55 #pragma clang diagnostic ignored "-Wavailability"
56 uint32_t SecTaskGetCodeSignStatus(SecTaskRef task) API_AVAILABLE(macos(10.12));
57 #pragma clang diagnostic pop
58 
59 #endif  // BUILDFLAG(IS_MAC)
60 
61 namespace partition_alloc::internal {
62 
63 namespace {
64 
65 #if BUILDFLAG(IS_ANDROID) || BUILDFLAG(IS_LINUX)
66 #if defined(PR_SET_VMA) && defined(PR_SET_VMA_ANON_NAME)
PageTagToName(PageTag tag)67 const char* PageTagToName(PageTag tag) {
68   // Important: All the names should be string literals. As per prctl.h in
69   // //third_party/android_ndk the kernel keeps a pointer to the name instead
70   // of copying it.
71   //
72   // Having the name in .rodata ensures that the pointer remains valid as
73   // long as the mapping is alive.
74   switch (tag) {
75     case PageTag::kBlinkGC:
76       return "blink_gc";
77     case PageTag::kPartitionAlloc:
78       return "partition_alloc";
79     case PageTag::kChromium:
80       return "chromium";
81     case PageTag::kV8:
82       return "v8";
83     default:
84       PA_DCHECK(false);
85       return "";
86   }
87 }
88 #endif
#endif  // BUILDFLAG(IS_ANDROID) || BUILDFLAG(IS_LINUX)
90 
91 #if BUILDFLAG(IS_MAC)
92 // Tests whether the version of macOS supports the MAP_JIT flag and if the
93 // current process is signed with the hardened runtime and the allow-jit
94 // entitlement, returning whether MAP_JIT should be used to allocate regions
95 // that will contain JIT-compiled executable code.
UseMapJit()96 bool UseMapJit() {
97   if (!base::mac::IsAtLeastOS10_14()) {
98     // MAP_JIT existed before macOS 10.14, but had somewhat different semantics.
99     // Only one MAP_JIT region was permitted per process, but calling code here
100     // will very likely require more than one such region. Since MAP_JIT is not
101     // strictly necessary to write code to a region and then execute it on these
102     // older OSes, don’t use it at all.
103     return false;
104   }
105 
106   // Until determining that the hardened runtime is enabled, early returns will
107   // return true, so that MAP_JIT will be used. This is important on arm64,
108   // which only allows pages to be simultaneously writable and executable when
109   // in a region allocated with MAP_JIT, regardless of code signing options. On
110   // arm64, an attempt to set a non-MAP_JIT page as simultaneously writable and
111   // executable fails with EPERM. Although this is not enforced on x86_64,
112   // MAP_JIT is harmless in that case.
113 
114   base::ScopedCFTypeRef<SecTaskRef> task(
115       SecTaskCreateFromSelf(kCFAllocatorDefault));
116   if (!task) {
117     return true;
118   }
119 
120   uint32_t flags = SecTaskGetCodeSignStatus(task);
121   if (!(flags & kSecCodeSignatureRuntime)) {
122     // The hardened runtime is not enabled. Note that kSecCodeSignatureRuntime
123     // == CS_RUNTIME.
124     return true;
125   }
126 
127   // The hardened runtime is enabled. From this point on, early returns must
128   // return false, indicating that MAP_JIT is not to be used. It’s an error
129   // (EINVAL) to use MAP_JIT with the hardened runtime unless the JIT
130   // entitlement is specified.
131 
132   base::ScopedCFTypeRef<CFTypeRef> jit_entitlement(
133       SecTaskCopyValueForEntitlement(
134           task.get(), CFSTR("com.apple.security.cs.allow-jit"), nullptr));
135   if (!jit_entitlement) {
136     return false;
137   }
138 
139   return base::mac::CFCast<CFBooleanRef>(jit_entitlement.get()) ==
140          kCFBooleanTrue;
141 }
142 #elif BUILDFLAG(IS_IOS)
bool UseMapJit() {
#if !TARGET_IPHONE_SIMULATOR
  // TODO(https://crbug.com/1413818): Fill this out when the API is
  // available.
  return false;
#else
  // The simulator supports MAP_JIT unconditionally, so it is always enabled
  // there.
  return true;
#endif  // !TARGET_IPHONE_SIMULATOR
}
153 #endif  // BUILDFLAG(IS_IOS)
154 }  // namespace
155 
// |mmap| uses a nearby address if the hint address is blocked.
constexpr bool kHintIsAdvisory = true;
// Receives the |errno| value observed when |mmap| fails inside
// SystemAllocPagesInternal() below; starts out as 0.
// NOTE(review): this header does not include <atomic> itself; presumably it is
// pulled in transitively by one of the includes above -- confirm.
std::atomic<int32_t> s_allocPageErrorCode{0};

// Only declared here; the definition is not part of this header. The functions
// below pass its result as the protection argument of |mmap|, |mprotect| and
// PkeyMprotectIfEnabled().
int GetAccessFlags(PageAccessibilityConfiguration accessibility);
161 
SystemAllocPagesInternal(uintptr_t hint,size_t length,PageAccessibilityConfiguration accessibility,PageTag page_tag,int file_descriptor_for_shared_alloc)162 uintptr_t SystemAllocPagesInternal(uintptr_t hint,
163                                    size_t length,
164                                    PageAccessibilityConfiguration accessibility,
165                                    PageTag page_tag,
166                                    int file_descriptor_for_shared_alloc) {
167 #if BUILDFLAG(IS_APPLE)
168   // Use a custom tag to make it easier to distinguish Partition Alloc regions
169   // in vmmap(1). Tags between 240-255 are supported.
170   PA_DCHECK(PageTag::kFirst <= page_tag);
171   PA_DCHECK(PageTag::kLast >= page_tag);
172   int fd = file_descriptor_for_shared_alloc == -1
173                ? VM_MAKE_TAG(static_cast<int>(page_tag))
174                : file_descriptor_for_shared_alloc;
175 #else
176   int fd = file_descriptor_for_shared_alloc;
177 #endif
178 
179   int access_flag = GetAccessFlags(accessibility);
180   int map_flags = MAP_ANONYMOUS | MAP_PRIVATE;
181 
182 #if BUILDFLAG(IS_APPLE)
183   // On macOS 10.14 and higher, executables that are code signed with the
184   // "runtime" option cannot execute writable memory by default. They can opt
185   // into this capability by specifying the "com.apple.security.cs.allow-jit"
186   // code signing entitlement and allocating the region with the MAP_JIT flag.
187   static const bool kUseMapJit = UseMapJit();
188   if (page_tag == PageTag::kV8 && kUseMapJit) {
189     map_flags |= MAP_JIT;
190   }
191 #endif
192 
193   void* ret = mmap(reinterpret_cast<void*>(hint), length, access_flag,
194                    map_flags, fd, 0);
195   if (ret == MAP_FAILED) {
196     s_allocPageErrorCode = errno;
197     ret = nullptr;
198   }
199 
200 #if BUILDFLAG(IS_ANDROID) || BUILDFLAG(IS_LINUX)
201 #if defined(PR_SET_VMA) && defined(PR_SET_VMA_ANON_NAME)
202   // On Android and Linux, anonymous mappings can have a name attached to them.
203   // This is useful for debugging, and double-checking memory attribution.
204   if (ret) {
205     // No error checking on purpose, testing only.
206     prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ret, length,
207           PageTagToName(page_tag));
208   }
209 #endif
210 #endif
211 
212   return reinterpret_cast<uintptr_t>(ret);
213 }
214 
TrySetSystemPagesAccessInternal(uintptr_t address,size_t length,PageAccessibilityConfiguration accessibility)215 bool TrySetSystemPagesAccessInternal(
216     uintptr_t address,
217     size_t length,
218     PageAccessibilityConfiguration accessibility) {
219 #if BUILDFLAG(ENABLE_PKEYS)
220   return 0 == PkeyMprotectIfEnabled(reinterpret_cast<void*>(address), length,
221                                     GetAccessFlags(accessibility),
222                                     accessibility.pkey);
223 #else
224   return 0 == PA_HANDLE_EINTR(mprotect(reinterpret_cast<void*>(address), length,
225                                        GetAccessFlags(accessibility)));
226 #endif
227 }
228 
SetSystemPagesAccessInternal(uintptr_t address,size_t length,PageAccessibilityConfiguration accessibility)229 void SetSystemPagesAccessInternal(
230     uintptr_t address,
231     size_t length,
232     PageAccessibilityConfiguration accessibility) {
233   int access_flags = GetAccessFlags(accessibility);
234 #if BUILDFLAG(ENABLE_PKEYS)
235   int ret =
236       PkeyMprotectIfEnabled(reinterpret_cast<void*>(address), length,
237                             GetAccessFlags(accessibility), accessibility.pkey);
238 #else
239   int ret = PA_HANDLE_EINTR(mprotect(reinterpret_cast<void*>(address), length,
240                                      GetAccessFlags(accessibility)));
241 #endif
242 
243   // On Linux, man mprotect(2) states that ENOMEM is returned when (1) internal
244   // kernel data structures cannot be allocated, (2) the address range is
245   // invalid, or (3) this would split an existing mapping in a way that would
246   // exceed the maximum number of allowed mappings.
247   //
248   // Neither are very likely, but we still get a lot of crashes here. This is
249   // because setrlimit(RLIMIT_DATA)'s limit is checked and enforced here, if the
250   // access flags match a "data" mapping, which in our case would be MAP_PRIVATE
251   // | MAP_ANONYMOUS, and PROT_WRITE. see the call to may_expand_vm() in
252   // mm/mprotect.c in the kernel for details.
253   //
254   // In this case, we are almost certainly bumping into the sandbox limit, mark
255   // the crash as OOM. See SandboxLinux::LimitAddressSpace() for details.
256   if (ret == -1 && errno == ENOMEM && (access_flags & PROT_WRITE)) {
257     OOM_CRASH(length);
258   }
259 
260   PA_PCHECK(0 == ret);
261 }
262 
FreePagesInternal(uintptr_t address,size_t length)263 void FreePagesInternal(uintptr_t address, size_t length) {
264   PA_PCHECK(0 == munmap(reinterpret_cast<void*>(address), length));
265 }
266 
TrimMappingInternal(uintptr_t base_address,size_t base_length,size_t trim_length,PageAccessibilityConfiguration accessibility,size_t pre_slack,size_t post_slack)267 uintptr_t TrimMappingInternal(uintptr_t base_address,
268                               size_t base_length,
269                               size_t trim_length,
270                               PageAccessibilityConfiguration accessibility,
271                               size_t pre_slack,
272                               size_t post_slack) {
273   uintptr_t ret = base_address;
274   // We can resize the allocation run. Release unneeded memory before and after
275   // the aligned range.
276   if (pre_slack) {
277     FreePages(base_address, pre_slack);
278     ret = base_address + pre_slack;
279   }
280   if (post_slack) {
281     FreePages(ret + trim_length, post_slack);
282   }
283   return ret;
284 }
285 
DecommitSystemPagesInternal(uintptr_t address,size_t length,PageAccessibilityDisposition accessibility_disposition)286 void DecommitSystemPagesInternal(
287     uintptr_t address,
288     size_t length,
289     PageAccessibilityDisposition accessibility_disposition) {
290   // In POSIX, there is no decommit concept. Discarding is an effective way of
291   // implementing the Windows semantics where the OS is allowed to not swap the
292   // pages in the region.
293   DiscardSystemPages(address, length);
294 
295   bool change_permissions =
296       accessibility_disposition == PageAccessibilityDisposition::kRequireUpdate;
297 #if BUILDFLAG(PA_DCHECK_IS_ON)
298   // This is not guaranteed, show that we're serious.
299   //
300   // More specifically, several callers have had issues with assuming that
301   // memory is zeroed, this would hopefully make these bugs more visible.  We
302   // don't memset() everything, because ranges can be very large, and doing it
303   // over the entire range could make Chrome unusable with
304   // BUILDFLAG(PA_DCHECK_IS_ON).
305   //
306   // Only do it when we are about to change the permissions, since we don't know
307   // the previous permissions, and cannot restore them.
308   if (!DecommittedMemoryIsAlwaysZeroed() && change_permissions) {
309     // Memory may not be writable.
310     size_t size = std::min(length, 2 * SystemPageSize());
311     void* ptr = reinterpret_cast<void*>(address);
312     PA_CHECK(mprotect(ptr, size, PROT_WRITE) == 0);
313     memset(ptr, 0xcc, size);
314   }
315 #endif
316 
317   // Make pages inaccessible, unless the caller requested to keep permissions.
318   //
319   // Note, there is a small window between these calls when the pages can be
320   // incorrectly touched and brought back to memory. Not ideal, but doing those
321   // operations in the opposite order resulted in PMF regression on Mac (see
322   // crbug.com/1153021).
323   if (change_permissions) {
324     SetSystemPagesAccess(address, length,
325                          PageAccessibilityConfiguration(
326                              PageAccessibilityConfiguration::kInaccessible));
327   }
328 }
329 
DecommitAndZeroSystemPagesInternal(uintptr_t address,size_t length)330 void DecommitAndZeroSystemPagesInternal(uintptr_t address, size_t length) {
331   // https://pubs.opengroup.org/onlinepubs/9699919799/functions/mmap.html: "If
332   // a MAP_FIXED request is successful, then any previous mappings [...] for
333   // those whole pages containing any part of the address range [pa,pa+len)
334   // shall be removed, as if by an appropriate call to munmap(), before the
335   // new mapping is established." As a consequence, the memory will be
336   // zero-initialized on next access.
337   void* ptr = reinterpret_cast<void*>(address);
338   void* ret = mmap(ptr, length, PROT_NONE,
339                    MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
340   PA_CHECK(ptr == ret);
341 }
342 
RecommitSystemPagesInternal(uintptr_t address,size_t length,PageAccessibilityConfiguration accessibility,PageAccessibilityDisposition accessibility_disposition)343 void RecommitSystemPagesInternal(
344     uintptr_t address,
345     size_t length,
346     PageAccessibilityConfiguration accessibility,
347     PageAccessibilityDisposition accessibility_disposition) {
348   // On POSIX systems, the caller needs to simply read the memory to recommit
349   // it. However, if decommit changed the permissions, recommit has to change
350   // them back.
351   if (accessibility_disposition ==
352       PageAccessibilityDisposition::kRequireUpdate) {
353     SetSystemPagesAccess(address, length, accessibility);
354   }
355 
356 #if BUILDFLAG(IS_APPLE)
357   // On macOS, to update accounting, we need to make another syscall. For more
358   // details, see https://crbug.com/823915.
359   madvise(reinterpret_cast<void*>(address), length, MADV_FREE_REUSE);
360 #endif
361 }
362 
TryRecommitSystemPagesInternal(uintptr_t address,size_t length,PageAccessibilityConfiguration accessibility,PageAccessibilityDisposition accessibility_disposition)363 bool TryRecommitSystemPagesInternal(
364     uintptr_t address,
365     size_t length,
366     PageAccessibilityConfiguration accessibility,
367     PageAccessibilityDisposition accessibility_disposition) {
368   // On POSIX systems, the caller needs to simply read the memory to recommit
369   // it. However, if decommit changed the permissions, recommit has to change
370   // them back.
371   if (accessibility_disposition ==
372       PageAccessibilityDisposition::kRequireUpdate) {
373     bool ok = TrySetSystemPagesAccess(address, length, accessibility);
374     if (!ok) {
375       return false;
376     }
377   }
378 
379 #if BUILDFLAG(IS_APPLE)
380   // On macOS, to update accounting, we need to make another syscall. For more
381   // details, see https://crbug.com/823915.
382   madvise(reinterpret_cast<void*>(address), length, MADV_FREE_REUSE);
383 #endif
384 
385   return true;
386 }
387 
DiscardSystemPagesInternal(uintptr_t address,size_t length)388 void DiscardSystemPagesInternal(uintptr_t address, size_t length) {
389   void* ptr = reinterpret_cast<void*>(address);
390 #if BUILDFLAG(IS_APPLE)
391   int ret = madvise(ptr, length, MADV_FREE_REUSABLE);
392   if (ret) {
393     // MADV_FREE_REUSABLE sometimes fails, so fall back to MADV_DONTNEED.
394     ret = madvise(ptr, length, MADV_DONTNEED);
395   }
396   PA_PCHECK(ret == 0);
397 #else   // BUILDFLAG(IS_APPLE)
398   // We have experimented with other flags, but with suboptimal results.
399   //
400   // MADV_FREE (Linux): Makes our memory measurements less predictable;
401   // performance benefits unclear.
402   //
403   // Therefore, we just do the simple thing: MADV_DONTNEED.
404   PA_PCHECK(0 == madvise(ptr, length, MADV_DONTNEED));
405 #endif  // BUILDFLAG(IS_APPLE)
406 }
407 
408 }  // namespace partition_alloc::internal
409 
410 #endif  // BASE_ALLOCATOR_PARTITION_ALLOCATOR_PAGE_ALLOCATOR_INTERNALS_POSIX_H_
411