1 // Copyright 2018 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef BASE_ALLOCATOR_PARTITION_ALLOCATOR_PAGE_ALLOCATOR_INTERNALS_POSIX_H_
6 #define BASE_ALLOCATOR_PARTITION_ALLOCATOR_PAGE_ALLOCATOR_INTERNALS_POSIX_H_
7
8 #include <algorithm>
9 #include <cerrno>
10 #include <cstdint>
11 #include <cstring>
12
13 #include <sys/mman.h>
14
15 #include "base/allocator/partition_allocator/oom.h"
16 #include "base/allocator/partition_allocator/page_allocator.h"
17 #include "base/allocator/partition_allocator/partition_alloc_base/debug/debugging_buildflags.h"
18 #include "base/allocator/partition_allocator/partition_alloc_base/posix/eintr_wrapper.h"
19 #include "base/allocator/partition_allocator/partition_alloc_check.h"
20 #include "base/allocator/partition_allocator/pkey.h"
21 #include "build/build_config.h"
22
23 #if BUILDFLAG(IS_APPLE)
24 #include "base/allocator/partition_allocator/partition_alloc_base/mac/foundation_util.h"
25 #if BUILDFLAG(IS_IOS)
26 #include "base/allocator/partition_allocator/partition_alloc_base/ios/ios_util.h"
27 #elif BUILDFLAG(IS_MAC)
28 #include "base/allocator/partition_allocator/partition_alloc_base/mac/mac_util.h"
29 #else
30 #error "Unknown platform"
31 #endif
32 #include "base/allocator/partition_allocator/partition_alloc_base/mac/scoped_cftyperef.h"
33
34 #include <Availability.h>
35 #include <Security/Security.h>
36 #include <mach/mach.h>
37 #endif
38 #if BUILDFLAG(IS_ANDROID) || BUILDFLAG(IS_LINUX)
39 #include <sys/prctl.h>
40 #endif
41 #if BUILDFLAG(IS_LINUX) || BUILDFLAG(IS_CHROMEOS)
42 #include <sys/resource.h>
43 #endif
44
45 #ifndef MAP_ANONYMOUS
46 #define MAP_ANONYMOUS MAP_ANON
47 #endif
48
49 #if BUILDFLAG(IS_MAC)
50
51 // SecTaskGetCodeSignStatus is marked as unavailable on macOS, although it’s
52 // available on iOS and other Apple operating systems. It is, in fact, present
53 // on the system since macOS 10.12.
54 #pragma clang diagnostic push
55 #pragma clang diagnostic ignored "-Wavailability"
56 uint32_t SecTaskGetCodeSignStatus(SecTaskRef task) API_AVAILABLE(macos(10.12));
57 #pragma clang diagnostic pop
58
59 #endif // BUILDFLAG(IS_MAC)
60
61 namespace partition_alloc::internal {
62
63 namespace {
64
65 #if BUILDFLAG(IS_ANDROID) || BUILDFLAG(IS_LINUX)
66 #if defined(PR_SET_VMA) && defined(PR_SET_VMA_ANON_NAME)
PageTagToName(PageTag tag)67 const char* PageTagToName(PageTag tag) {
68 // Important: All the names should be string literals. As per prctl.h in
69 // //third_party/android_ndk the kernel keeps a pointer to the name instead
70 // of copying it.
71 //
72 // Having the name in .rodata ensures that the pointer remains valid as
73 // long as the mapping is alive.
74 switch (tag) {
75 case PageTag::kBlinkGC:
76 return "blink_gc";
77 case PageTag::kPartitionAlloc:
78 return "partition_alloc";
79 case PageTag::kChromium:
80 return "chromium";
81 case PageTag::kV8:
82 return "v8";
83 default:
84 PA_DCHECK(false);
85 return "";
86 }
87 }
88 #endif
89 #endif // BUILDFLAG(IS_ANDROID)
90
91 #if BUILDFLAG(IS_MAC)
92 // Tests whether the version of macOS supports the MAP_JIT flag and if the
93 // current process is signed with the hardened runtime and the allow-jit
94 // entitlement, returning whether MAP_JIT should be used to allocate regions
95 // that will contain JIT-compiled executable code.
UseMapJit()96 bool UseMapJit() {
97 if (!base::mac::IsAtLeastOS10_14()) {
98 // MAP_JIT existed before macOS 10.14, but had somewhat different semantics.
99 // Only one MAP_JIT region was permitted per process, but calling code here
100 // will very likely require more than one such region. Since MAP_JIT is not
101 // strictly necessary to write code to a region and then execute it on these
102 // older OSes, don’t use it at all.
103 return false;
104 }
105
106 // Until determining that the hardened runtime is enabled, early returns will
107 // return true, so that MAP_JIT will be used. This is important on arm64,
108 // which only allows pages to be simultaneously writable and executable when
109 // in a region allocated with MAP_JIT, regardless of code signing options. On
110 // arm64, an attempt to set a non-MAP_JIT page as simultaneously writable and
111 // executable fails with EPERM. Although this is not enforced on x86_64,
112 // MAP_JIT is harmless in that case.
113
114 base::ScopedCFTypeRef<SecTaskRef> task(
115 SecTaskCreateFromSelf(kCFAllocatorDefault));
116 if (!task) {
117 return true;
118 }
119
120 uint32_t flags = SecTaskGetCodeSignStatus(task);
121 if (!(flags & kSecCodeSignatureRuntime)) {
122 // The hardened runtime is not enabled. Note that kSecCodeSignatureRuntime
123 // == CS_RUNTIME.
124 return true;
125 }
126
127 // The hardened runtime is enabled. From this point on, early returns must
128 // return false, indicating that MAP_JIT is not to be used. It’s an error
129 // (EINVAL) to use MAP_JIT with the hardened runtime unless the JIT
130 // entitlement is specified.
131
132 base::ScopedCFTypeRef<CFTypeRef> jit_entitlement(
133 SecTaskCopyValueForEntitlement(
134 task.get(), CFSTR("com.apple.security.cs.allow-jit"), nullptr));
135 if (!jit_entitlement) {
136 return false;
137 }
138
139 return base::mac::CFCast<CFBooleanRef>(jit_entitlement.get()) ==
140 kCFBooleanTrue;
141 }
142 #elif BUILDFLAG(IS_IOS)
// Reports whether MAP_JIT should be requested on iOS.
bool UseMapJit() {
#if !TARGET_IPHONE_SIMULATOR
  // TODO(https://crbug.com/1413818): Fill this out when the API it is
  // available.
  return false;
#else
  // The simulator supports MAP_JIT unconditionally, so always enable it there.
  return true;
#endif  // !TARGET_IPHONE_SIMULATOR
}
153 #endif // BUILDFLAG(IS_IOS)
154 } // namespace
155
156 // |mmap| uses a nearby address if the hint address is blocked.
157 constexpr bool kHintIsAdvisory = true;
158 std::atomic<int32_t> s_allocPageErrorCode{0};
159
160 int GetAccessFlags(PageAccessibilityConfiguration accessibility);
161
SystemAllocPagesInternal(uintptr_t hint,size_t length,PageAccessibilityConfiguration accessibility,PageTag page_tag,int file_descriptor_for_shared_alloc)162 uintptr_t SystemAllocPagesInternal(uintptr_t hint,
163 size_t length,
164 PageAccessibilityConfiguration accessibility,
165 PageTag page_tag,
166 int file_descriptor_for_shared_alloc) {
167 #if BUILDFLAG(IS_APPLE)
168 // Use a custom tag to make it easier to distinguish Partition Alloc regions
169 // in vmmap(1). Tags between 240-255 are supported.
170 PA_DCHECK(PageTag::kFirst <= page_tag);
171 PA_DCHECK(PageTag::kLast >= page_tag);
172 int fd = file_descriptor_for_shared_alloc == -1
173 ? VM_MAKE_TAG(static_cast<int>(page_tag))
174 : file_descriptor_for_shared_alloc;
175 #else
176 int fd = file_descriptor_for_shared_alloc;
177 #endif
178
179 int access_flag = GetAccessFlags(accessibility);
180 int map_flags = MAP_ANONYMOUS | MAP_PRIVATE;
181
182 #if BUILDFLAG(IS_APPLE)
183 // On macOS 10.14 and higher, executables that are code signed with the
184 // "runtime" option cannot execute writable memory by default. They can opt
185 // into this capability by specifying the "com.apple.security.cs.allow-jit"
186 // code signing entitlement and allocating the region with the MAP_JIT flag.
187 static const bool kUseMapJit = UseMapJit();
188 if (page_tag == PageTag::kV8 && kUseMapJit) {
189 map_flags |= MAP_JIT;
190 }
191 #endif
192
193 void* ret = mmap(reinterpret_cast<void*>(hint), length, access_flag,
194 map_flags, fd, 0);
195 if (ret == MAP_FAILED) {
196 s_allocPageErrorCode = errno;
197 ret = nullptr;
198 }
199
200 #if BUILDFLAG(IS_ANDROID) || BUILDFLAG(IS_LINUX)
201 #if defined(PR_SET_VMA) && defined(PR_SET_VMA_ANON_NAME)
202 // On Android and Linux, anonymous mappings can have a name attached to them.
203 // This is useful for debugging, and double-checking memory attribution.
204 if (ret) {
205 // No error checking on purpose, testing only.
206 prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ret, length,
207 PageTagToName(page_tag));
208 }
209 #endif
210 #endif
211
212 return reinterpret_cast<uintptr_t>(ret);
213 }
214
TrySetSystemPagesAccessInternal(uintptr_t address,size_t length,PageAccessibilityConfiguration accessibility)215 bool TrySetSystemPagesAccessInternal(
216 uintptr_t address,
217 size_t length,
218 PageAccessibilityConfiguration accessibility) {
219 #if BUILDFLAG(ENABLE_PKEYS)
220 return 0 == PkeyMprotectIfEnabled(reinterpret_cast<void*>(address), length,
221 GetAccessFlags(accessibility),
222 accessibility.pkey);
223 #else
224 return 0 == PA_HANDLE_EINTR(mprotect(reinterpret_cast<void*>(address), length,
225 GetAccessFlags(accessibility)));
226 #endif
227 }
228
SetSystemPagesAccessInternal(uintptr_t address,size_t length,PageAccessibilityConfiguration accessibility)229 void SetSystemPagesAccessInternal(
230 uintptr_t address,
231 size_t length,
232 PageAccessibilityConfiguration accessibility) {
233 int access_flags = GetAccessFlags(accessibility);
234 #if BUILDFLAG(ENABLE_PKEYS)
235 int ret =
236 PkeyMprotectIfEnabled(reinterpret_cast<void*>(address), length,
237 GetAccessFlags(accessibility), accessibility.pkey);
238 #else
239 int ret = PA_HANDLE_EINTR(mprotect(reinterpret_cast<void*>(address), length,
240 GetAccessFlags(accessibility)));
241 #endif
242
243 // On Linux, man mprotect(2) states that ENOMEM is returned when (1) internal
244 // kernel data structures cannot be allocated, (2) the address range is
245 // invalid, or (3) this would split an existing mapping in a way that would
246 // exceed the maximum number of allowed mappings.
247 //
248 // Neither are very likely, but we still get a lot of crashes here. This is
249 // because setrlimit(RLIMIT_DATA)'s limit is checked and enforced here, if the
250 // access flags match a "data" mapping, which in our case would be MAP_PRIVATE
251 // | MAP_ANONYMOUS, and PROT_WRITE. see the call to may_expand_vm() in
252 // mm/mprotect.c in the kernel for details.
253 //
254 // In this case, we are almost certainly bumping into the sandbox limit, mark
255 // the crash as OOM. See SandboxLinux::LimitAddressSpace() for details.
256 if (ret == -1 && errno == ENOMEM && (access_flags & PROT_WRITE)) {
257 OOM_CRASH(length);
258 }
259
260 PA_PCHECK(0 == ret);
261 }
262
FreePagesInternal(uintptr_t address,size_t length)263 void FreePagesInternal(uintptr_t address, size_t length) {
264 PA_PCHECK(0 == munmap(reinterpret_cast<void*>(address), length));
265 }
266
TrimMappingInternal(uintptr_t base_address,size_t base_length,size_t trim_length,PageAccessibilityConfiguration accessibility,size_t pre_slack,size_t post_slack)267 uintptr_t TrimMappingInternal(uintptr_t base_address,
268 size_t base_length,
269 size_t trim_length,
270 PageAccessibilityConfiguration accessibility,
271 size_t pre_slack,
272 size_t post_slack) {
273 uintptr_t ret = base_address;
274 // We can resize the allocation run. Release unneeded memory before and after
275 // the aligned range.
276 if (pre_slack) {
277 FreePages(base_address, pre_slack);
278 ret = base_address + pre_slack;
279 }
280 if (post_slack) {
281 FreePages(ret + trim_length, post_slack);
282 }
283 return ret;
284 }
285
DecommitSystemPagesInternal(uintptr_t address,size_t length,PageAccessibilityDisposition accessibility_disposition)286 void DecommitSystemPagesInternal(
287 uintptr_t address,
288 size_t length,
289 PageAccessibilityDisposition accessibility_disposition) {
290 // In POSIX, there is no decommit concept. Discarding is an effective way of
291 // implementing the Windows semantics where the OS is allowed to not swap the
292 // pages in the region.
293 DiscardSystemPages(address, length);
294
295 bool change_permissions =
296 accessibility_disposition == PageAccessibilityDisposition::kRequireUpdate;
297 #if BUILDFLAG(PA_DCHECK_IS_ON)
298 // This is not guaranteed, show that we're serious.
299 //
300 // More specifically, several callers have had issues with assuming that
301 // memory is zeroed, this would hopefully make these bugs more visible. We
302 // don't memset() everything, because ranges can be very large, and doing it
303 // over the entire range could make Chrome unusable with
304 // BUILDFLAG(PA_DCHECK_IS_ON).
305 //
306 // Only do it when we are about to change the permissions, since we don't know
307 // the previous permissions, and cannot restore them.
308 if (!DecommittedMemoryIsAlwaysZeroed() && change_permissions) {
309 // Memory may not be writable.
310 size_t size = std::min(length, 2 * SystemPageSize());
311 void* ptr = reinterpret_cast<void*>(address);
312 PA_CHECK(mprotect(ptr, size, PROT_WRITE) == 0);
313 memset(ptr, 0xcc, size);
314 }
315 #endif
316
317 // Make pages inaccessible, unless the caller requested to keep permissions.
318 //
319 // Note, there is a small window between these calls when the pages can be
320 // incorrectly touched and brought back to memory. Not ideal, but doing those
321 // operations in the opposite order resulted in PMF regression on Mac (see
322 // crbug.com/1153021).
323 if (change_permissions) {
324 SetSystemPagesAccess(address, length,
325 PageAccessibilityConfiguration(
326 PageAccessibilityConfiguration::kInaccessible));
327 }
328 }
329
DecommitAndZeroSystemPagesInternal(uintptr_t address,size_t length)330 void DecommitAndZeroSystemPagesInternal(uintptr_t address, size_t length) {
331 // https://pubs.opengroup.org/onlinepubs/9699919799/functions/mmap.html: "If
332 // a MAP_FIXED request is successful, then any previous mappings [...] for
333 // those whole pages containing any part of the address range [pa,pa+len)
334 // shall be removed, as if by an appropriate call to munmap(), before the
335 // new mapping is established." As a consequence, the memory will be
336 // zero-initialized on next access.
337 void* ptr = reinterpret_cast<void*>(address);
338 void* ret = mmap(ptr, length, PROT_NONE,
339 MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
340 PA_CHECK(ptr == ret);
341 }
342
RecommitSystemPagesInternal(uintptr_t address,size_t length,PageAccessibilityConfiguration accessibility,PageAccessibilityDisposition accessibility_disposition)343 void RecommitSystemPagesInternal(
344 uintptr_t address,
345 size_t length,
346 PageAccessibilityConfiguration accessibility,
347 PageAccessibilityDisposition accessibility_disposition) {
348 // On POSIX systems, the caller needs to simply read the memory to recommit
349 // it. However, if decommit changed the permissions, recommit has to change
350 // them back.
351 if (accessibility_disposition ==
352 PageAccessibilityDisposition::kRequireUpdate) {
353 SetSystemPagesAccess(address, length, accessibility);
354 }
355
356 #if BUILDFLAG(IS_APPLE)
357 // On macOS, to update accounting, we need to make another syscall. For more
358 // details, see https://crbug.com/823915.
359 madvise(reinterpret_cast<void*>(address), length, MADV_FREE_REUSE);
360 #endif
361 }
362
TryRecommitSystemPagesInternal(uintptr_t address,size_t length,PageAccessibilityConfiguration accessibility,PageAccessibilityDisposition accessibility_disposition)363 bool TryRecommitSystemPagesInternal(
364 uintptr_t address,
365 size_t length,
366 PageAccessibilityConfiguration accessibility,
367 PageAccessibilityDisposition accessibility_disposition) {
368 // On POSIX systems, the caller needs to simply read the memory to recommit
369 // it. However, if decommit changed the permissions, recommit has to change
370 // them back.
371 if (accessibility_disposition ==
372 PageAccessibilityDisposition::kRequireUpdate) {
373 bool ok = TrySetSystemPagesAccess(address, length, accessibility);
374 if (!ok) {
375 return false;
376 }
377 }
378
379 #if BUILDFLAG(IS_APPLE)
380 // On macOS, to update accounting, we need to make another syscall. For more
381 // details, see https://crbug.com/823915.
382 madvise(reinterpret_cast<void*>(address), length, MADV_FREE_REUSE);
383 #endif
384
385 return true;
386 }
387
DiscardSystemPagesInternal(uintptr_t address,size_t length)388 void DiscardSystemPagesInternal(uintptr_t address, size_t length) {
389 void* ptr = reinterpret_cast<void*>(address);
390 #if BUILDFLAG(IS_APPLE)
391 int ret = madvise(ptr, length, MADV_FREE_REUSABLE);
392 if (ret) {
393 // MADV_FREE_REUSABLE sometimes fails, so fall back to MADV_DONTNEED.
394 ret = madvise(ptr, length, MADV_DONTNEED);
395 }
396 PA_PCHECK(ret == 0);
397 #else // BUILDFLAG(IS_APPLE)
398 // We have experimented with other flags, but with suboptimal results.
399 //
400 // MADV_FREE (Linux): Makes our memory measurements less predictable;
401 // performance benefits unclear.
402 //
403 // Therefore, we just do the simple thing: MADV_DONTNEED.
404 PA_PCHECK(0 == madvise(ptr, length, MADV_DONTNEED));
405 #endif // BUILDFLAG(IS_APPLE)
406 }
407
408 } // namespace partition_alloc::internal
409
410 #endif // BASE_ALLOCATOR_PARTITION_ALLOCATOR_PAGE_ALLOCATOR_INTERNALS_POSIX_H_
411