1 // Copyright (C) 2018 Intel Corporation
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"),
5 // to deal in the Software without restriction, including without limitation
6 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 // and/or sell copies of the Software, and to permit persons to whom
8 // the Software is furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included
11 // in all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
14 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
17 // OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
19 // OR OTHER DEALINGS IN THE SOFTWARE.
20 //
21 // SPDX-License-Identifier: MIT
22
23 // The functions in this file map the .text section of Node.js into 2MB pages.
24 // They perform the following steps:
25 //
26 // 1: Find the Node.js binary's `.text` section in memory. This is done below in
27 // `FindNodeTextRegion`. It is accomplished in a platform-specific way. On
28 // Linux and FreeBSD, `dl_iterate_phdr(3)` is used. When the region is found,
29 // it is "trimmed" as follows:
30 // * Modify the start to point to the very beginning of the Node.js `.text`
31 // section (from symbol `__node_text_start` declared in node_text_start.S).
32 // * Possibly modify the end to account for the `lpstub` section which
33 // contains `MoveTextRegionToLargePages`, the function we do not wish to
34 // move (see below).
35 // * Align the address of the start to its nearest higher large page
36 // boundary.
37 // * Align the address of the end to its nearest lower large page boundary.
38 //
39 // 2: Move the text region to large pages. This is done below in
40 // `MoveTextRegionToLargePages`. We need to be very careful:
41 // a) `MoveTextRegionToLargePages` itself should not be moved.
42 // We use gcc attributes
43 // (__section__) to put it outside the `.text` section,
44 // (__aligned__) to align it at the 2M boundary, and
//        (__noinline__) to not inline this function.
46 // b) `MoveTextRegionToLargePages` should not call any function(s) that might
47 // be moved.
48 // To move the .text section, perform the following steps:
49 // * Map a new, temporary area and copy the original code there.
50 // * Use mmap using the start address with MAP_FIXED so we get exactly the
51 // same virtual address (except on OSX). On platforms other than Linux,
52 // use mmap flags to request hugepages.
53 // * On Linux use madvise with MADV_HUGEPAGE to use anonymous 2MB pages.
54 // * If successful copy the code to the newly mapped area and protect it to
55 // be readable and executable.
56 // * Unmap the temporary area.
57
58 #include "node_large_page.h"
59
60 #include <cerrno> // NOLINT(build/include)
61
62 // Besides returning ENOTSUP at runtime we do nothing if this define is missing.
63 #if defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES
64 #include "debug_utils-inl.h"
65
66 #if defined(__linux__) || defined(__FreeBSD__)
67 #if defined(__linux__)
68 #ifndef _GNU_SOURCE
69 #define _GNU_SOURCE
70 #endif // ifndef _GNU_SOURCE
71 #elif defined(__FreeBSD__)
72 #include "uv.h" // uv_exepath
73 #endif // defined(__linux__)
74 #include <link.h>
75 #endif // defined(__linux__) || defined(__FreeBSD__)
76
77 #include <sys/types.h>
78 #include <sys/mman.h>
79 #if defined(__FreeBSD__)
80 #include <sys/sysctl.h>
81 #elif defined(__APPLE__)
82 #include <mach/vm_map.h>
83 #endif
84
85 #include <climits> // PATH_MAX
86 #include <cstdlib>
87 #include <cstdint>
88 #include <cstring>
89 #include <string>
90 #include <fstream>
91
92 #if defined(__linux__) || defined(__FreeBSD__)
93 extern "C" {
94 // This symbol must be declared weak because this file becomes part of all
95 // Node.js targets (like node_mksnapshot, node_mkcodecache, and cctest) and
96 // those files do not supply the symbol.
97 extern char __attribute__((weak)) __node_text_start;
98 extern char __start_lpstub;
99 } // extern "C"
100 #endif // defined(__linux__) || defined(__FreeBSD__)
101
102 #endif // defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES
103 namespace node {
104 #if defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES
105
106 namespace {
107
// Describes the candidate address range of the `.text` section to remap.
struct text_region {
  char* from = nullptr;            // inclusive start, large-page-aligned
  char* to = nullptr;              // exclusive end, large-page-aligned
  bool found_text_region = false;  // true iff from/to describe a usable range
};
113
// Huge/large page size used throughout this file: 2MB.
static const size_t hps = 2L * 1024 * 1024;
115
116 template <typename... Args>
Debug(std::string fmt,Args &&...args)117 inline void Debug(std::string fmt, Args&&... args) {
118 node::Debug(&per_process::enabled_debug_list,
119 DebugCategory::HUGEPAGES,
120 (std::string("Hugepages info: ") + fmt).c_str(),
121 std::forward<Args>(args)...);
122 }
123
PrintWarning(const char * warn)124 inline void PrintWarning(const char* warn) {
125 fprintf(stderr, "Hugepages WARNING: %s\n", warn);
126 }
127
PrintSystemError(int error)128 inline void PrintSystemError(int error) {
129 PrintWarning(strerror(error));
130 }
131
hugepage_align_up(uintptr_t addr)132 inline uintptr_t hugepage_align_up(uintptr_t addr) {
133 return (((addr) + (hps) - 1) & ~((hps) - 1));
134 }
135
hugepage_align_down(uintptr_t addr)136 inline uintptr_t hugepage_align_down(uintptr_t addr) {
137 return ((addr) & ~((hps) - 1));
138 }
139
140 #if defined(__linux__) || defined(__FreeBSD__)
141 #if defined(__FreeBSD__)
142 #ifndef ElfW
143 #define ElfW(name) Elf_##name
144 #endif // ifndef ElfW
145 #endif // defined(__FreeBSD__)
146
// In/out parameter block passed to FindMapping via dl_iterate_phdr(3).
struct dl_iterate_params {
  uintptr_t start = 0;  // out: start of the matching executable PT_LOAD
  uintptr_t end = 0;    // out: end of that segment
  // Address used to identify the segment holding the Node.js `.text`
  // section. `__node_text_start` is weak, so this may be 0 in targets that
  // do not supply the symbol (see the extern "C" block above).
  uintptr_t reference_sym = reinterpret_cast<uintptr_t>(&__node_text_start);
  std::string exename;  // in: name to match against dlpi_name (FreeBSD only)
};
153
FindMapping(struct dl_phdr_info * info,size_t,void * data)154 int FindMapping(struct dl_phdr_info* info, size_t, void* data) {
155 auto dl_params = static_cast<dl_iterate_params*>(data);
156 if (dl_params->exename == std::string(info->dlpi_name)) {
157 for (int idx = 0; idx < info->dlpi_phnum; idx++) {
158 const ElfW(Phdr)* phdr = &info->dlpi_phdr[idx];
159 if (phdr->p_type == PT_LOAD && (phdr->p_flags & PF_X)) {
160 uintptr_t start = info->dlpi_addr + phdr->p_vaddr;
161 uintptr_t end = start + phdr->p_memsz;
162
163 if (dl_params->reference_sym >= start &&
164 dl_params->reference_sym <= end) {
165 dl_params->start = start;
166 dl_params->end = end;
167 return 1;
168 }
169 }
170 }
171 }
172 return 0;
173 }
174 #endif // defined(__linux__) || defined(__FreeBSD__)
175
FindNodeTextRegion()176 struct text_region FindNodeTextRegion() {
177 struct text_region nregion;
178 #if defined(__linux__) || defined(__FreeBSD__)
179 dl_iterate_params dl_params;
180 uintptr_t lpstub_start = reinterpret_cast<uintptr_t>(&__start_lpstub);
181
182 #if defined(__FreeBSD__)
183 // On FreeBSD we need the name of the binary, because `dl_iterate_phdr` does
184 // not pass in an empty string as the `dlpi_name` of the binary but rather its
185 // absolute path.
186 {
187 char selfexe[PATH_MAX];
188 size_t count = sizeof(selfexe);
189 if (uv_exepath(selfexe, &count))
190 return nregion;
191 dl_params.exename = std::string(selfexe, count);
192 }
193 #endif // defined(__FreeBSD__)
194
195 if (dl_iterate_phdr(FindMapping, &dl_params) == 1) {
196 Debug("start: %p - sym: %p - end: %p\n",
197 reinterpret_cast<void*>(dl_params.start),
198 reinterpret_cast<void*>(dl_params.reference_sym),
199 reinterpret_cast<void*>(dl_params.end));
200
201 dl_params.start = dl_params.reference_sym;
202 if (lpstub_start > dl_params.start && lpstub_start <= dl_params.end) {
203 Debug("Trimming end for lpstub: %p\n",
204 reinterpret_cast<void*>(lpstub_start));
205 dl_params.end = lpstub_start;
206 }
207
208 if (dl_params.start < dl_params.end) {
209 char* from = reinterpret_cast<char*>(hugepage_align_up(dl_params.start));
210 char* to = reinterpret_cast<char*>(hugepage_align_down(dl_params.end));
211 Debug("Aligned range is %p - %p\n", from, to);
212 if (from < to) {
213 size_t pagecount = (to - from) / hps;
214 if (pagecount > 0) {
215 nregion.found_text_region = true;
216 nregion.from = from;
217 nregion.to = to;
218 }
219 }
220 }
221 }
222 #elif defined(__APPLE__)
223 struct vm_region_submap_info_64 map;
224 mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64;
225 vm_address_t addr = 0UL;
226 vm_size_t size = 0;
227 natural_t depth = 1;
228
229 while (true) {
230 if (vm_region_recurse_64(mach_task_self(), &addr, &size, &depth,
231 reinterpret_cast<vm_region_info_64_t>(&map),
232 &count) != KERN_SUCCESS) {
233 break;
234 }
235
236 if (map.is_submap) {
237 depth++;
238 } else {
239 char* start = reinterpret_cast<char*>(hugepage_align_up(addr));
240 char* end = reinterpret_cast<char*>(hugepage_align_down(addr+size));
241
242 if (end > start && (map.protection & VM_PROT_READ) != 0 &&
243 (map.protection & VM_PROT_EXECUTE) != 0) {
244 nregion.found_text_region = true;
245 nregion.from = start;
246 nregion.to = end;
247 break;
248 }
249
250 addr += size;
251 size = 0;
252 }
253 }
254 #endif
255 Debug("Found %d huge pages\n", (nregion.to - nregion.from) / hps);
256 return nregion;
257 }
258
259 #if defined(__linux__)
IsTransparentHugePagesEnabled()260 bool IsTransparentHugePagesEnabled() {
261 // File format reference:
262 // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/huge_memory.c?id=13391c60da3308ed9980de0168f74cce6c62ac1d#n163
263 const char* filename = "/sys/kernel/mm/transparent_hugepage/enabled";
264 std::ifstream config_stream(filename, std::ios::in);
265 if (!config_stream.good()) {
266 PrintWarning("could not open /sys/kernel/mm/transparent_hugepage/enabled");
267 return false;
268 }
269
270 std::string token;
271 config_stream >> token;
272 if ("[always]" == token) return true;
273 config_stream >> token;
274 if ("[madvise]" == token) return true;
275 return false;
276 }
277 #elif defined(__FreeBSD__)
IsSuperPagesEnabled()278 bool IsSuperPagesEnabled() {
279 // It is enabled by default on amd64.
280 unsigned int super_pages = 0;
281 size_t super_pages_length = sizeof(super_pages);
282 return sysctlbyname("vm.pmap.pg_ps_enabled",
283 &super_pages,
284 &super_pages_length,
285 nullptr,
286 0) != -1 &&
287 super_pages >= 1;
288 }
289 #endif
290
// RAII owner of an mmap(2)ed region: the destructor munmap(2)s it unless
// ownership was released via Reset() or the mapping failed.
// Functions in this class must always be inlined because they must end up in
// the `lpstub` section rather than the `.text` section.
class MemoryMapPointer {
 public:
  FORCE_INLINE explicit MemoryMapPointer() {}
  FORCE_INLINE bool operator==(void* rhs) const { return mem_ == rhs; }
  // Raw pointer to the mapping; MAP_FAILED if the last mmap failed.
  FORCE_INLINE void* mem() const { return mem_; }
  // Non-copyable and non-movable: each mapping has exactly one owner.
  MemoryMapPointer(const MemoryMapPointer&) = delete;
  MemoryMapPointer(MemoryMapPointer&&) = delete;
  void operator= (const MemoryMapPointer&) = delete;
  void operator= (const MemoryMapPointer&&) = delete;
  // Map `size` bytes via mmap(2) and take ownership of the result.
  // Does not unmap any previously held mapping; callers Reset() at most once
  // per mapping in this file.
  FORCE_INLINE void Reset(void* start,
                          size_t size,
                          int prot,
                          int flags,
                          int fd = -1,
                          size_t offset = 0) {
    mem_ = mmap(start, size, prot, flags, fd, offset);
    size_ = size;
  }
  // Release ownership WITHOUT unmapping — used when the mapping must outlive
  // this object (i.e. the remapped text region itself).
  FORCE_INLINE void Reset() {
    mem_ = nullptr;
    size_ = 0;
  }
  FORCE_INLINE ~MemoryMapPointer() {
    if (mem_ == nullptr) return;       // ownership was released
    if (mem_ == MAP_FAILED) return;    // nothing was ever mapped
    if (munmap(mem_, size_) == 0) return;
    PrintSystemError(errno);
  }

 private:
  size_t size_ = 0;
  void* mem_ = nullptr;
};
326
327 } // End of anonymous namespace
328
// Re-map the already-executing code in region `r` onto anonymous 2MB pages:
// back the code up to a temporary mapping, remap the original range, copy
// the code back, and re-protect it r-x. Returns 0 on success, -1 on failure
// (after printing the system error).
// This function must live outside `.text` (in `lpstub`), be 2MB-aligned,
// never be inlined, and must not call anything that might itself be moved.
int
#if !defined(__APPLE__)
__attribute__((__section__("lpstub")))
#else
__attribute__((__section__("__TEXT,__lpstub")))
#endif
__attribute__((__aligned__(hps)))
__attribute__((__noinline__))
MoveTextRegionToLargePages(const text_region& r) {
  MemoryMapPointer nmem;
  MemoryMapPointer tmem;
  void* start = r.from;
  size_t size = r.to - r.from;

  // Allocate a temporary region and back up the code we will re-map.
  nmem.Reset(nullptr, size,
             PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS);
  if (nmem.mem() == MAP_FAILED) goto fail;
  memcpy(nmem.mem(), r.from, size);

#if defined(__linux__)
  // We already know the original page is r-xp
  // (PROT_READ, PROT_EXEC, MAP_PRIVATE)
  // We want PROT_WRITE because we are writing into it.
  // We want it at the fixed address and we use MAP_FIXED.
  tmem.Reset(start, size,
             PROT_READ | PROT_WRITE | PROT_EXEC,
             MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED);
  if (tmem.mem() == MAP_FAILED) goto fail;
  // MADV_HUGEPAGE is spelled numerically so the build does not depend on
  // the constant being present in the toolchain's headers.
  if (madvise(tmem.mem(), size, 14 /* MADV_HUGEPAGE */) == -1) goto fail;
  memcpy(start, nmem.mem(), size);
#elif defined(__FreeBSD__)
  tmem.Reset(start, size,
             PROT_READ | PROT_WRITE | PROT_EXEC,
             MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED |
             MAP_ALIGNED_SUPER);
  if (tmem.mem() == MAP_FAILED) goto fail;
  memcpy(start, nmem.mem(), size);
#elif defined(__APPLE__)
  // There is not enough room to reserve the mapping at the region's own
  // address, so we settle for passing it as a hint without forcing the new
  // address to be near it (no MAP_FIXED).
  // We explicitly request all permissions since we plan to write into it.
  // VM_FLAGS_SUPERPAGE_SIZE_2MB is passed through the fd argument, per the
  // macOS superpage mmap convention.
  tmem.Reset(start, size,
             PROT_READ | PROT_WRITE | PROT_EXEC,
             MAP_PRIVATE | MAP_ANONYMOUS,
             VM_FLAGS_SUPERPAGE_SIZE_2MB);
  if (tmem.mem() == MAP_FAILED) goto fail;
  memcpy(tmem.mem(), nmem.mem(), size);
  if (mprotect(start, size, PROT_READ | PROT_WRITE | PROT_EXEC) == -1)
    goto fail;
  memcpy(start, tmem.mem(), size);
#endif

  // Drop write permission now that the code has been copied back.
  if (mprotect(start, size, PROT_READ | PROT_EXEC) == -1) goto fail;

  // We need not `munmap(tmem, size)` on success.
  tmem.Reset();
  return 0;
fail:
  PrintSystemError(errno);
  return -1;
}
393 #endif // defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES
394
// This is the primary API called from main.
// Attempts to remap the Node.js `.text` section onto 2MB pages.
// Returns 0 on success; ENOTSUP when the feature was compiled out, EACCES
// when the OS does not have large pages enabled, ENOENT when no suitable
// text region was found, and -1 when the remapping itself failed. Use
// LargePagesError() to translate the status into a message.
int MapStaticCodeToLargePages() {
#if defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES
  bool have_thp = false;
#if defined(__linux__)
  have_thp = IsTransparentHugePagesEnabled();
#elif defined(__FreeBSD__)
  have_thp = IsSuperPagesEnabled();
#elif defined(__APPLE__)
  // pse-36 flag is present in recent mac x64 products.
  have_thp = true;
#endif
  if (!have_thp)
    return EACCES;

  struct text_region r = FindNodeTextRegion();
  if (r.found_text_region == false)
    return ENOENT;

  return MoveTextRegionToLargePages(r);
#else
  return ENOTSUP;
#endif
}
419
LargePagesError(int status)420 const char* LargePagesError(int status) {
421 switch (status) {
422 case ENOTSUP:
423 return "Mapping to large pages is not supported.";
424
425 case EACCES:
426 return "Large pages are not enabled.";
427
428 case ENOENT:
429 return "failed to find text region";
430
431 case -1:
432 return "Mapping code to large pages failed. Reverting to default page "
433 "size.";
434
435 case 0:
436 return "OK";
437
438 default:
439 return "Unknown error";
440 }
441 }
442
443 } // namespace node
444