• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2018 Intel Corporation
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"),
5 // to deal in the Software without restriction, including without limitation
6 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 // and/or sell copies of the Software, and to permit persons to whom
8 // the Software is furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included
11 // in all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
14 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
16 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
17 // OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
19 // OR OTHER DEALINGS IN THE SOFTWARE.
20 //
21 // SPDX-License-Identifier: MIT
22 
23 // The functions in this file map the .text section of Node.js into 2MB pages.
24 // They perform the following steps:
25 //
26 // 1: Find the Node.js binary's `.text` section in memory. This is done below in
27 //    `FindNodeTextRegion`. It is accomplished in a platform-specific way. On
28 //    Linux and FreeBSD, `dl_iterate_phdr(3)` is used. When the region is found,
29 //    it is "trimmed" as follows:
30 //    * Modify the start to point to the very beginning of the Node.js `.text`
31 //      section (from symbol `__node_text_start` declared in node_text_start.S).
32 //    * Possibly modify the end to account for the `lpstub` section which
33 //      contains `MoveTextRegionToLargePages`, the function we do not wish to
34 //      move (see below).
35 //    * Align the address of the start to its nearest higher large page
36 //      boundary.
37 //    * Align the address of the end to its nearest lower large page boundary.
38 //
39 // 2: Move the text region to large pages. This is done below in
40 //    `MoveTextRegionToLargePages`. We need to be very careful:
41 //    a) `MoveTextRegionToLargePages` itself should not be moved.
42 //       We use gcc attributes
43 //       (__section__) to put it outside the `.text` section,
44 //       (__aligned__) to align it at the 2M boundary, and
45 //       (__noinline__) to prevent this function from being inlined.
46 //    b) `MoveTextRegionToLargePages` should not call any function(s) that might
47 //       be moved.
48 //    To move the .text section, perform the following steps:
49 //      * Map a new, temporary area and copy the original code there.
50 //      * Use mmap using the start address with MAP_FIXED so we get exactly the
51 //        same virtual address (except on OSX). On platforms other than Linux,
52 //        use mmap flags to request hugepages.
53 //      * On Linux use madvise with MADV_HUGEPAGE to use anonymous 2MB pages.
54 //      * If successful copy the code to the newly mapped area and protect it to
55 //        be readable and executable.
56 //      * Unmap the temporary area.
57 
58 #include "node_large_page.h"
59 
60 #include <cerrno>   // NOLINT(build/include)
61 
62 // Besides returning ENOTSUP at runtime we do nothing if this define is missing.
63 #if defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES
64 #include "debug_utils-inl.h"
65 
66 #if defined(__linux__) || defined(__FreeBSD__)
67 #if defined(__linux__)
68 #ifndef _GNU_SOURCE
69 #define _GNU_SOURCE
70 #endif  // ifndef _GNU_SOURCE
71 #include <sys/prctl.h>
72 #if !defined(PR_SET_VMA)
73 #define PR_SET_VMA 0x53564d41
74 #define PR_SET_VMA_ANON_NAME 0
75 #endif
76 #elif defined(__FreeBSD__)
77 #include "uv.h"  // uv_exepath
78 #endif  // defined(__linux__)
79 #include <link.h>
80 #endif  // defined(__linux__) || defined(__FreeBSD__)
81 
82 #include <sys/types.h>
83 #include <sys/mman.h>
84 #if defined(__FreeBSD__)
85 #include <sys/sysctl.h>
86 #elif defined(__APPLE__)
87 #include <mach/vm_map.h>
88 #endif
89 
90 #include <climits>  // PATH_MAX
91 #include <cstdlib>
92 #include <cstdint>
93 #include <cstring>
94 #include <string>
95 #include <fstream>
96 
97 #if defined(__linux__) || defined(__FreeBSD__)
98 extern "C" {
99 // This symbol must be declared weak because this file becomes part of all
100 // Node.js targets (like node_mksnapshot, node_mkcodecache, and cctest) and
101 // those files do not supply the symbol.
102 extern char __attribute__((weak)) __node_text_start;
103 extern char __start_lpstub;
104 }  // extern "C"
105 #endif  // defined(__linux__) || defined(__FreeBSD__)
106 
107 #endif  // defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES
108 namespace node {
109 #if defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES
110 
111 namespace {
112 
// Half-open address range [from, to) of executable code that is a candidate
// for remapping. `found_text_region` stays false until a usable range has
// been stored in `from` / `to`.
struct text_region {
  char* from{nullptr};
  char* to{nullptr};
  bool found_text_region{false};
};
118 
// Huge page size in bytes (2 MiB). It is also used as the alignment of
// `MoveTextRegionToLargePages`, so it has to remain a compile-time constant.
static constexpr size_t hps = 2L * 1024 * 1024;
120 
121 template <typename... Args>
Debug(std::string fmt,Args &&...args)122 inline void Debug(std::string fmt, Args&&... args) {
123   node::Debug(&per_process::enabled_debug_list,
124               DebugCategory::HUGEPAGES,
125               (std::string("Hugepages info: ") + fmt).c_str(),
126               std::forward<Args>(args)...);
127 }
128 
// Writes `warn` to stderr as a single line prefixed with
// "Hugepages WARNING: ".
inline void PrintWarning(const char* warn) {
  static const char kWarningFormat[] = "Hugepages WARNING: %s\n";
  fprintf(stderr, kWarningFormat, warn);
}
132 
PrintSystemError(int error)133 inline void PrintSystemError(int error) {
134   PrintWarning(strerror(error));
135 }
136 
hugepage_align_up(uintptr_t addr)137 inline uintptr_t hugepage_align_up(uintptr_t addr) {
138   return (((addr) + (hps) - 1) & ~((hps) - 1));
139 }
140 
hugepage_align_down(uintptr_t addr)141 inline uintptr_t hugepage_align_down(uintptr_t addr) {
142   return ((addr) & ~((hps) - 1));
143 }
144 
145 #if defined(__linux__) || defined(__FreeBSD__)
146 #if defined(__FreeBSD__)
147 #ifndef ElfW
148 #define ElfW(name) Elf_##name
149 #endif  // ifndef ElfW
150 #endif  // defined(__FreeBSD__)
151 
152 struct dl_iterate_params {
153   uintptr_t start = 0;
154   uintptr_t end = 0;
155   uintptr_t reference_sym = reinterpret_cast<uintptr_t>(&__node_text_start);
156   std::string exename;
157 };
158 
FindMapping(struct dl_phdr_info * info,size_t,void * data)159 int FindMapping(struct dl_phdr_info* info, size_t, void* data) {
160   auto dl_params = static_cast<dl_iterate_params*>(data);
161   if (dl_params->exename == std::string(info->dlpi_name)) {
162     for (int idx = 0; idx < info->dlpi_phnum; idx++) {
163       const ElfW(Phdr)* phdr = &info->dlpi_phdr[idx];
164       if (phdr->p_type == PT_LOAD && (phdr->p_flags & PF_X)) {
165         uintptr_t start = info->dlpi_addr + phdr->p_vaddr;
166         uintptr_t end = start + phdr->p_memsz;
167 
168         if (dl_params->reference_sym >= start &&
169             dl_params->reference_sym <= end) {
170           dl_params->start = start;
171           dl_params->end = end;
172           return 1;
173         }
174       }
175     }
176   }
177   return 0;
178 }
179 #endif  // defined(__linux__) || defined(__FreeBSD__)
180 
FindNodeTextRegion()181 struct text_region FindNodeTextRegion() {
182   struct text_region nregion;
183 #if defined(__linux__) || defined(__FreeBSD__)
184   dl_iterate_params dl_params;
185   uintptr_t lpstub_start = reinterpret_cast<uintptr_t>(&__start_lpstub);
186 
187 #if defined(__FreeBSD__)
188   // On FreeBSD we need the name of the binary, because `dl_iterate_phdr` does
189   // not pass in an empty string as the `dlpi_name` of the binary but rather its
190   // absolute path.
191   {
192     char selfexe[PATH_MAX];
193     size_t count = sizeof(selfexe);
194     if (uv_exepath(selfexe, &count))
195       return nregion;
196     dl_params.exename = std::string(selfexe, count);
197   }
198 #endif  // defined(__FreeBSD__)
199 
200   if (dl_iterate_phdr(FindMapping, &dl_params) == 1) {
201     Debug("start: %p - sym: %p - end: %p\n",
202           reinterpret_cast<void*>(dl_params.start),
203           reinterpret_cast<void*>(dl_params.reference_sym),
204           reinterpret_cast<void*>(dl_params.end));
205 
206     dl_params.start = dl_params.reference_sym;
207     if (lpstub_start > dl_params.start && lpstub_start <= dl_params.end) {
208       Debug("Trimming end for lpstub: %p\n",
209             reinterpret_cast<void*>(lpstub_start));
210       dl_params.end = lpstub_start;
211     }
212 
213     if (dl_params.start < dl_params.end) {
214       char* from = reinterpret_cast<char*>(hugepage_align_up(dl_params.start));
215       char* to = reinterpret_cast<char*>(hugepage_align_down(dl_params.end));
216       Debug("Aligned range is %p - %p\n", from, to);
217       if (from < to) {
218         size_t pagecount = (to - from) / hps;
219         if (pagecount > 0) {
220           nregion.found_text_region = true;
221           nregion.from = from;
222           nregion.to = to;
223         }
224       }
225     }
226   }
227 #elif defined(__APPLE__)
228   struct vm_region_submap_info_64 map;
229   mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64;
230   vm_address_t addr = 0UL;
231   vm_size_t size = 0;
232   natural_t depth = 1;
233 
234   while (true) {
235     if (vm_region_recurse_64(mach_task_self(), &addr, &size, &depth,
236                              reinterpret_cast<vm_region_info_64_t>(&map),
237                              &count) != KERN_SUCCESS) {
238       break;
239     }
240 
241     if (map.is_submap) {
242       depth++;
243     } else {
244       char* start = reinterpret_cast<char*>(hugepage_align_up(addr));
245       char* end = reinterpret_cast<char*>(hugepage_align_down(addr+size));
246 
247       if (end > start && (map.protection & VM_PROT_READ) != 0 &&
248           (map.protection & VM_PROT_EXECUTE) != 0) {
249         nregion.found_text_region = true;
250         nregion.from = start;
251         nregion.to = end;
252         break;
253       }
254 
255       addr += size;
256       size = 0;
257     }
258   }
259 #endif
260   Debug("Found %d huge pages\n", (nregion.to - nregion.from) / hps);
261   return nregion;
262 }
263 
264 #if defined(__linux__)
IsTransparentHugePagesEnabled()265 bool IsTransparentHugePagesEnabled() {
266   // File format reference:
267   // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/huge_memory.c?id=13391c60da3308ed9980de0168f74cce6c62ac1d#n163
268   const char* filename = "/sys/kernel/mm/transparent_hugepage/enabled";
269   std::ifstream config_stream(filename, std::ios::in);
270   if (!config_stream.good()) {
271     PrintWarning("could not open /sys/kernel/mm/transparent_hugepage/enabled");
272     return false;
273   }
274 
275   std::string token;
276   config_stream >> token;
277   if ("[always]" == token) return true;
278   config_stream >> token;
279   if ("[madvise]" == token) return true;
280   return false;
281 }
282 #elif defined(__FreeBSD__)
IsSuperPagesEnabled()283 bool IsSuperPagesEnabled() {
284   // It is enabled by default on amd64.
285   unsigned int super_pages = 0;
286   size_t super_pages_length = sizeof(super_pages);
287   return sysctlbyname("vm.pmap.pg_ps_enabled",
288                       &super_pages,
289                       &super_pages_length,
290                       nullptr,
291                       0) != -1 &&
292          super_pages >= 1;
293 }
294 #endif
295 
296 // Functions in this class must always be inlined because they must end up in
297 // the `lpstub` section rather than the `.text` section.
298 class MemoryMapPointer {
299  public:
MemoryMapPointer()300   FORCE_INLINE explicit MemoryMapPointer() {}
operator ==(void * rhs) const301   FORCE_INLINE bool operator==(void* rhs) const { return mem_ == rhs; }
mem() const302   FORCE_INLINE void* mem() const { return mem_; }
303   MemoryMapPointer(const MemoryMapPointer&) = delete;
304   MemoryMapPointer(MemoryMapPointer&&) = delete;
305   void operator= (const MemoryMapPointer&) = delete;
306   void operator= (const MemoryMapPointer&&) = delete;
Reset(void * start,size_t size,int prot,int flags,int fd=-1,size_t offset=0)307   FORCE_INLINE void Reset(void* start,
308                           size_t size,
309                           int prot,
310                           int flags,
311                           int fd = -1,
312                           size_t offset = 0) {
313     mem_ = mmap(start, size, prot, flags, fd, offset);
314     size_ = size;
315   }
Reset()316   FORCE_INLINE void Reset() {
317     mem_ = nullptr;
318     size_ = 0;
319   }
SetName(void * mem,size_t size,const char * name)320   static void SetName(void* mem, size_t size, const char* name) {
321 #if defined(__linux__)
322     // Available since the 5.17 kernel release and if the
323     // CONFIG_ANON_VMA_NAME option, we can set an identifier
324     // to an anonymous mapped region. However if the kernel
325     // option is not present or it s an older kernel, it is a no-op.
326     if (mem != MAP_FAILED && mem != nullptr)
327         prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
328             reinterpret_cast<uintptr_t>(mem),
329             size,
330             reinterpret_cast<uintptr_t>(name));
331 #else
332     (void)name;
333 #endif
334   }
~MemoryMapPointer()335   FORCE_INLINE ~MemoryMapPointer() {
336     if (mem_ == nullptr) return;
337     if (mem_ == MAP_FAILED) return;
338     if (munmap(mem_, size_) == 0) return;
339     PrintSystemError(errno);
340   }
341 
342  private:
343   size_t size_ = 0;
344   void* mem_ = nullptr;
345 };
346 
347 }  // End of anonymous namespace
348 
349 int
350 #if !defined(__APPLE__)
351 __attribute__((__section__("lpstub")))
352 #else
353 __attribute__((__section__("__TEXT,__lpstub")))
354 #endif
355 __attribute__((__aligned__(hps)))
356 __attribute__((__noinline__))
MoveTextRegionToLargePages(const text_region & r)357 MoveTextRegionToLargePages(const text_region& r) {
358   MemoryMapPointer nmem;
359   MemoryMapPointer tmem;
360   void* start = r.from;
361   size_t size = r.to - r.from;
362 
363   // Allocate a temporary region and back up the code we will re-map.
364   nmem.Reset(nullptr, size,
365              PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS);
366   if (nmem.mem() == MAP_FAILED) goto fail;
367   memcpy(nmem.mem(), r.from, size);
368 
369 #if defined(__linux__)
370 // We already know the original page is r-xp
371 // (PROT_READ, PROT_EXEC, MAP_PRIVATE)
372 // We want PROT_WRITE because we are writing into it.
373 // We want it at the fixed address and we use MAP_FIXED.
374   tmem.Reset(start, size,
375              PROT_READ | PROT_WRITE | PROT_EXEC,
376              MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED);
377   if (tmem.mem() == MAP_FAILED) goto fail;
378   if (madvise(tmem.mem(), size, 14 /* MADV_HUGEPAGE */) == -1) goto fail;
379   memcpy(start, nmem.mem(), size);
380 #elif defined(__FreeBSD__)
381   tmem.Reset(start, size,
382              PROT_READ | PROT_WRITE | PROT_EXEC,
383              MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED |
384              MAP_ALIGNED_SUPER);
385   if (tmem.mem() == MAP_FAILED) goto fail;
386   memcpy(start, nmem.mem(), size);
387 #elif defined(__APPLE__)
388   // There is not enough room to reserve the mapping close
389   // to the region address so we content to give a hint
390   // without forcing the new address being closed to.
391   // We explicitally gives all permission since we plan
392   // to write into it.
393   tmem.Reset(start, size,
394              PROT_READ | PROT_WRITE | PROT_EXEC,
395              MAP_PRIVATE | MAP_ANONYMOUS,
396              VM_FLAGS_SUPERPAGE_SIZE_2MB);
397   if (tmem.mem() == MAP_FAILED) goto fail;
398   memcpy(tmem.mem(), nmem.mem(), size);
399   if (mprotect(start, size, PROT_READ | PROT_WRITE | PROT_EXEC) == -1)
400     goto fail;
401   memcpy(start, tmem.mem(), size);
402 #endif
403 
404   if (mprotect(start, size, PROT_READ | PROT_EXEC) == -1) goto fail;
405   MemoryMapPointer::SetName(start, size, "nodejs Large Page");
406 
407   // We need not `munmap(tmem, size)` on success.
408   tmem.Reset();
409   return 0;
410 fail:
411   PrintSystemError(errno);
412   return -1;
413 }
414 #endif  // defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES
415 
// This is the primary API called from main.
//
// Returns 0 on success, or one of the status codes understood by
// LargePagesError(): ENOTSUP when large code page support is compiled out,
// EACCES when the system does not have large pages enabled, ENOENT when no
// suitable text region was found, and -1 when the remapping itself failed.
int MapStaticCodeToLargePages() {
#if defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES
#if defined(__linux__)
  const bool large_pages_enabled = IsTransparentHugePagesEnabled();
#elif defined(__FreeBSD__)
  const bool large_pages_enabled = IsSuperPagesEnabled();
#elif defined(__APPLE__)
  // pse-36 flag is present in recent mac x64 products.
  const bool large_pages_enabled = true;
#else
  const bool large_pages_enabled = false;
#endif
  if (!large_pages_enabled) {
    return EACCES;
  }

  struct text_region region = FindNodeTextRegion();
  if (!region.found_text_region) {
    return ENOENT;
  }

  return MoveTextRegionToLargePages(region);
#else
  return ENOTSUP;
#endif
}
440 
// Translates a status code returned by MapStaticCodeToLargePages() into a
// human-readable message. The returned pointer refers to a string literal;
// unrecognised codes map to "Unknown error".
const char* LargePagesError(int status) {
  if (status == ENOTSUP) {
    return "Mapping to large pages is not supported.";
  }
  if (status == EACCES) {
    return "Large pages are not enabled.";
  }
  if (status == ENOENT) {
    return "failed to find text region";
  }
  if (status == -1) {
    return "Mapping code to large pages failed. Reverting to default page "
        "size.";
  }
  if (status == 0) {
    return "OK";
  }
  return "Unknown error";
}
463 
464 }  // namespace node
465