/src/node/src/large_pages/node_large_page.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (C) 2018 Intel Corporation |
2 | | // |
3 | | // Permission is hereby granted, free of charge, to any person obtaining a copy |
4 | | // of this software and associated documentation files (the "Software"), |
5 | | // to deal in the Software without restriction, including without limitation |
6 | | // the rights to use, copy, modify, merge, publish, distribute, sublicense, |
7 | | // and/or sell copies of the Software, and to permit persons to whom |
8 | | // the Software is furnished to do so, subject to the following conditions: |
9 | | // |
10 | | // The above copyright notice and this permission notice shall be included |
11 | | // in all copies or substantial portions of the Software. |
12 | | // |
13 | | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
14 | | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
15 | | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
16 | | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES |
17 | | // OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
18 | | // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE |
19 | | // OR OTHER DEALINGS IN THE SOFTWARE. |
20 | | // |
21 | | // SPDX-License-Identifier: MIT |
22 | | |
23 | | // The functions in this file map the .text section of Node.js into 2MB pages. |
24 | | // They perform the following steps: |
25 | | // |
26 | | // 1: Find the Node.js binary's `.text` section in memory. This is done below in |
27 | | // `FindNodeTextRegion`. It is accomplished in a platform-specific way. On |
28 | | // Linux and FreeBSD, `dl_iterate_phdr(3)` is used. When the region is found, |
29 | | // it is "trimmed" as follows: |
30 | | // * Modify the start to point to the very beginning of the Node.js `.text` |
31 | | // section (from symbol `__node_text_start` declared in node_text_start.S). |
32 | | // * Possibly modify the end to account for the `lpstub` section which |
33 | | // contains `MoveTextRegionToLargePages`, the function we do not wish to |
34 | | // move (see below). |
35 | | // * Align the address of the start to its nearest higher large page |
36 | | // boundary. |
37 | | // * Align the address of the end to its nearest lower large page boundary. |
38 | | // |
39 | | // 2: Move the text region to large pages. This is done below in |
40 | | // `MoveTextRegionToLargePages`. We need to be very careful: |
41 | | // a) `MoveTextRegionToLargePages` itself should not be moved. |
42 | | // We use gcc attributes |
43 | | // (__section__) to put it outside the `.text` section, |
44 | | // (__aligned__) to align it at the 2M boundary, and |
45 | | // (__noinline__) to not inline this function. |
46 | | // b) `MoveTextRegionToLargePages` should not call any function(s) that might |
47 | | // be moved. |
48 | | // To move the .text section, perform the following steps: |
49 | | // * Map a new, temporary area and copy the original code there. |
50 | | // * Use mmap using the start address with MAP_FIXED so we get exactly the |
51 | | // same virtual address (except on OSX). On platforms other than Linux, |
52 | | // use mmap flags to request hugepages. |
53 | | // * On Linux use madvise with MADV_HUGEPAGE to use anonymous 2MB pages. |
54 | | // * If successful copy the code to the newly mapped area and protect it to |
55 | | // be readable and executable. |
56 | | // * Unmap the temporary area. |
57 | | |
58 | | #include "node_large_page.h" |
59 | | |
60 | | #include <cerrno> // NOLINT(build/include) |
61 | | |
62 | | // Besides returning ENOTSUP at runtime we do nothing if this define is missing. |
63 | | #if defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES |
64 | | #include "debug_utils-inl.h" |
65 | | |
66 | | #if defined(__linux__) || defined(__FreeBSD__) |
67 | | #if defined(__linux__) |
68 | | #ifndef _GNU_SOURCE |
69 | | #define _GNU_SOURCE |
70 | | #endif // ifndef _GNU_SOURCE |
71 | | #include <sys/prctl.h> |
72 | | #if !defined(PR_SET_VMA) |
73 | 0 | #define PR_SET_VMA 0x53564d41 |
74 | 0 | #define PR_SET_VMA_ANON_NAME 0 |
75 | | #endif |
76 | | #elif defined(__FreeBSD__) |
77 | | #include "uv.h" // uv_exepath |
78 | | #endif // defined(__linux__) |
79 | | #include <link.h> |
80 | | #endif // defined(__linux__) || defined(__FreeBSD__) |
81 | | |
82 | | #include <sys/types.h> |
83 | | #include <sys/mman.h> |
84 | | #if defined(__FreeBSD__) |
85 | | #include <sys/sysctl.h> |
86 | | #elif defined(__APPLE__) |
87 | | #include <mach/vm_map.h> |
88 | | #endif |
89 | | |
90 | | #include <climits> // PATH_MAX |
91 | | #include <cstdlib> |
92 | | #include <cstdint> |
93 | | #include <cstring> |
94 | | #include <string> |
95 | | #include <fstream> |
96 | | |
97 | | #if defined(__linux__) || defined(__FreeBSD__) |
98 | | extern "C" { |
99 | | // This symbol must be declared weak because this file becomes part of all |
100 | | // Node.js targets (like node_mksnapshot, node_mkcodecache, and cctest) and |
101 | | // those files do not supply the symbol. |
102 | | extern char __attribute__((weak)) __node_text_start; |
103 | | extern char __start_lpstub; |
104 | | } // extern "C" |
105 | | #endif // defined(__linux__) || defined(__FreeBSD__) |
106 | | |
107 | | #endif // defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES |
108 | | namespace node { |
109 | | #if defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES |
110 | | |
111 | | namespace { |
112 | | |
// Result of the text-region search: the byte range [from, to) selected for
// re-mapping, plus a flag telling whether such a range was found at all.
struct text_region {
  // Start of the selected range; stays null until a range is found.
  char* from{nullptr};
  // End of the selected range (the size is computed as `to - from`).
  char* to{nullptr};
  // Set to true only when `from` and `to` describe a usable range.
  bool found_text_region{false};
};
118 | | |
// Huge page size in bytes (2 MiB). The alignment helpers below use it as a
// power-of-two mask, and it is also the alignment requested for the function
// placed in the `lpstub` section.
static constexpr size_t hps = static_cast<size_t>(2) << 20;
120 | | |
121 | | template <typename... Args> |
122 | 0 | inline void Debug(std::string fmt, Args&&... args) { |
123 | 0 | node::Debug(&per_process::enabled_debug_list, |
124 | 0 | DebugCategory::HUGEPAGES, |
125 | 0 | (std::string("Hugepages info: ") + fmt).c_str(), |
126 | 0 | std::forward<Args>(args)...); |
127 | 0 | } Unexecuted instantiation: node_large_page.cc:void node::(anonymous namespace)::Debug<void*, void*, void*>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, void*&&, void*&&, void*&&) Unexecuted instantiation: node_large_page.cc:void node::(anonymous namespace)::Debug<void*>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, void*&&) Unexecuted instantiation: node_large_page.cc:void node::(anonymous namespace)::Debug<char*&, char*&>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, char*&, char*&) Unexecuted instantiation: node_large_page.cc:void node::(anonymous namespace)::Debug<unsigned long>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, unsigned long&&) |
128 | | |
// Emits `warn` on stderr as one line of the form
// "Hugepages WARNING: <warn>".
inline void PrintWarning(const char* warn) {
  fprintf(stderr,
          "Hugepages WARNING: %s\n",
          warn);
}
132 | | |
133 | 0 | inline void PrintSystemError(int error) { |
134 | 0 | PrintWarning(strerror(error)); |
135 | 0 | } |
136 | | |
137 | 0 | inline uintptr_t hugepage_align_up(uintptr_t addr) { |
138 | 0 | return (((addr) + (hps) - 1) & ~((hps) - 1)); |
139 | 0 | } |
140 | | |
141 | 0 | inline uintptr_t hugepage_align_down(uintptr_t addr) { |
142 | 0 | return ((addr) & ~((hps) - 1)); |
143 | 0 | } |
144 | | |
145 | | #if defined(__linux__) || defined(__FreeBSD__) |
146 | | #if defined(__FreeBSD__) |
147 | | #ifndef ElfW |
148 | | #define ElfW(name) Elf_##name |
149 | | #endif // ifndef ElfW |
150 | | #endif // defined(__FreeBSD__) |
151 | | |
152 | | struct dl_iterate_params { |
153 | | uintptr_t start = 0; |
154 | | uintptr_t end = 0; |
155 | | uintptr_t reference_sym = reinterpret_cast<uintptr_t>(&__node_text_start); |
156 | | std::string exename; |
157 | | }; |
158 | | |
159 | 0 | int FindMapping(struct dl_phdr_info* info, size_t, void* data) { |
160 | 0 | auto dl_params = static_cast<dl_iterate_params*>(data); |
161 | 0 | if (dl_params->exename == std::string(info->dlpi_name)) { |
162 | 0 | for (int idx = 0; idx < info->dlpi_phnum; idx++) { |
163 | 0 | const ElfW(Phdr)* phdr = &info->dlpi_phdr[idx]; |
164 | 0 | if (phdr->p_type == PT_LOAD && (phdr->p_flags & PF_X)) { |
165 | 0 | uintptr_t start = info->dlpi_addr + phdr->p_vaddr; |
166 | 0 | uintptr_t end = start + phdr->p_memsz; |
167 | |
|
168 | 0 | if (dl_params->reference_sym >= start && |
169 | 0 | dl_params->reference_sym <= end) { |
170 | 0 | dl_params->start = start; |
171 | 0 | dl_params->end = end; |
172 | 0 | return 1; |
173 | 0 | } |
174 | 0 | } |
175 | 0 | } |
176 | 0 | } |
177 | 0 | return 0; |
178 | 0 | } |
179 | | #endif // defined(__linux__) || defined(__FreeBSD__) |
180 | | |
181 | 0 | struct text_region FindNodeTextRegion() { |
182 | 0 | struct text_region nregion; |
183 | 0 | #if defined(__linux__) || defined(__FreeBSD__) |
184 | 0 | dl_iterate_params dl_params; |
185 | 0 | uintptr_t lpstub_start = reinterpret_cast<uintptr_t>(&__start_lpstub); |
186 | |
|
187 | | #if defined(__FreeBSD__) |
188 | | // On FreeBSD we need the name of the binary, because `dl_iterate_phdr` does |
189 | | // not pass in an empty string as the `dlpi_name` of the binary but rather its |
190 | | // absolute path. |
191 | | { |
192 | | char selfexe[PATH_MAX]; |
193 | | size_t count = sizeof(selfexe); |
194 | | if (uv_exepath(selfexe, &count)) |
195 | | return nregion; |
196 | | dl_params.exename = std::string(selfexe, count); |
197 | | } |
198 | | #endif // defined(__FreeBSD__) |
199 | |
|
200 | 0 | if (dl_iterate_phdr(FindMapping, &dl_params) == 1) { |
201 | 0 | Debug("start: %p - sym: %p - end: %p\n", |
202 | 0 | reinterpret_cast<void*>(dl_params.start), |
203 | 0 | reinterpret_cast<void*>(dl_params.reference_sym), |
204 | 0 | reinterpret_cast<void*>(dl_params.end)); |
205 | |
|
206 | 0 | dl_params.start = dl_params.reference_sym; |
207 | 0 | if (lpstub_start > dl_params.start && lpstub_start <= dl_params.end) { |
208 | 0 | Debug("Trimming end for lpstub: %p\n", |
209 | 0 | reinterpret_cast<void*>(lpstub_start)); |
210 | 0 | dl_params.end = lpstub_start; |
211 | 0 | } |
212 | |
|
213 | 0 | if (dl_params.start < dl_params.end) { |
214 | 0 | char* from = reinterpret_cast<char*>(hugepage_align_up(dl_params.start)); |
215 | 0 | char* to = reinterpret_cast<char*>(hugepage_align_down(dl_params.end)); |
216 | 0 | Debug("Aligned range is %p - %p\n", from, to); |
217 | 0 | if (from < to) { |
218 | 0 | size_t pagecount = (to - from) / hps; |
219 | 0 | if (pagecount > 0) { |
220 | 0 | nregion.found_text_region = true; |
221 | 0 | nregion.from = from; |
222 | 0 | nregion.to = to; |
223 | 0 | } |
224 | 0 | } |
225 | 0 | } |
226 | 0 | } |
227 | | #elif defined(__APPLE__) |
228 | | struct vm_region_submap_info_64 map; |
229 | | mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64; |
230 | | vm_address_t addr = 0UL; |
231 | | vm_size_t size = 0; |
232 | | natural_t depth = 1; |
233 | | |
234 | | while (true) { |
235 | | if (vm_region_recurse_64(mach_task_self(), &addr, &size, &depth, |
236 | | reinterpret_cast<vm_region_info_64_t>(&map), |
237 | | &count) != KERN_SUCCESS) { |
238 | | break; |
239 | | } |
240 | | |
241 | | if (map.is_submap) { |
242 | | depth++; |
243 | | } else { |
244 | | char* start = reinterpret_cast<char*>(hugepage_align_up(addr)); |
245 | | char* end = reinterpret_cast<char*>(hugepage_align_down(addr+size)); |
246 | | |
247 | | if (end > start && (map.protection & VM_PROT_READ) != 0 && |
248 | | (map.protection & VM_PROT_EXECUTE) != 0) { |
249 | | nregion.found_text_region = true; |
250 | | nregion.from = start; |
251 | | nregion.to = end; |
252 | | break; |
253 | | } |
254 | | |
255 | | addr += size; |
256 | | size = 0; |
257 | | } |
258 | | } |
259 | | #endif |
260 | 0 | Debug("Found %d huge pages\n", (nregion.to - nregion.from) / hps); |
261 | 0 | return nregion; |
262 | 0 | } |
263 | | |
264 | | #if defined(__linux__) |
265 | 0 | bool IsTransparentHugePagesEnabled() { |
266 | | // File format reference: |
267 | | // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/huge_memory.c?id=13391c60da3308ed9980de0168f74cce6c62ac1d#n163 |
268 | 0 | const char* filename = "/sys/kernel/mm/transparent_hugepage/enabled"; |
269 | 0 | std::ifstream config_stream(filename, std::ios::in); |
270 | 0 | if (!config_stream.good()) { |
271 | 0 | PrintWarning("could not open /sys/kernel/mm/transparent_hugepage/enabled"); |
272 | 0 | return false; |
273 | 0 | } |
274 | | |
275 | 0 | std::string token; |
276 | 0 | config_stream >> token; |
277 | 0 | if ("[always]" == token) return true; |
278 | 0 | config_stream >> token; |
279 | 0 | if ("[madvise]" == token) return true; |
280 | 0 | return false; |
281 | 0 | } |
282 | | #elif defined(__FreeBSD__) |
283 | | bool IsSuperPagesEnabled() { |
284 | | // It is enabled by default on amd64. |
285 | | unsigned int super_pages = 0; |
286 | | size_t super_pages_length = sizeof(super_pages); |
287 | | return sysctlbyname("vm.pmap.pg_ps_enabled", |
288 | | &super_pages, |
289 | | &super_pages_length, |
290 | | nullptr, |
291 | | 0) != -1 && |
292 | | super_pages >= 1; |
293 | | } |
294 | | #endif |
295 | | |
296 | | // Functions in this class must always be inlined because they must end up in |
297 | | // the `lpstub` section rather than the `.text` section. |
298 | | class MemoryMapPointer { |
299 | | public: |
300 | 0 | FORCE_INLINE explicit MemoryMapPointer() {} |
301 | 0 | FORCE_INLINE bool operator==(void* rhs) const { return mem_ == rhs; } |
302 | 0 | FORCE_INLINE void* mem() const { return mem_; } |
303 | | MemoryMapPointer(const MemoryMapPointer&) = delete; |
304 | | MemoryMapPointer(MemoryMapPointer&&) = delete; |
305 | | void operator= (const MemoryMapPointer&) = delete; |
306 | | void operator= (const MemoryMapPointer&&) = delete; |
307 | | FORCE_INLINE void Reset(void* start, |
308 | | size_t size, |
309 | | int prot, |
310 | | int flags, |
311 | | int fd = -1, |
312 | 0 | size_t offset = 0) { |
313 | 0 | mem_ = mmap(start, size, prot, flags, fd, offset); |
314 | 0 | size_ = size; |
315 | 0 | } |
316 | 0 | FORCE_INLINE void Reset() { |
317 | 0 | mem_ = nullptr; |
318 | 0 | size_ = 0; |
319 | 0 | } |
320 | 0 | static void SetName(void* mem, size_t size, const char* name) { |
321 | 0 | #if defined(__linux__) |
322 | | // Available since the 5.17 kernel release and if the |
323 | | // CONFIG_ANON_VMA_NAME option, we can set an identifier |
324 | | // to an anonymous mapped region. However if the kernel |
325 | | // option is not present or it s an older kernel, it is a no-op. |
326 | 0 | if (mem != MAP_FAILED && mem != nullptr) |
327 | 0 | prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, |
328 | 0 | reinterpret_cast<uintptr_t>(mem), |
329 | 0 | size, |
330 | 0 | reinterpret_cast<uintptr_t>(name)); |
331 | | #else |
332 | | (void)name; |
333 | | #endif |
334 | 0 | } |
335 | 0 | FORCE_INLINE ~MemoryMapPointer() { |
336 | 0 | if (mem_ == nullptr) return; |
337 | 0 | if (mem_ == MAP_FAILED) return; |
338 | 0 | if (munmap(mem_, size_) == 0) return; |
339 | 0 | PrintSystemError(errno); |
340 | 0 | } |
341 | | |
342 | | private: |
343 | | size_t size_ = 0; |
344 | | void* mem_ = nullptr; |
345 | | }; |
346 | | |
347 | | } // End of anonymous namespace |
348 | | |
349 | | int |
350 | | #if !defined(__APPLE__) |
351 | | __attribute__((__section__("lpstub"))) |
352 | | #else |
353 | | __attribute__((__section__("__TEXT,__lpstub"))) |
354 | | #endif |
355 | | __attribute__((__aligned__(hps))) |
356 | | __attribute__((__noinline__)) |
357 | 0 | MoveTextRegionToLargePages(const text_region& r) { |
358 | 0 | MemoryMapPointer nmem; |
359 | 0 | MemoryMapPointer tmem; |
360 | 0 | void* start = r.from; |
361 | 0 | size_t size = r.to - r.from; |
362 | | |
363 | | // Allocate a temporary region and back up the code we will re-map. |
364 | 0 | nmem.Reset(nullptr, size, |
365 | 0 | PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS); |
366 | 0 | if (nmem.mem() == MAP_FAILED) goto fail; |
367 | 0 | memcpy(nmem.mem(), r.from, size); |
368 | |
|
369 | 0 | #if defined(__linux__) |
370 | | // We already know the original page is r-xp |
371 | | // (PROT_READ, PROT_EXEC, MAP_PRIVATE) |
372 | | // We want PROT_WRITE because we are writing into it. |
373 | | // We want it at the fixed address and we use MAP_FIXED. |
374 | 0 | tmem.Reset(start, size, |
375 | 0 | PROT_READ | PROT_WRITE | PROT_EXEC, |
376 | 0 | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED); |
377 | 0 | if (tmem.mem() == MAP_FAILED) goto fail; |
378 | 0 | if (madvise(tmem.mem(), size, 14 /* MADV_HUGEPAGE */) == -1) goto fail; |
379 | 0 | memcpy(start, nmem.mem(), size); |
380 | | #elif defined(__FreeBSD__) |
381 | | tmem.Reset(start, size, |
382 | | PROT_READ | PROT_WRITE | PROT_EXEC, |
383 | | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED | |
384 | | MAP_ALIGNED_SUPER); |
385 | | if (tmem.mem() == MAP_FAILED) goto fail; |
386 | | memcpy(start, nmem.mem(), size); |
387 | | #elif defined(__APPLE__) |
388 | | // There is not enough room to reserve the mapping close |
389 | | // to the region address so we content to give a hint |
390 | | // without forcing the new address being closed to. |
391 | | // We explicitally gives all permission since we plan |
392 | | // to write into it. |
393 | | tmem.Reset(start, size, |
394 | | PROT_READ | PROT_WRITE | PROT_EXEC, |
395 | | MAP_PRIVATE | MAP_ANONYMOUS, |
396 | | VM_FLAGS_SUPERPAGE_SIZE_2MB); |
397 | | if (tmem.mem() == MAP_FAILED) goto fail; |
398 | | memcpy(tmem.mem(), nmem.mem(), size); |
399 | | if (mprotect(start, size, PROT_READ | PROT_WRITE | PROT_EXEC) == -1) |
400 | | goto fail; |
401 | | memcpy(start, tmem.mem(), size); |
402 | | #endif |
403 | |
|
404 | 0 | if (mprotect(start, size, PROT_READ | PROT_EXEC) == -1) goto fail; |
405 | 0 | MemoryMapPointer::SetName(start, size, "nodejs Large Page"); |
406 | | |
407 | | // We need not `munmap(tmem, size)` on success. |
408 | 0 | tmem.Reset(); |
409 | 0 | return 0; |
410 | 0 | fail: |
411 | 0 | PrintSystemError(errno); |
412 | 0 | return -1; |
413 | 0 | } |
414 | | #endif // defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES |
415 | | |
// This is the primary API called from main.
//
// Returns:
//   ENOTSUP  when built without NODE_ENABLE_LARGE_CODE_PAGES,
//   EACCES   when the platform check for large page support fails,
//   ENOENT   when no suitable text region was found,
//   otherwise the result of MoveTextRegionToLargePages (0 or -1).
int MapStaticCodeToLargePages() {
#if defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES
#if defined(__linux__)
  const bool have_thp = IsTransparentHugePagesEnabled();
#elif defined(__FreeBSD__)
  const bool have_thp = IsSuperPagesEnabled();
#elif defined(__APPLE__)
  // pse-36 flag is present in recent mac x64 products.
  const bool have_thp = true;
#else
  const bool have_thp = false;
#endif
  if (!have_thp) return EACCES;

  const struct text_region region = FindNodeTextRegion();
  if (!region.found_text_region) return ENOENT;

  return MoveTextRegionToLargePages(region);
#else
  return ENOTSUP;
#endif
}
440 | | |
// Translates a status returned by MapStaticCodeToLargePages into a
// human-readable message. Statuses other than ENOTSUP, EACCES, ENOENT, -1 and
// 0 yield "Unknown error". The returned pointer refers to a string literal.
const char* LargePagesError(int status) {
  if (status == ENOTSUP) return "Mapping to large pages is not supported.";
  if (status == EACCES) return "Large pages are not enabled.";
  if (status == ENOENT) return "failed to find text region";
  if (status == -1) {
    return "Mapping code to large pages failed. Reverting to default page "
           "size.";
  }
  if (status == 0) return "OK";
  return "Unknown error";
}
463 | | |
464 | | } // namespace node |