/src/cpython/Objects/mimalloc/prim/unix/prim.c
Line | Count | Source |
1 | | /* ----------------------------------------------------------------------------
2 | | Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
3 | | This is free software; you can redistribute it and/or modify it under the
4 | | terms of the MIT license. A copy of the license can be found in the file
5 | | "LICENSE" at the root of this distribution.
6 | | -----------------------------------------------------------------------------*/
7 | | |
8 | | // This file is included in `src/prim/prim.c` |
9 | | |
10 | | #ifndef _DEFAULT_SOURCE |
11 | | #define _DEFAULT_SOURCE // ensure mmap flags and syscall are defined |
12 | | #endif |
13 | | |
14 | | #if defined(__sun) |
15 | | // illumos provides the new mman.h API when any of these are defined;
16 | | // otherwise it exposes the old caddr_t-based API that predates the void-pointer one.
17 | | // Stock Solaris provides only the latter, so we chose to discard these
18 | | // flags only here rather than project-wide, though.
19 | | #undef _XOPEN_SOURCE |
20 | | #undef _POSIX_C_SOURCE |
21 | | #endif |
22 | | |
23 | | #include "mimalloc.h" |
24 | | #include "mimalloc/internal.h" |
25 | | #include "mimalloc/atomic.h" |
26 | | #include "mimalloc/prim.h" |
27 | | |
28 | | #include <sys/mman.h> // mmap |
29 | | #include <unistd.h> // sysconf |
30 | | #include <fcntl.h> // open, close, read, access |
31 | | |
32 | | #if defined(__linux__) |
33 | | #include <features.h> |
34 | | #include <fcntl.h> |
35 | | #if defined(__GLIBC__) |
36 | | #include <linux/mman.h> // linux mmap flags |
37 | | #else |
38 | | #include <sys/mman.h> |
39 | | #endif |
40 | | #elif defined(__APPLE__) |
41 | | #include <TargetConditionals.h> |
42 | | #if !TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR |
43 | | #include <mach/vm_statistics.h> |
44 | | #endif |
45 | | #elif defined(__FreeBSD__) || defined(__DragonFly__) |
46 | | #include <sys/param.h> |
47 | | #if __FreeBSD_version >= 1200000 |
48 | | #include <sys/cpuset.h> |
49 | | #include <sys/domainset.h> |
50 | | #endif |
51 | | #include <sys/sysctl.h> |
52 | | #endif |
53 | | |
54 | | #if !defined(__HAIKU__) && !defined(__APPLE__) && !defined(__CYGWIN__) && !defined(_AIX) && !defined(__OpenBSD__) && !defined(__FreeBSD__) && !defined(__sun) && !defined(__NetBSD__) |
55 | | #define MI_HAS_SYSCALL_H |
56 | | #include <sys/syscall.h> |
57 | | #endif |
58 | | |
59 | | //------------------------------------------------------------------------------------ |
60 | | // Use syscalls for some primitives to allow for libraries that override open/read/close etc. |
61 | | // and do allocation themselves; using syscalls prevents recursion when mimalloc is |
62 | | // still initializing (issue #713) |
63 | | //------------------------------------------------------------------------------------ |
64 | | |
65 | | #if defined(MI_HAS_SYSCALL_H) && defined(SYS_open) && defined(SYS_close) && defined(SYS_read) && defined(SYS_access) |
66 | | |
67 | 16 | static int mi_prim_open(const char* fpath, int open_flags) { |
68 | 16 | return syscall(SYS_open,fpath,open_flags,0); |
69 | 16 | } |
70 | 16 | static ssize_t mi_prim_read(int fd, void* buf, size_t bufsize) { |
71 | 16 | return syscall(SYS_read,fd,buf,bufsize); |
72 | 16 | } |
73 | 16 | static int mi_prim_close(int fd) { |
74 | 16 | return syscall(SYS_close,fd); |
75 | 16 | } |
76 | 0 | static int mi_prim_access(const char *fpath, int mode) { |
77 | 0 | return syscall(SYS_access,fpath,mode); |
78 | 0 | } |
79 | | |
80 | | #elif !defined(__APPLE__) && !defined(_AIX) && !defined(__OpenBSD__) && !defined(__FreeBSD__) && !defined(__sun) && !defined(__NetBSD__) // avoid unused warnings |
81 | | |
82 | | static int mi_prim_open(const char* fpath, int open_flags) { |
83 | | return open(fpath,open_flags); |
84 | | } |
85 | | static ssize_t mi_prim_read(int fd, void* buf, size_t bufsize) { |
86 | | return read(fd,buf,bufsize); |
87 | | } |
88 | | static int mi_prim_close(int fd) { |
89 | | return close(fd); |
90 | | } |
91 | | static int mi_prim_access(const char *fpath, int mode) { |
92 | | return access(fpath,mode); |
93 | | } |
94 | | |
95 | | #endif |
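
These wrappers exist to break a bootstrap cycle: a preloaded library may interpose `open`/`read`/`close` and allocate inside them, re-entering mimalloc while it is still initializing (issue #713). A minimal standalone sketch of the same technique — an illustration, not part of this file — assuming Linux on an architecture that still defines SYS_open (e.g. x86-64; newer ports such as aarch64 only provide SYS_openat):

/* standalone demo of the raw-syscall technique above (assumption: Linux with SYS_open) */
#include <sys/syscall.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>

int main(void) {
  // bypass the libc wrappers so an allocating interposer (e.g. an
  // LD_PRELOAD'ed malloc library still initializing) cannot recurse into us
  int fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY, 0);
  if (fd < 0) return 1;
  char buf[32] = {0};
  ssize_t n = syscall(SYS_read, fd, buf, sizeof(buf) - 1);
  syscall(SYS_close, fd);
  if (n > 0) printf("overcommit_memory = %c\n", buf[0]);
  return 0;
}
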
96 | | |
97 | | |
98 | | |
99 | | //--------------------------------------------- |
100 | | // init |
101 | | //--------------------------------------------- |
102 | | |
103 | 16 | static bool unix_detect_overcommit(void) { |
104 | 16 | bool os_overcommit = true; |
105 | 16 | #if defined(__linux__) |
106 | 16 | int fd = mi_prim_open("/proc/sys/vm/overcommit_memory", O_RDONLY); |
107 | 16 | if (fd >= 0) { |
108 | 16 | char buf[32] = {0}; |
109 | 16 | ssize_t nread = mi_prim_read(fd, &buf, sizeof(buf)); |
110 | 16 | mi_prim_close(fd); |
111 | | // <https://www.kernel.org/doc/Documentation/vm/overcommit-accounting> |
112 | | // 0: heuristic overcommit, 1: always overcommit, 2: never overcommit (ignore NORESERVE) |
113 | 16 | if (nread >= 1) { |
114 | 16 | os_overcommit = (buf[0] == '0' || buf[0] == '1'); |
115 | 16 | } |
116 | 16 | } |
117 | | #elif defined(__FreeBSD__) |
118 | | int val = 0; |
119 | | size_t olen = sizeof(val); |
120 | | if (sysctlbyname("vm.overcommit", &val, &olen, NULL, 0) == 0) { |
121 | | os_overcommit = (val != 0); |
122 | | } |
123 | | #else |
124 | | // default: overcommit is true |
125 | | #endif |
126 | 16 | return os_overcommit; |
127 | 16 | } |
128 | | |
129 | 16 | void _mi_prim_mem_init( mi_os_mem_config_t* config ) { |
130 | 16 | long psize = sysconf(_SC_PAGESIZE); |
131 | 16 | if (psize > 0) { |
132 | 16 | config->page_size = (size_t)psize; |
133 | 16 | config->alloc_granularity = (size_t)psize; |
134 | 16 | } |
135 | 16 | config->large_page_size = 2*MI_MiB; // TODO: can we query the OS for this? |
136 | 16 | config->has_overcommit = unix_detect_overcommit(); |
137 | 16 | config->must_free_whole = false; // mmap can free in parts |
138 | 16 | config->has_virtual_reserve = true; // todo: check if this is true for NetBSD? (for anonymous mmap with PROT_NONE)
139 | 16 | } |
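
On the TODO above: Linux reports the default huge page size in the `Hugepagesize:` line of /proc/meminfo. A hedged sketch of one way to query it — illustrative only, since it uses stdio (which allocates) and so could not be used from this primitive layer as-is:

/* possible answer to the TODO (Linux-only sketch, not what this file does):
   parse "Hugepagesize:" from /proc/meminfo, reported in KiB (commonly 2048 -> 2 MiB) */
#include <stdio.h>
#include <stddef.h>

static size_t linux_default_huge_page_size(void) {
  FILE* f = fopen("/proc/meminfo", "r");   // note: fopen allocates, so the
  if (f == NULL) return 2*1024*1024;       // primitive layer could not use it
  char line[128];
  size_t kib = 0;
  while (fgets(line, sizeof(line), f) != NULL) {
    if (sscanf(line, "Hugepagesize: %zu kB", &kib) == 1) break;
  }
  fclose(f);
  return (kib > 0 ? kib * 1024 : 2*1024*1024);  // fall back to 2 MiB
}
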
140 | | |
141 | | |
142 | | //--------------------------------------------- |
143 | | // free |
144 | | //--------------------------------------------- |
145 | | |
146 | 0 | int _mi_prim_free(void* addr, size_t size ) { |
147 | 0 | bool err = (munmap(addr, size) == -1); |
148 | 0 | return (err ? errno : 0); |
149 | 0 | } |
150 | | |
151 | | |
152 | | //--------------------------------------------- |
153 | | // mmap |
154 | | //--------------------------------------------- |
155 | | |
156 | 0 | static int unix_madvise(void* addr, size_t size, int advice) { |
157 | | #if defined(__sun) |
158 | | return madvise((caddr_t)addr, size, advice); // Solaris needs cast (issue #520) |
159 | | #else |
160 | 0 | return madvise(addr, size, advice); |
161 | 0 | #endif |
162 | 0 | } |
163 | | |
164 | 0 | static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) { |
165 | 0 | MI_UNUSED(try_alignment); |
166 | 0 | void* p = NULL; |
167 | | #if defined(MAP_ALIGNED) // BSD |
168 | | if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) { |
169 | | size_t n = mi_bsr(try_alignment); |
170 | | if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) { // alignment is a power of 2 and 4096 <= alignment <= 1GiB |
171 | | p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0); |
172 | | if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) { |
173 | | int err = errno; |
174 | | _mi_verbose_message("unable to directly request aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, addr); |
175 | | } |
176 | | if (p!=MAP_FAILED) return p; |
177 | | // fall back to regular mmap |
178 | | } |
179 | | } |
180 | | #elif defined(MAP_ALIGN) // Solaris |
181 | | if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) { |
182 | | p = mmap((void*)try_alignment, size, protect_flags, flags | MAP_ALIGN, fd, 0); // addr parameter is the required alignment |
183 | | if (p!=MAP_FAILED) return p; |
184 | | // fall back to regular mmap |
185 | | } |
186 | | #endif |
187 | 0 | #if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED) |
188 | | // on 64-bit systems, use the virtual address area after 2TiB for 4MiB aligned allocations |
189 | 0 | if (addr == NULL) { |
190 | 0 | void* hint = _mi_os_get_aligned_hint(try_alignment, size); |
191 | 0 | if (hint != NULL) { |
192 | 0 | p = mmap(hint, size, protect_flags, flags, fd, 0); |
193 | 0 | if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) { |
194 | | #if MI_TRACK_ENABLED // asan sometimes does not instrument errno correctly? |
195 | | int err = 0; |
196 | | #else |
197 | 0 | int err = errno; |
198 | 0 | #endif |
199 | 0 | _mi_verbose_message("unable to directly request hinted aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, hint); |
200 | 0 | } |
201 | 0 | if (p!=MAP_FAILED) return p; |
202 | | // fall back to regular mmap |
203 | 0 | } |
204 | 0 | } |
205 | 0 | #endif |
206 | | // regular mmap |
207 | 0 | p = mmap(addr, size, protect_flags, flags, fd, 0); |
208 | 0 | if (p!=MAP_FAILED) return p; |
209 | | // failed to allocate |
210 | 0 | return NULL; |
211 | 0 | } |
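
`_mi_os_get_aligned_hint` (used above) is defined elsewhere in mimalloc (`os.c`). A hypothetical sketch of the idea the 2 TiB comment describes — the names and bounds here are assumptions, not mimalloc's actual implementation:

/* illustrative sketch (assumption -- the real _mi_os_get_aligned_hint lives in
   os.c): hand out hint addresses from a high, rarely used part of the 64-bit
   address space, bumping an atomic cursor so concurrent callers get distinct
   hints. mmap treats the address purely as a hint, so a busy range degrades
   gracefully to a kernel-chosen address (the caller re-checks alignment). */
#include <stdatomic.h>
#include <stdint.h>
#include <stddef.h>

#define HINT_BASE ((uintptr_t)2 << 40)   /* start at 2 TiB */
#define HINT_MAX  ((uintptr_t)6 << 40)   /* stop handing out hints past 6 TiB */

static _Atomic uintptr_t hint_cursor = HINT_BASE;

static void* os_get_aligned_hint(size_t try_alignment, size_t size) {
  if (try_alignment == 0 || (try_alignment & (try_alignment - 1)) != 0) {
    return NULL;  /* only power-of-two alignments */
  }
  uintptr_t step = (size + try_alignment - 1) & ~((uintptr_t)try_alignment - 1);
  uintptr_t hint = atomic_fetch_add(&hint_cursor, step);
  hint = (hint + try_alignment - 1) & ~((uintptr_t)try_alignment - 1);  /* align up */
  if (hint < HINT_BASE || hint + size > HINT_MAX) return NULL;
  return (void*)hint;
}
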
212 | | |
213 | 0 | static int unix_mmap_fd(void) { |
214 | | #if defined(VM_MAKE_TAG) |
215 | | // macOS: track anonymous pages with a specific ID (all IDs up to 98 are taken officially; the LLVM sanitizers took 99)
216 | | int os_tag = (int)mi_option_get(mi_option_os_tag); |
217 | | if (os_tag < 100 || os_tag > 255) { os_tag = 100; } |
218 | | return VM_MAKE_TAG(os_tag); |
219 | | #else |
220 | 0 | return -1; |
221 | 0 | #endif |
222 | 0 | } |
223 | | |
224 | 0 | static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protect_flags, bool large_only, bool allow_large, bool* is_large) { |
225 | | #if !defined(MAP_ANONYMOUS) |
226 | | #define MAP_ANONYMOUS MAP_ANON |
227 | | #endif |
228 | | #if !defined(MAP_NORESERVE) |
229 | | #define MAP_NORESERVE 0 |
230 | | #endif |
231 | 0 | void* p = NULL; |
232 | 0 | const int fd = unix_mmap_fd(); |
233 | 0 | int flags = MAP_PRIVATE | MAP_ANONYMOUS; |
234 | 0 | if (_mi_os_has_overcommit()) { |
235 | 0 | flags |= MAP_NORESERVE; |
236 | 0 | } |
237 | | #if defined(PROT_MAX) |
238 | | protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD |
239 | | #endif |
240 | | // huge page allocation |
241 | 0 | if ((large_only || _mi_os_use_large_page(size, try_alignment)) && allow_large) { |
242 | 0 | static _Atomic(size_t) large_page_try_ok; // = 0; |
243 | 0 | size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok); |
244 | 0 | if (!large_only && try_ok > 0) { |
245 | | // If the OS is not configured for large OS pages, or the user does not have
246 | | // enough permission, the `mmap` will always fail (though it might also fail for other reasons).
247 | | // Therefore, once a large page allocation has failed, we skip the next `large_page_try_ok`
248 | | // attempts to avoid too many failing calls to mmap.
249 | 0 | mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1); |
250 | 0 | } |
251 | 0 | else { |
252 | 0 | int lflags = flags & ~MAP_NORESERVE; // using NORESERVE on huge pages seems to fail on Linux |
253 | 0 | int lfd = fd; |
254 | | #ifdef MAP_ALIGNED_SUPER |
255 | | lflags |= MAP_ALIGNED_SUPER; |
256 | | #endif |
257 | 0 | #ifdef MAP_HUGETLB |
258 | 0 | lflags |= MAP_HUGETLB; |
259 | 0 | #endif |
260 | 0 | #ifdef MAP_HUGE_1GB |
261 | 0 | static bool mi_huge_pages_available = true; |
262 | 0 | if ((size % MI_GiB) == 0 && mi_huge_pages_available) { |
263 | 0 | lflags |= MAP_HUGE_1GB; |
264 | 0 | } |
265 | 0 | else |
266 | 0 | #endif |
267 | 0 | { |
268 | 0 | #ifdef MAP_HUGE_2MB |
269 | 0 | lflags |= MAP_HUGE_2MB; |
270 | 0 | #endif |
271 | 0 | } |
272 | | #ifdef VM_FLAGS_SUPERPAGE_SIZE_2MB |
273 | | lfd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; |
274 | | #endif |
275 | 0 | if (large_only || lflags != flags) { |
276 | | // try large OS page allocation |
277 | 0 | *is_large = true; |
278 | 0 | p = unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd); |
279 | 0 | #ifdef MAP_HUGE_1GB |
280 | 0 | if (p == NULL && (lflags & MAP_HUGE_1GB) != 0) { |
281 | 0 | mi_huge_pages_available = false; // don't try huge 1GiB pages again |
282 | 0 | _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (errno: %i)\n", errno); |
283 | 0 | lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB); |
284 | 0 | p = unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd); |
285 | 0 | } |
286 | 0 | #endif |
287 | 0 | if (large_only) return p; |
288 | 0 | if (p == NULL) { |
289 | 0 | mi_atomic_store_release(&large_page_try_ok, (size_t)8); // on error, don't try again for the next N allocations |
290 | 0 | } |
291 | 0 | } |
292 | 0 | } |
293 | 0 | } |
294 | | // regular allocation |
295 | 0 | if (p == NULL) { |
296 | 0 | *is_large = false; |
297 | 0 | p = unix_mmap_prim(addr, size, try_alignment, protect_flags, flags, fd); |
298 | 0 | if (p != NULL) { |
299 | 0 | #if defined(MADV_HUGEPAGE) |
300 | | // Many Linux systems don't allow MAP_HUGETLB but do support transparent
301 | | // huge pages (THP) instead. Generally it is not required to call `madvise`
302 | | // with MADV_HUGEPAGE, since properly aligned allocations will already use
303 | | // large pages where available -- in particular for our large regions (in `memory.c`).
304 | | // However, some systems only enable THP for ranges marked by an explicit
305 | | // `madvise`, so when large OS pages are enabled for mimalloc we call `madvise` anyway.
306 | 0 | if (allow_large && _mi_os_use_large_page(size, try_alignment)) { |
307 | 0 | if (unix_madvise(p, size, MADV_HUGEPAGE) == 0) { |
308 | 0 | *is_large = true; // possibly |
309 | 0 | }
310 | 0 | } |
311 | | #elif defined(__sun) |
312 | | if (allow_large && _mi_os_use_large_page(size, try_alignment)) { |
313 | | struct memcntl_mha cmd = {0}; |
314 | | cmd.mha_pagesize = 2*MI_MiB; |
315 | | cmd.mha_cmd = MHA_MAPSIZE_VA; |
316 | | if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) { |
317 | | *is_large = true; |
318 | | } |
319 | | } |
320 | | #endif |
321 | 0 | } |
322 | 0 | } |
323 | 0 | return p; |
324 | 0 | } |
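
The MADV_HUGEPAGE path above only requests transparent huge pages; the kernel may still back the range with 4 KiB pages, which is why `*is_large` is set with a "possibly" comment. A standalone Linux illustration, assuming THP is enabled in at least "madvise" mode:

#include <sys/mman.h>
#include <stdio.h>

int main(void) {
  const size_t size = 4u << 20;  /* 4 MiB: room for two 2 MiB huge pages */
  void* p = mmap(NULL, size, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (p == MAP_FAILED) return 1;
#ifdef MADV_HUGEPAGE
  /* a request, not a guarantee: the kernel promotes to huge pages only if
     the range is 2 MiB-aligned and THP is enabled ("always" or "madvise") */
  if (madvise(p, size, MADV_HUGEPAGE) != 0) perror("madvise");
#endif
  ((char*)p)[0] = 1;  /* touch so backing pages are actually faulted in */
  /* AnonHugePages in /proc/self/smaps shows whether THP was granted */
  munmap(p, size);
  return 0;
}
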
325 | | |
326 | | // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. |
327 | 0 | int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { |
328 | 0 | mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); |
329 | 0 | mi_assert_internal(commit || !allow_large); |
330 | 0 | mi_assert_internal(try_alignment > 0); |
331 | |
332 | 0 | *is_zero = true; |
333 | 0 | int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE); |
334 | 0 | *addr = unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large); |
335 | 0 | return (*addr != NULL ? 0 : errno); |
336 | 0 | } |
337 | | |
338 | | |
339 | | //--------------------------------------------- |
340 | | // Commit/Reset |
341 | | //--------------------------------------------- |
342 | | |
343 | 0 | static void unix_mprotect_hint(int err) { |
344 | | #if defined(__linux__) && (MI_SECURE>=2) // guard page around every mimalloc page |
345 | | if (err == ENOMEM) { |
346 | | _mi_warning_message("The next warning may be caused by a low memory map limit.\n" |
347 | | " On Linux this is controlled by the vm.max_map_count -- maybe increase it?\n" |
348 | | " For example: sudo sysctl -w vm.max_map_count=262144\n"); |
349 | | } |
350 | | #else |
351 | 0 | MI_UNUSED(err); |
352 | 0 | #endif |
353 | 0 | } |
354 | | |
355 | 0 | int _mi_prim_commit(void* start, size_t size, bool* is_zero) { |
356 | | // commit: ensure we can access the area |
357 | | // note: one might think *is_zero could be true, since the memory came
358 | | // either from mmap with PROT_NONE or from a decommit with MADV_DONTNEED;
359 | | // however, we sometimes commit a range that is still partially committed,
360 | | // and `mprotect` does not zero the range.
361 | 0 | *is_zero = false; |
362 | 0 | int err = mprotect(start, size, (PROT_READ | PROT_WRITE)); |
363 | 0 | if (err != 0) { |
364 | 0 | err = errno; |
365 | 0 | unix_mprotect_hint(err); |
366 | 0 | } |
367 | 0 | return err; |
368 | 0 | } |
369 | | |
370 | 0 | int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) { |
371 | 0 | int err = 0; |
372 | | // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE) |
373 | 0 | err = unix_madvise(start, size, MADV_DONTNEED); |
374 | 0 | #if !MI_DEBUG && !MI_SECURE |
375 | 0 | *needs_recommit = false; |
376 | | #else |
377 | | *needs_recommit = true; |
378 | | mprotect(start, size, PROT_NONE); |
379 | | #endif |
380 | | /* |
381 | | // decommit: use mmap with MAP_FIXED and PROT_NONE to discard the existing memory (and reduce rss) |
382 | | *needs_recommit = true; |
383 | | const int fd = unix_mmap_fd(); |
384 | | void* p = mmap(start, size, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), fd, 0); |
385 | | if (p != start) { err = errno; } |
386 | | */ |
387 | 0 | return err; |
388 | 0 | } |
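
Taken together, `_mi_prim_alloc`, `_mi_prim_commit`, and `_mi_prim_decommit` implement a reserve/commit/decommit lifecycle on top of plain `mmap`. A minimal sketch of that lifecycle, assuming Linux semantics for anonymous private mappings:

#include <sys/mman.h>
#include <stddef.h>

int main(void) {
  const size_t size = 1u << 20;
  /* reserve: address space only -- with MAP_NORESERVE and overcommit this
     consumes no memory and no swap (cf. has_virtual_reserve above) */
  char* p = mmap(NULL, size, PROT_NONE,
                 MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
  if (p == MAP_FAILED) return 1;
  /* commit: make the range accessible (cf. _mi_prim_commit) */
  if (mprotect(p, size, PROT_READ | PROT_WRITE) != 0) return 1;
  p[0] = 42;
  /* decommit: drop the pages but keep the reservation (cf. _mi_prim_decommit);
     the anonymous range reads back zero-filled when touched again */
  madvise(p, size, MADV_DONTNEED);
  munmap(p, size);
  return 0;
}
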
389 | | |
390 | 0 | int _mi_prim_reset(void* start, size_t size) { |
391 | | // We try to use `MADV_FREE` as it is the fastest. A drawback is that it
392 | | // will not reduce the `rss` stat in tools like `top`, even though the memory
393 | | // is available to other processes. With the default `MIMALLOC_PURGE_DECOMMITS=1`
394 | | // we ensure that `MADV_DONTNEED` is used by default instead.
395 | 0 | #if defined(MADV_FREE) |
396 | 0 | static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE); |
397 | 0 | int oadvice = (int)mi_atomic_load_relaxed(&advice); |
398 | 0 | int err; |
399 | 0 | while ((err = unix_madvise(start, size, oadvice)) != 0 && errno == EAGAIN) { errno = 0; }; |
400 | 0 | if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) { |
401 | | // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on |
402 | 0 | mi_atomic_store_release(&advice, (size_t)MADV_DONTNEED); |
403 | 0 | err = unix_madvise(start, size, MADV_DONTNEED); |
404 | 0 | } |
405 | | #else |
406 | | int err = unix_madvise(start, size, MADV_DONTNEED); |
407 | | #endif |
408 | 0 | return err; |
409 | 0 | } |
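
The trade-off the comment above describes is observable directly: MADV_FREE is lazy (rss and often the old contents linger until memory pressure), while MADV_DONTNEED drops the pages eagerly and guarantees zero-fill on the next touch. A small hedged demo (Linux 4.5+ for MADV_FREE):

#include <sys/mman.h>
#include <stdio.h>

int main(void) {
  char* p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (p == MAP_FAILED) return 1;
  p[0] = 'x';
#ifdef MADV_FREE
  /* lazy: pages are reclaimed only under memory pressure, so a read here
     may still see the old 'x' */
  madvise(p, 4096, MADV_FREE);
  printf("after MADV_FREE:     0x%02x\n", (unsigned char)p[0]);
#endif
  /* eager: rss drops immediately; the next touch is guaranteed zero-fill */
  madvise(p, 4096, MADV_DONTNEED);
  printf("after MADV_DONTNEED: 0x%02x\n", (unsigned char)p[0]);
  munmap(p, 4096);
  return 0;
}
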
410 | | |
411 | 0 | int _mi_prim_protect(void* start, size_t size, bool protect) { |
412 | 0 | int err = mprotect(start, size, protect ? PROT_NONE : (PROT_READ | PROT_WRITE)); |
413 | 0 | if (err != 0) { err = errno; } |
414 | 0 | unix_mprotect_hint(err); |
415 | 0 | return err; |
416 | 0 | } |
417 | | |
418 | | |
419 | | |
420 | | //--------------------------------------------- |
421 | | // Huge page allocation |
422 | | //--------------------------------------------- |
423 | | |
424 | | #if (MI_INTPTR_SIZE >= 8) && !defined(__HAIKU__) && !defined(__CYGWIN__) |
425 | | |
426 | | #ifndef MPOL_PREFERRED |
427 | 0 | #define MPOL_PREFERRED 1 |
428 | | #endif |
429 | | |
430 | | #if defined(MI_HAS_SYSCALL_H) && defined(SYS_mbind) |
431 | 0 | static long mi_prim_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) { |
432 | 0 | return syscall(SYS_mbind, start, len, mode, nmask, maxnode, flags); |
433 | 0 | } |
434 | | #else |
435 | | static long mi_prim_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) { |
436 | | MI_UNUSED(start); MI_UNUSED(len); MI_UNUSED(mode); MI_UNUSED(nmask); MI_UNUSED(maxnode); MI_UNUSED(flags); |
437 | | return 0; |
438 | | } |
439 | | #endif |
440 | | |
441 | 0 | int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) { |
442 | 0 | bool is_large = true; |
443 | 0 | *is_zero = true; |
444 | 0 | *addr = unix_mmap(hint_addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large); |
445 | 0 | if (*addr != NULL && numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes |
446 | 0 | unsigned long numa_mask = (1UL << numa_node); |
447 | | // TODO: does `mbind` work correctly for huge OS pages? should we |
448 | | // use `set_mempolicy` before calling mmap instead? |
449 | | // see: <https://lkml.org/lkml/2017/2/9/875> |
450 | 0 | long err = mi_prim_mbind(*addr, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0); |
451 | 0 | if (err != 0) { |
452 | 0 | err = errno; |
453 | 0 | _mi_warning_message("failed to bind huge (1GiB) pages to numa node %d (error: %d (0x%x))\n", numa_node, err, err); |
454 | 0 | } |
455 | 0 | } |
456 | 0 | return (*addr != NULL ? 0 : errno); |
457 | 0 | } |
458 | | |
459 | | #else |
460 | | |
461 | | int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) { |
462 | | MI_UNUSED(hint_addr); MI_UNUSED(size); MI_UNUSED(numa_node); |
463 | | *is_zero = false; |
464 | | *addr = NULL; |
465 | | return ENOMEM; |
466 | | } |
467 | | |
468 | | #endif |
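
On the TODO about `set_mempolicy` in the Linux branch above: that alternative could look roughly as follows — a sketch only, using the raw syscall in the same style as `mi_prim_mbind`; `mi_prim_set_mempolicy` is a hypothetical name:

/* sketch of the set_mempolicy alternative mentioned in the TODO (assumption,
   not what this file does): set a process-wide preferred node before the
   mmap so the huge pages fault in on that node, then restore the default. */
#if defined(MI_HAS_SYSCALL_H) && defined(SYS_set_mempolicy)
#ifndef MPOL_DEFAULT
#define MPOL_DEFAULT 0
#endif
static long mi_prim_set_mempolicy(int mode, const unsigned long* nmask, unsigned long maxnode) {
  return syscall(SYS_set_mempolicy, mode, nmask, maxnode);
}
/* hypothetical usage around the mmap:
     unsigned long numa_mask = (1UL << numa_node);
     mi_prim_set_mempolicy(MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE);
     ... unix_mmap(...) ...
     mi_prim_set_mempolicy(MPOL_DEFAULT, NULL, 0);                        */
#endif
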
469 | | |
470 | | //--------------------------------------------- |
471 | | // NUMA nodes |
472 | | //--------------------------------------------- |
473 | | |
474 | | #if defined(__linux__) |
475 | | |
476 | | #include <stdio.h> // snprintf |
477 | | |
478 | 0 | size_t _mi_prim_numa_node(void) { |
479 | 0 | #if defined(MI_HAS_SYSCALL_H) && defined(SYS_getcpu) |
480 | 0 | unsigned long node = 0; |
481 | 0 | unsigned long ncpu = 0; |
482 | 0 | long err = syscall(SYS_getcpu, &ncpu, &node, NULL); |
483 | 0 | if (err != 0) return 0; |
484 | 0 | return node; |
485 | | #else |
486 | | return 0; |
487 | | #endif |
488 | 0 | } |
489 | | |
490 | 0 | size_t _mi_prim_numa_node_count(void) { |
491 | 0 | char buf[128]; |
492 | 0 | unsigned node = 0; |
493 | 0 | for(node = 0; node < 256; node++) { |
494 | | // enumerate node entries -- todo: is there a more efficient way to do this? (but ensure there is no allocation)
495 | 0 | snprintf(buf, 127, "/sys/devices/system/node/node%u", node + 1); |
496 | 0 | if (mi_prim_access(buf,R_OK) != 0) break; |
497 | 0 | } |
498 | 0 | return (node+1); |
499 | 0 | } |
500 | | |
501 | | #elif defined(__FreeBSD__) && __FreeBSD_version >= 1200000 |
502 | | |
503 | | size_t _mi_prim_numa_node(void) { |
504 | | domainset_t dom; |
505 | | size_t node; |
506 | | int policy; |
507 | | if (cpuset_getdomain(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, sizeof(dom), &dom, &policy) == -1) return 0ul; |
508 | | for (node = 0; node < MAXMEMDOM; node++) { |
509 | | if (DOMAINSET_ISSET(node, &dom)) return node; |
510 | | } |
511 | | return 0ul; |
512 | | } |
513 | | |
514 | | size_t _mi_prim_numa_node_count(void) { |
515 | | size_t ndomains = 0; |
516 | | size_t len = sizeof(ndomains); |
517 | | if (sysctlbyname("vm.ndomains", &ndomains, &len, NULL, 0) == -1) return 0ul; |
518 | | return ndomains; |
519 | | } |
520 | | |
521 | | #elif defined(__DragonFly__) |
522 | | |
523 | | size_t _mi_prim_numa_node(void) { |
524 | | // TODO: DragonFly does not seem to provide any userland means to get this information. |
525 | | return 0ul; |
526 | | } |
527 | | |
528 | | size_t _mi_prim_numa_node_count(void) { |
529 | | size_t ncpus = 0, nvirtcoresperphys = 0; |
530 | | size_t len = sizeof(size_t); |
531 | | if (sysctlbyname("hw.ncpu", &ncpus, &len, NULL, 0) == -1) return 0ul; |
532 | | if (sysctlbyname("hw.cpu_topology_ht_ids", &nvirtcoresperphys, &len, NULL, 0) == -1) return 0ul; |
533 | | return nvirtcoresperphys * ncpus; |
534 | | } |
535 | | |
536 | | #else |
537 | | |
538 | | size_t _mi_prim_numa_node(void) { |
539 | | return 0; |
540 | | } |
541 | | |
542 | | size_t _mi_prim_numa_node_count(void) { |
543 | | return 1; |
544 | | } |
545 | | |
546 | | #endif |
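
On the `todo` in the Linux branch above: the kernel also publishes the online node range in a single file, /sys/devices/system/node/online (e.g. "0-3"), so one read could replace the per-node probe loop. A hedged, allocation-free sketch reusing the wrappers from the top of this file (`mi_linux_numa_node_count_fast` is a hypothetical name):

/* hypothetical alternative for _mi_prim_numa_node_count on Linux: parse the
   highest node id out of a range list such as "0", "0-3", or "0,2-3" */
static size_t mi_linux_numa_node_count_fast(void) {
  char buf[32] = {0};
  int fd = mi_prim_open("/sys/devices/system/node/online", O_RDONLY);
  if (fd < 0) return 1;
  ssize_t n = mi_prim_read(fd, buf, sizeof(buf) - 1);
  mi_prim_close(fd);
  size_t val = 0;      /* number currently being parsed */
  size_t maxnode = 0;  /* highest node id seen so far */
  for (ssize_t i = 0; i < n; i++) {
    if (buf[i] >= '0' && buf[i] <= '9') {
      val = val*10 + (size_t)(buf[i] - '0');
      if (val > maxnode) { maxnode = val; }
    }
    else {
      val = 0;  /* '-', ',' or '\n' terminates the current number */
    }
  }
  return maxnode + 1;
}
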
547 | | |
548 | | // ---------------------------------------------------------------- |
549 | | // Clock |
550 | | // ---------------------------------------------------------------- |
551 | | |
552 | | #include <time.h> |
553 | | |
554 | | #if defined(CLOCK_REALTIME) || defined(CLOCK_MONOTONIC) |
555 | | |
556 | 48 | mi_msecs_t _mi_prim_clock_now(void) { |
557 | 48 | struct timespec t; |
558 | 48 | #ifdef CLOCK_MONOTONIC |
559 | 48 | clock_gettime(CLOCK_MONOTONIC, &t); |
560 | | #else |
561 | | clock_gettime(CLOCK_REALTIME, &t); |
562 | | #endif |
563 | 48 | return ((mi_msecs_t)t.tv_sec * 1000) + ((mi_msecs_t)t.tv_nsec / 1000000); |
564 | 48 | } |
565 | | |
566 | | #else |
567 | | |
568 | | // low resolution timer |
569 | | mi_msecs_t _mi_prim_clock_now(void) { |
570 | | #if !defined(CLOCKS_PER_SEC) || (CLOCKS_PER_SEC == 1000) || (CLOCKS_PER_SEC == 0) |
571 | | return (mi_msecs_t)clock(); |
572 | | #elif (CLOCKS_PER_SEC < 1000) |
573 | | return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC); |
574 | | #else |
575 | | return (mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000); |
576 | | #endif |
577 | | } |
578 | | |
579 | | #endif |
580 | | |
581 | | |
582 | | |
583 | | |
584 | | //---------------------------------------------------------------- |
585 | | // Process info |
586 | | //---------------------------------------------------------------- |
587 | | |
588 | | #if defined(__unix__) || defined(__unix) || defined(unix) || defined(__APPLE__) || defined(__HAIKU__) |
589 | | #include <stdio.h> |
590 | | #include <unistd.h> |
591 | | #include <sys/resource.h> |
592 | | |
593 | | #if defined(__APPLE__) |
594 | | #include <mach/mach.h> |
595 | | #endif |
596 | | |
597 | | #if defined(__HAIKU__) |
598 | | #include <kernel/OS.h> |
599 | | #endif |
600 | | |
601 | 0 | static mi_msecs_t timeval_secs(const struct timeval* tv) { |
602 | 0 | return ((mi_msecs_t)tv->tv_sec * 1000L) + ((mi_msecs_t)tv->tv_usec / 1000L); |
603 | 0 | } |
604 | | |
605 | | void _mi_prim_process_info(mi_process_info_t* pinfo) |
606 | 0 | { |
607 | 0 | struct rusage rusage; |
608 | 0 | getrusage(RUSAGE_SELF, &rusage); |
609 | 0 | pinfo->utime = timeval_secs(&rusage.ru_utime); |
610 | 0 | pinfo->stime = timeval_secs(&rusage.ru_stime); |
611 | 0 | #if !defined(__HAIKU__) |
612 | 0 | pinfo->page_faults = rusage.ru_majflt; |
613 | 0 | #endif |
614 | | #if defined(__HAIKU__) |
615 | | // Haiku does not (yet?) have a way to
616 | | // get these stats per process
617 | | thread_info tid; |
618 | | area_info mem; |
619 | | ssize_t c; |
620 | | get_thread_info(find_thread(0), &tid); |
621 | | while (get_next_area_info(tid.team, &c, &mem) == B_OK) { |
622 | | pinfo->peak_rss += mem.ram_size; |
623 | | } |
624 | | pinfo->page_faults = 0; |
625 | | #elif defined(__APPLE__) |
626 | | pinfo->peak_rss = rusage.ru_maxrss; // macos reports in bytes |
627 | | #ifdef MACH_TASK_BASIC_INFO |
628 | | struct mach_task_basic_info info; |
629 | | mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT; |
630 | | if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&info, &infoCount) == KERN_SUCCESS) { |
631 | | pinfo->current_rss = (size_t)info.resident_size; |
632 | | } |
633 | | #else |
634 | | struct task_basic_info info; |
635 | | mach_msg_type_number_t infoCount = TASK_BASIC_INFO_COUNT; |
636 | | if (task_info(mach_task_self(), TASK_BASIC_INFO, (task_info_t)&info, &infoCount) == KERN_SUCCESS) { |
637 | | pinfo->current_rss = (size_t)info.resident_size; |
638 | | } |
639 | | #endif |
640 | | #else |
641 | 0 | pinfo->peak_rss = rusage.ru_maxrss * 1024; // Linux/BSD report in KiB |
642 | 0 | #endif |
643 | | // use defaults for commit |
644 | 0 | } |
645 | | |
646 | | #else |
647 | | |
648 | | #ifndef __wasi__ |
649 | | // WebAssembly instances are not processes |
650 | | #pragma message("define a way to get process info") |
651 | | #endif |
652 | | |
653 | | void _mi_prim_process_info(mi_process_info_t* pinfo) |
654 | | { |
655 | | // use defaults |
656 | | MI_UNUSED(pinfo); |
657 | | } |
658 | | |
659 | | #endif |
660 | | |
661 | | |
662 | | //---------------------------------------------------------------- |
663 | | // Output |
664 | | //---------------------------------------------------------------- |
665 | | |
666 | 0 | void _mi_prim_out_stderr( const char* msg ) { |
667 | 0 | fputs(msg,stderr); |
668 | 0 | } |
669 | | |
670 | | |
671 | | //---------------------------------------------------------------- |
672 | | // Environment |
673 | | //---------------------------------------------------------------- |
674 | | |
675 | | #if !defined(MI_USE_ENVIRON) || (MI_USE_ENVIRON!=0) |
676 | | // On POSIX systems, use `environ` to access environment variables
677 | | // even before the C runtime is initialized. |
678 | | #if defined(__APPLE__) && defined(__has_include) && __has_include(<crt_externs.h>) |
679 | | #include <crt_externs.h> |
680 | | static char** mi_get_environ(void) { |
681 | | return (*_NSGetEnviron()); |
682 | | } |
683 | | #else |
684 | | extern char** environ; |
685 | 512 | static char** mi_get_environ(void) { |
686 | 512 | return environ; |
687 | 512 | } |
688 | | #endif |
689 | 512 | bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { |
690 | 512 | if (name==NULL) return false; |
691 | 512 | const size_t len = _mi_strlen(name); |
692 | 512 | if (len == 0) return false; |
693 | 512 | char** env = mi_get_environ(); |
694 | 512 | if (env == NULL) return false; |
695 | | // compare up to 10000 entries |
696 | 18.4k | for (int i = 0; i < 10000 && env[i] != NULL; i++) { |
697 | 17.9k | const char* s = env[i]; |
698 | 17.9k | if (_mi_strnicmp(name, s, len) == 0 && s[len] == '=') { // case insensitive |
699 | | // found it |
700 | 0 | _mi_strlcpy(result, s + len + 1, result_size); |
701 | 0 | return true; |
702 | 0 | } |
703 | 17.9k | } |
704 | 512 | return false; |
705 | 512 | } |
706 | | #else |
707 | | // fallback: use standard C `getenv` but this cannot be used while initializing the C runtime |
708 | | bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { |
709 | | // cannot call getenv() when still initializing the C runtime. |
710 | | if (_mi_preloading()) return false; |
711 | | const char* s = getenv(name); |
712 | | if (s == NULL) { |
713 | | // we check the upper case name too. |
714 | | char buf[64+1]; |
715 | | size_t len = _mi_strnlen(name,sizeof(buf)-1); |
716 | | for (size_t i = 0; i < len; i++) { |
717 | | buf[i] = _mi_toupper(name[i]); |
718 | | } |
719 | | buf[len] = 0; |
720 | | s = getenv(buf); |
721 | | } |
722 | | if (s == NULL || _mi_strnlen(s,result_size) >= result_size) return false; |
723 | | _mi_strlcpy(result, s, result_size); |
724 | | return true; |
725 | | } |
726 | | #endif // !MI_USE_ENVIRON |
727 | | |
728 | | |
729 | | //---------------------------------------------------------------- |
730 | | // Random |
731 | | //---------------------------------------------------------------- |
732 | | |
733 | | #if defined(__APPLE__) |
734 | | |
735 | | #include <AvailabilityMacros.h> |
736 | | #if defined(MAC_OS_X_VERSION_10_10) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_10 |
737 | | #include <CommonCrypto/CommonCryptoError.h> |
738 | | #include <CommonCrypto/CommonRandom.h> |
739 | | #endif |
740 | | bool _mi_prim_random_buf(void* buf, size_t buf_len) { |
741 | | #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15 |
742 | | // We prefer CCRandomGenerateBytes as it returns an error code while arc4random_buf |
743 | | // may fail silently on macOS. See PR #390, and <https://opensource.apple.com/source/Libc/Libc-1439.40.11/gen/FreeBSD/arc4random.c.auto.html> |
744 | | return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess); |
745 | | #else |
746 | | // fall back on older macOS |
747 | | arc4random_buf(buf, buf_len); |
748 | | return true; |
749 | | #endif |
750 | | } |
751 | | |
752 | | #elif defined(__ANDROID__) || defined(__DragonFly__) || \ |
753 | | defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ |
754 | | defined(__sun) |
755 | | |
756 | | #include <stdlib.h> |
757 | | bool _mi_prim_random_buf(void* buf, size_t buf_len) { |
758 | | arc4random_buf(buf, buf_len); |
759 | | return true; |
760 | | } |
761 | | |
762 | | #elif defined(__linux__) || defined(__HAIKU__) |
763 | | |
764 | | #include <sys/types.h> |
765 | | #include <sys/stat.h> |
766 | | #include <fcntl.h> |
767 | | #include <errno.h> |
768 | | |
769 | 16 | bool _mi_prim_random_buf(void* buf, size_t buf_len) { |
770 | | // Modern Linux provides `getrandom` but different distributions either use `sys/random.h` or `linux/random.h` |
771 | | // and for the latter the actual `getrandom` call is not always defined. |
772 | | // (see <https://stackoverflow.com/questions/45237324/why-doesnt-getrandom-compile>) |
773 | | // We therefore use a syscall directly and fall back dynamically to /dev/urandom when needed. |
774 | 16 | #if defined(MI_HAS_SYSCALL_H) && defined(SYS_getrandom) |
775 | 16 | #ifndef GRND_NONBLOCK |
776 | 16 | #define GRND_NONBLOCK (1) |
777 | 16 | #endif |
778 | 16 | static _Atomic(uintptr_t) no_getrandom; // = 0 |
779 | 16 | if (mi_atomic_load_acquire(&no_getrandom)==0) { |
780 | 16 | ssize_t ret = syscall(SYS_getrandom, buf, buf_len, GRND_NONBLOCK); |
781 | 16 | if (ret >= 0) return (buf_len == (size_t)ret); |
782 | 0 | if (errno != ENOSYS) return false; |
783 | 0 | mi_atomic_store_release(&no_getrandom, (uintptr_t)1); // don't call again, and fall back to /dev/urandom |
784 | 0 | } |
785 | 0 | #endif |
786 | 0 | int flags = O_RDONLY; |
787 | 0 | #if defined(O_CLOEXEC) |
788 | 0 | flags |= O_CLOEXEC; |
789 | 0 | #endif |
790 | 0 | int fd = mi_prim_open("/dev/urandom", flags); |
791 | 0 | if (fd < 0) return false; |
792 | 0 | size_t count = 0; |
793 | 0 | while(count < buf_len) { |
794 | 0 | ssize_t ret = mi_prim_read(fd, (char*)buf + count, buf_len - count); |
795 | 0 | if (ret<=0) { |
796 | 0 | if (errno!=EAGAIN && errno!=EINTR) break; |
797 | 0 | } |
798 | 0 | else { |
799 | 0 | count += ret; |
800 | 0 | } |
801 | 0 | } |
802 | 0 | mi_prim_close(fd); |
803 | 0 | return (count==buf_len); |
804 | 0 | } |
805 | | |
806 | | #else |
807 | | |
808 | | bool _mi_prim_random_buf(void* buf, size_t buf_len) { |
809 | | return false; |
810 | | } |
811 | | |
812 | | #endif |
813 | | |
814 | | |
815 | | //---------------------------------------------------------------- |
816 | | // Thread init/done |
817 | | //---------------------------------------------------------------- |
818 | | |
819 | | #if defined(MI_USE_PTHREADS) |
820 | | |
821 | | // use pthread local storage keys to detect thread ending |
822 | | // (and used with MI_TLS_PTHREADS for the default heap) |
823 | | pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1); |
824 | | |
825 | 0 | static void mi_pthread_done(void* value) { |
826 | 0 | if (value!=NULL) { |
827 | 0 | _mi_thread_done((mi_heap_t*)value); |
828 | 0 | } |
829 | 0 | } |
830 | | |
831 | 16 | void _mi_prim_thread_init_auto_done(void) { |
832 | 16 | mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1)); |
833 | 16 | pthread_key_create(&_mi_heap_default_key, &mi_pthread_done); |
834 | 16 | } |
835 | | |
836 | 0 | void _mi_prim_thread_done_auto_done(void) { |
837 | | // nothing to do |
838 | 0 | } |
839 | | |
840 | 16 | void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { |
841 | 16 | if (_mi_heap_default_key != (pthread_key_t)(-1)) { // can happen during recursive invocation on freeBSD |
842 | 16 | pthread_setspecific(_mi_heap_default_key, heap); |
843 | 16 | } |
844 | 16 | } |
845 | | |
846 | | #else |
847 | | |
848 | | void _mi_prim_thread_init_auto_done(void) { |
849 | | // nothing |
850 | | } |
851 | | |
852 | | void _mi_prim_thread_done_auto_done(void) { |
853 | | // nothing |
854 | | } |
855 | | |
856 | | void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { |
857 | | MI_UNUSED(heap); |
858 | | } |
859 | | |
860 | | #endif |
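
A standalone illustration of the pthread-key trick in the MI_USE_PTHREADS branch above: a destructor registered with `pthread_key_create` runs at thread exit for every thread that stored a non-NULL value under the key, which is how mimalloc gets a thread-done callback without hooking thread creation:

#include <pthread.h>
#include <stdio.h>

static pthread_key_t key;

static void on_thread_exit(void* value) {
  /* runs automatically when a thread with a non-NULL key value exits */
  printf("thread done: %s\n", (const char*)value);
}

static void* worker(void* arg) {
  (void)arg;
  pthread_setspecific(key, (void*)"worker");  /* non-NULL arms the destructor */
  return NULL;
}

int main(void) {
  pthread_key_create(&key, &on_thread_exit);
  pthread_t t;
  pthread_create(&t, NULL, &worker, NULL);
  pthread_join(t, NULL);  /* "thread done: worker" was printed at thread exit */
  return 0;
}
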