/src/cpython/Objects/mimalloc/os.c
Line | Count | Source |
1 | | /* ---------------------------------------------------------------------------- |
2 | | Copyright (c) 2018-2023, Microsoft Research, Daan Leijen |
3 | | This is free software; you can redistribute it and/or modify it under the |
4 | | terms of the MIT license. A copy of the license can be found in the file |
5 | | "LICENSE" at the root of this distribution. |
6 | | -----------------------------------------------------------------------------*/ |
7 | | #include "mimalloc.h" |
8 | | #include "mimalloc/internal.h" |
9 | | #include "mimalloc/atomic.h" |
10 | | #include "mimalloc/prim.h" |
11 | | |
12 | | |
13 | | /* ----------------------------------------------------------- |
14 | | Initialization. |
15 | | On windows initializes support for aligned allocation and |
16 | | large OS pages (if MIMALLOC_LARGE_OS_PAGES is true). |
17 | | ----------------------------------------------------------- */ |
18 | | |
19 | | static mi_os_mem_config_t mi_os_mem_config = { |
20 | | 4096, // page size |
21 | | 0, // large page size (usually 2MiB) |
22 | | 4096, // allocation granularity |
23 | | true, // has overcommit? (if true we use MAP_NORESERVE on mmap systems) |
24 | | false, // must free whole? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span) |
25 | | true // has virtual reserve? (if true we can reserve virtual address space without using commit or physical memory) |
26 | | }; |
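/* [editor's note -- a hedged sketch, not part of os.c] These are only the
   static defaults; _mi_prim_mem_init below overwrites them per platform. For
   illustration (field names assumed from mimalloc/prim.h), a typical Linux
   configuration would end up roughly as:

     config->page_size           = 4096;       // sysconf(_SC_PAGESIZE)
     config->large_page_size     = 2*MI_MiB;   // x86-64 huge page size
     config->alloc_granularity   = 4096;       // mmap works at page granularity
     config->has_overcommit      = true;       // depends on vm.overcommit_memory
     config->must_free_whole     = false;      // munmap can free partial ranges
     config->has_virtual_reserve = true;       // PROT_NONE reservations are cheap

   while Windows would report a 64KiB allocation granularity and
   must_free_whole=true, since VirtualFree requires the original base pointer. */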
27 | | |
28 | 0 | bool _mi_os_has_overcommit(void) { |
29 | 0 | return mi_os_mem_config.has_overcommit; |
30 | 0 | } |
31 | | |
32 | 0 | bool _mi_os_has_virtual_reserve(void) { |
33 | 0 | return mi_os_mem_config.has_virtual_reserve; |
34 | 0 | } |
35 | | |
36 | | |
37 | | // OS (small) page size |
38 | 0 | size_t _mi_os_page_size(void) { |
39 | 0 | return mi_os_mem_config.page_size; |
40 | 0 | } |
41 | | |
42 | | // if large OS pages are supported (2 or 4MiB), then return the size, otherwise return the small page size (4KiB) |
43 | 0 | size_t _mi_os_large_page_size(void) { |
44 | 0 | return (mi_os_mem_config.large_page_size != 0 ? mi_os_mem_config.large_page_size : _mi_os_page_size()); |
45 | 0 | } |
46 | | |
47 | 0 | bool _mi_os_use_large_page(size_t size, size_t alignment) { |
48 | | // if we have access, check the size and alignment requirements |
49 | 0 | if (mi_os_mem_config.large_page_size == 0 || !mi_option_is_enabled(mi_option_allow_large_os_pages)) return false; |
50 | 0 | return ((size % mi_os_mem_config.large_page_size) == 0 && (alignment % mi_os_mem_config.large_page_size) == 0); |
51 | 0 | } |
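/* [editor's example] With 2MiB large pages available and enabled:
     _mi_os_use_large_page(4*MI_MiB, 2*MI_MiB)  -> true   (both multiples of 2MiB)
     _mi_os_use_large_page(3*MI_MiB, 2*MI_MiB)  -> false  (3MiB % 2MiB != 0)
   i.e. large pages are only used when size and alignment fit them exactly. */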
52 | | |
53 | | // round to a good OS allocation size (bounded by max 12.5% waste) |
54 | 0 | size_t _mi_os_good_alloc_size(size_t size) { |
55 | 0 | size_t align_size; |
56 | 0 | if (size < 512*MI_KiB) align_size = _mi_os_page_size(); |
57 | 0 | else if (size < 2*MI_MiB) align_size = 64*MI_KiB; |
58 | 0 | else if (size < 8*MI_MiB) align_size = 256*MI_KiB; |
59 | 0 | else if (size < 32*MI_MiB) align_size = 1*MI_MiB; |
60 | 0 | else align_size = 4*MI_MiB; |
61 | 0 | if mi_unlikely(size >= (SIZE_MAX - align_size)) return size; // possible overflow? |
62 | 0 | return _mi_align_up(size, align_size); |
63 | 0 | } |
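/* [editor's note] Worked example of the 12.5% waste bound: each granularity is
   1/8th of its bucket's lower limit, so the worst case is a size just past a
   bucket boundary, e.g.
     size = 2MiB+1   -> rounds to 2MiB + 256KiB   (waste < 256KiB/2MiB = 12.5%)
     size = 32MiB+1  -> rounds to 32MiB + 4MiB    (waste < 4MiB/32MiB  = 12.5%)
*/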
64 | | |
65 | 16 | void _mi_os_init(void) { |
66 | 16 | _mi_prim_mem_init(&mi_os_mem_config); |
67 | 16 | } |
68 | | |
69 | | |
70 | | /* ----------------------------------------------------------- |
71 | | Util |
72 | | -------------------------------------------------------------- */ |
73 | | bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); |
74 | | bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats); |
75 | | |
76 | 0 | static void* mi_align_up_ptr(void* p, size_t alignment) { |
77 | 0 | return (void*)_mi_align_up((uintptr_t)p, alignment); |
78 | 0 | } |
79 | | |
80 | 0 | static void* mi_align_down_ptr(void* p, size_t alignment) { |
81 | 0 | return (void*)_mi_align_down((uintptr_t)p, alignment); |
82 | 0 | } |
83 | | |
84 | | |
85 | | /* ----------------------------------------------------------- |
86 | | aligned hinting |
87 | | -------------------------------------------------------------- */ |
88 | | |
89 | | // On 64-bit systems, we can do efficient aligned allocation by using |
90 | | // the 2TiB to 30TiB address area for such allocations. |
91 | | #if (MI_INTPTR_SIZE >= 8) |
92 | | static mi_decl_cache_align _Atomic(uintptr_t)aligned_base; |
93 | | |
94 | | // Return a MI_SEGMENT_SIZE aligned address that is probably available. |
95 | | // If this returns NULL, the OS will determine the address but on some OS's that may not be |
96 | | // properly aligned, which can be more costly as it needs to be adjusted afterwards. |
97 | | // For a size > 1GiB this always returns NULL in order to guarantee good ASLR randomization; |
98 | | // (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses |
99 | | // in the middle of the 2TiB - 6TiB address range (see issue #372)) |
100 | | |
101 | 0 | #define MI_HINT_BASE ((uintptr_t)2 << 40) // 2TiB start |
102 | 0 | #define MI_HINT_AREA ((uintptr_t)4 << 40) // up to 6TiB (since before win8 there is "only" 8TiB available to processes) |
103 | 0 | #define MI_HINT_MAX ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages) |
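/* [editor's note] Arithmetic behind these constants: hints are handed out
   sequentially from the 4TiB window at [2TiB, 6TiB). The randomized start
   below adds ((r>>17) & 0xFFFFF) * MI_SEGMENT_SIZE, i.e. 20 random bits times
   4MiB = up to 4TiB, taken modulo MI_HINT_AREA. When the running counter
   passes 30TiB it wraps, staying clear of the >= 32TiB area used for huge
   OS pages further below. */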
104 | | |
105 | | void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) |
106 | 0 | { |
107 | 0 | if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL; |
108 | 0 | size = _mi_align_up(size, MI_SEGMENT_SIZE); |
109 | 0 | if (size > 1*MI_GiB) return NULL; // guarantee the chance of fixed valid address is at most 1/(MI_HINT_AREA / 1<<30) = 1/4096. |
110 | | #if (MI_SECURE>0) |
111 | | size += MI_SEGMENT_SIZE; // put in `MI_SEGMENT_SIZE` virtual gaps between hinted blocks; this splits VLA's but increases guarded areas. |
112 | | #endif |
113 | | |
114 | 0 | uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size); |
115 | 0 | if (hint == 0 || hint > MI_HINT_MAX) { // wrap or initialize |
116 | 0 | uintptr_t init = MI_HINT_BASE; |
117 | 0 | #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode |
118 | 0 | mi_heap_t* heap = mi_prim_get_default_heap(); |
119 | | // gh-123022: default heap may not be initialized in CPython in background threads |
120 | 0 | if (mi_heap_is_initialized(heap)) { |
121 | 0 | uintptr_t r = _mi_heap_random_next(heap); |
122 | 0 | init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % MI_HINT_AREA); // (randomly 20 bits)*4MiB == 0 to 4TiB |
123 | 0 | } |
124 | 0 | #endif |
125 | 0 | uintptr_t expected = hint + size; |
126 | 0 | mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init); |
127 | 0 | hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > MI_HINT_MAX but that is ok, it is a hint after all |
128 | 0 | } |
129 | 0 | if (hint%try_alignment != 0) return NULL; |
130 | 0 | return (void*)hint; |
131 | 0 | } |
132 | | #else |
133 | | void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { |
134 | | MI_UNUSED(try_alignment); MI_UNUSED(size); |
135 | | return NULL; |
136 | | } |
137 | | #endif |
138 | | |
139 | | |
140 | | /* ----------------------------------------------------------- |
141 | | Free memory |
142 | | -------------------------------------------------------------- */ |
143 | | |
144 | | static void mi_os_free_huge_os_pages(void* p, size_t size, mi_stats_t* stats); |
145 | | |
146 | 0 | static void mi_os_prim_free(void* addr, size_t size, bool still_committed, mi_stats_t* tld_stats) { |
147 | 0 | MI_UNUSED(tld_stats); |
148 | 0 | mi_assert_internal((size % _mi_os_page_size()) == 0); |
149 | 0 | if (addr == NULL || size == 0) return; // || _mi_os_is_huge_reserved(addr) |
150 | 0 | int err = _mi_prim_free(addr, size); |
151 | 0 | if (err != 0) { |
152 | 0 | _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr); |
153 | 0 | } |
154 | 0 | mi_stats_t* stats = &_mi_stats_main; |
155 | 0 | if (still_committed) { _mi_stat_decrease(&stats->committed, size); } |
156 | 0 | _mi_stat_decrease(&stats->reserved, size); |
157 | 0 | } |
158 | | |
159 | 0 | void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* tld_stats) { |
160 | 0 | if (mi_memkind_is_os(memid.memkind)) { |
161 | 0 | size_t csize = _mi_os_good_alloc_size(size); |
162 | 0 | void* base = addr; |
163 | | // different base? (due to alignment) |
164 | 0 | if (memid.mem.os.base != NULL) { |
165 | 0 | mi_assert(memid.mem.os.base <= addr); |
166 | 0 | mi_assert((uint8_t*)memid.mem.os.base + memid.mem.os.alignment >= (uint8_t*)addr); |
167 | 0 | base = memid.mem.os.base; |
168 | 0 | csize += ((uint8_t*)addr - (uint8_t*)memid.mem.os.base); |
169 | 0 | } |
170 | | // free it |
171 | 0 | if (memid.memkind == MI_MEM_OS_HUGE) { |
172 | 0 | mi_assert(memid.is_pinned); |
173 | 0 | mi_os_free_huge_os_pages(base, csize, tld_stats); |
174 | 0 | } |
175 | 0 | else { |
176 | 0 | mi_os_prim_free(base, csize, still_committed, tld_stats); |
177 | 0 | } |
178 | 0 | } |
179 | 0 | else { |
180 | | // nothing to do |
181 | 0 | mi_assert(memid.memkind < MI_MEM_OS); |
182 | 0 | } |
183 | 0 | } |
184 | | |
185 | 0 | void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* tld_stats) { |
186 | 0 | _mi_os_free_ex(p, size, true, memid, tld_stats); |
187 | 0 | } |
188 | | |
189 | | |
190 | | /* ----------------------------------------------------------- |
191 | | Primitive allocation from the OS. |
192 | | -------------------------------------------------------------- */ |
193 | | |
194 | | // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. |
195 | 0 | static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, mi_stats_t* stats) { |
196 | 0 | mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); |
197 | 0 | mi_assert_internal(is_zero != NULL); |
198 | 0 | mi_assert_internal(is_large != NULL); |
199 | 0 | if (size == 0) return NULL; |
200 | 0 | if (!commit) { allow_large = false; } |
201 | 0 | if (try_alignment == 0) { try_alignment = 1; } // avoid 0 to ensure there will be no divide by zero when aligning |
202 | |
203 | 0 | *is_zero = false; |
204 | 0 | void* p = NULL; |
205 | 0 | int err = _mi_prim_alloc(size, try_alignment, commit, allow_large, is_large, is_zero, &p); |
206 | 0 | if (err != 0) { |
207 | 0 | _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, size, try_alignment, commit, allow_large); |
208 | 0 | } |
209 | 0 | mi_stat_counter_increase(stats->mmap_calls, 1); |
210 | 0 | if (p != NULL) { |
211 | 0 | _mi_stat_increase(&stats->reserved, size); |
212 | 0 | if (commit) { |
213 | 0 | _mi_stat_increase(&stats->committed, size); |
214 | | // seems needed for asan (or `mimalloc-test-api` fails) |
215 | | #ifdef MI_TRACK_ASAN |
216 | | if (*is_zero) { mi_track_mem_defined(p,size); } |
217 | | else { mi_track_mem_undefined(p,size); } |
218 | | #endif |
219 | 0 | } |
220 | 0 | } |
221 | 0 | return p; |
222 | 0 | } |
223 | | |
224 | | |
225 | | // Primitive aligned allocation from the OS. |
226 | | // This function guarantees the allocated memory is aligned. |
227 | 0 | static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** base, mi_stats_t* stats) { |
228 | 0 | mi_assert_internal(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0)); |
229 | 0 | mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); |
230 | 0 | mi_assert_internal(is_large != NULL); |
231 | 0 | mi_assert_internal(is_zero != NULL); |
232 | 0 | mi_assert_internal(base != NULL); |
233 | 0 | if (!commit) allow_large = false; |
234 | 0 | if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL; |
235 | 0 | size = _mi_align_up(size, _mi_os_page_size()); |
236 | | |
237 | | // try first with a hint (this will be aligned directly on Win 10+ or BSD) |
238 | 0 | void* p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero, stats); |
239 | 0 | if (p == NULL) return NULL; |
240 | | |
241 | | // aligned already? |
242 | 0 | if (((uintptr_t)p % alignment) == 0) { |
243 | 0 | *base = p; |
244 | 0 | } |
245 | 0 | else { |
246 | | // if not aligned, free it, overallocate, and unmap around it |
247 | | // NOTE(sgross): this warning causes issues in Python tests |
248 | | // _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit); |
249 | 0 | mi_os_prim_free(p, size, commit, stats); |
250 | 0 | if (size >= (SIZE_MAX - alignment)) return NULL; // overflow |
251 | 0 | const size_t over_size = size + alignment; |
252 | |
253 | 0 | if (mi_os_mem_config.must_free_whole) { // win32 VirtualAlloc cannot free parts of an allocated block |
254 | | // over-allocate uncommitted (virtual) memory |
255 | 0 | p = mi_os_prim_alloc(over_size, 1 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, is_zero, stats); |
256 | 0 | if (p == NULL) return NULL; |
257 | | |
258 | | // set p to the aligned part in the full region |
259 | | // note: this is dangerous on Windows as VirtualFree needs the actual base pointer |
260 | | // this is handled, though, by keeping the `base` field in the memid |
261 | 0 | *base = p; // remember the base |
262 | 0 | p = mi_align_up_ptr(p, alignment); |
263 | | |
264 | | // explicitly commit only the aligned part |
265 | 0 | if (commit) { |
266 | 0 | _mi_os_commit(p, size, NULL, stats); |
267 | 0 | } |
268 | 0 | } |
269 | 0 | else { // mmap can free inside an allocation |
270 | | // overallocate... |
271 | 0 | p = mi_os_prim_alloc(over_size, 1, commit, false, is_large, is_zero, stats); |
272 | 0 | if (p == NULL) return NULL; |
273 | | |
274 | | // and selectively unmap parts around the over-allocated area. (noop on sbrk) |
275 | 0 | void* aligned_p = mi_align_up_ptr(p, alignment); |
276 | 0 | size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p; |
277 | 0 | size_t mid_size = _mi_align_up(size, _mi_os_page_size()); |
278 | 0 | size_t post_size = over_size - pre_size - mid_size; |
279 | 0 | mi_assert_internal(pre_size < over_size && post_size < over_size && mid_size >= size); |
280 | 0 | if (pre_size > 0) { mi_os_prim_free(p, pre_size, commit, stats); } |
281 | 0 | if (post_size > 0) { mi_os_prim_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats); } |
282 | | // we can return the aligned pointer on `mmap` (and sbrk) systems |
283 | 0 | p = aligned_p; |
284 | 0 | *base = aligned_p; // since we freed the pre part, `*base == p`. |
285 | 0 | } |
286 | 0 | } |
287 | | |
288 | 0 | mi_assert_internal(p == NULL || (p != NULL && *base != NULL && ((uintptr_t)p % alignment) == 0)); |
289 | 0 | return p; |
290 | 0 | } |
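/* [editor's example] Trimming arithmetic for the mmap path above, assuming a
   page-aligned p and a request of size = 1MiB, alignment = 64KiB:
     over_size = 1MiB + 64KiB
     aligned_p = mi_align_up_ptr(p, 64KiB)           // within [p, p + 64KiB)
     pre_size  = aligned_p - p                       // unmapped again
     mid_size  = 1MiB                                // kept and returned
     post_size = over_size - pre_size - mid_size     // unmapped again
   so exactly `size` bytes stay mapped, at the requested alignment. */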
291 | | |
292 | | |
293 | | /* ----------------------------------------------------------- |
294 | | OS API: alloc and alloc_aligned |
295 | | ----------------------------------------------------------- */ |
296 | | |
297 | 0 | void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* tld_stats) { |
298 | 0 | MI_UNUSED(tld_stats); |
299 | 0 | *memid = _mi_memid_none(); |
300 | 0 | mi_stats_t* stats = &_mi_stats_main; |
301 | 0 | if (size == 0) return NULL; |
302 | 0 | size = _mi_os_good_alloc_size(size); |
303 | 0 | bool os_is_large = false; |
304 | 0 | bool os_is_zero = false; |
305 | 0 | void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero, stats); |
306 | 0 | if (p != NULL) { |
307 | 0 | *memid = _mi_memid_create_os(true, os_is_zero, os_is_large); |
308 | 0 | } |
309 | 0 | return p; |
310 | 0 | } |
311 | | |
312 | | void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* tld_stats) |
313 | 0 | { |
314 | 0 | MI_UNUSED(&_mi_os_get_aligned_hint); // suppress unused warnings |
315 | 0 | MI_UNUSED(tld_stats); |
316 | 0 | *memid = _mi_memid_none(); |
317 | 0 | if (size == 0) return NULL; |
318 | 0 | size = _mi_os_good_alloc_size(size); |
319 | 0 | alignment = _mi_align_up(alignment, _mi_os_page_size()); |
320 | |
321 | 0 | bool os_is_large = false; |
322 | 0 | bool os_is_zero = false; |
323 | 0 | void* os_base = NULL; |
324 | 0 | void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base, &_mi_stats_main /*tld->stats*/ ); |
325 | 0 | if (p != NULL) { |
326 | 0 | *memid = _mi_memid_create_os(commit, os_is_zero, os_is_large); |
327 | 0 | memid->mem.os.base = os_base; |
328 | 0 | memid->mem.os.alignment = alignment; |
329 | 0 | } |
330 | 0 | return p; |
331 | 0 | } |
332 | | |
333 | | /* ----------------------------------------------------------- |
334 | | OS aligned allocation with an offset. This is used |
335 | | for large alignments > MI_ALIGNMENT_MAX. We use a large mimalloc |
336 | | page where the object can be aligned at an offset from the start of the segment. |
337 | | As we may need to overallocate, we need to free such pointers using `mi_free_aligned` |
338 | | to use the actual start of the memory region. |
339 | | ----------------------------------------------------------- */ |
340 | | |
341 | 0 | void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* tld_stats) { |
342 | 0 | mi_assert(offset <= MI_SEGMENT_SIZE); |
343 | 0 | mi_assert(offset <= size); |
344 | 0 | mi_assert((alignment % _mi_os_page_size()) == 0); |
345 | 0 | *memid = _mi_memid_none(); |
346 | 0 | if (offset > MI_SEGMENT_SIZE) return NULL; |
347 | 0 | if (offset == 0) { |
348 | | // regular aligned allocation |
349 | 0 | return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld_stats); |
350 | 0 | } |
351 | 0 | else { |
352 | | // overallocate to align at an offset |
353 | 0 | const size_t extra = _mi_align_up(offset, alignment) - offset; |
354 | 0 | const size_t oversize = size + extra; |
355 | 0 | void* const start = _mi_os_alloc_aligned(oversize, alignment, commit, allow_large, memid, tld_stats); |
356 | 0 | if (start == NULL) return NULL; |
357 | | |
358 | 0 | void* const p = (uint8_t*)start + extra; |
359 | 0 | mi_assert(_mi_is_aligned((uint8_t*)p + offset, alignment)); |
360 | | // decommit the overallocation at the start |
361 | 0 | if (commit && extra > _mi_os_page_size()) { |
362 | 0 | _mi_os_decommit(start, extra, tld_stats); |
363 | 0 | } |
364 | 0 | return p; |
365 | 0 | } |
366 | 0 | } |
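/* [editor's example] Offset arithmetic above: for alignment = 1MiB and
   offset = 16KiB,
     extra = _mi_align_up(16KiB, 1MiB) - 16KiB = 1MiB - 16KiB
     p     = start + extra
   so p + 16KiB == start + 1MiB, which is 1MiB-aligned since `start` is.
   The unused head of `extra` bytes is decommitted again when it is larger
   than an OS page. */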
367 | | |
368 | | /* ----------------------------------------------------------- |
369 | | OS memory API: reset, commit, decommit, protect, unprotect. |
370 | | ----------------------------------------------------------- */ |
371 | | |
372 | | // OS page align within a given area, either conservative (pages inside the area only), |
373 | | // or not (straddling pages outside the area is possible) |
374 | 0 | static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size, size_t* newsize) { |
375 | 0 | mi_assert(addr != NULL && size > 0); |
376 | 0 | if (newsize != NULL) *newsize = 0; |
377 | 0 | if (size == 0 || addr == NULL) return NULL; |
378 | | |
379 | | // page align conservatively within the range |
380 | 0 | void* start = (conservative ? mi_align_up_ptr(addr, _mi_os_page_size()) |
381 | 0 | : mi_align_down_ptr(addr, _mi_os_page_size())); |
382 | 0 | void* end = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size()) |
383 | 0 | : mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size())); |
384 | 0 | ptrdiff_t diff = (uint8_t*)end - (uint8_t*)start; |
385 | 0 | if (diff <= 0) return NULL; |
386 | | |
387 | 0 | mi_assert_internal((conservative && (size_t)diff <= size) || (!conservative && (size_t)diff >= size)); |
388 | 0 | if (newsize != NULL) *newsize = (size_t)diff; |
389 | 0 | return start; |
390 | 0 | } |
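/* [editor's example] With a 4KiB page size and the range [0x1800, 0x4800):
     conservative:      start = 0x2000, end = 0x4000   (only pages fully inside)
     non-conservative:  start = 0x1000, end = 0x5000   (straddles both ends)
   Decommit, reset, and protect use the conservative variant so they never
   touch bytes outside the caller's range; commit uses the non-conservative
   one so the whole requested range really becomes committed. */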
391 | | |
392 | 0 | static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* newsize) { |
393 | 0 | return mi_os_page_align_areax(true, addr, size, newsize); |
394 | 0 | } |
395 | | |
396 | 0 | bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) { |
397 | 0 | MI_UNUSED(tld_stats); |
398 | 0 | mi_stats_t* stats = &_mi_stats_main; |
399 | 0 | if (is_zero != NULL) { *is_zero = false; } |
400 | 0 | _mi_stat_increase(&stats->committed, size); // use size for precise commit vs. decommit |
401 | 0 | _mi_stat_counter_increase(&stats->commit_calls, 1); |
402 | | |
403 | | // page align range |
404 | 0 | size_t csize; |
405 | 0 | void* start = mi_os_page_align_areax(false /* conservative? */, addr, size, &csize); |
406 | 0 | if (csize == 0) return true; |
407 | | |
408 | | // commit |
409 | 0 | bool os_is_zero = false; |
410 | 0 | int err = _mi_prim_commit(start, csize, &os_is_zero); |
411 | 0 | if (err != 0) { |
412 | 0 | _mi_warning_message("cannot commit OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize); |
413 | 0 | return false; |
414 | 0 | } |
415 | 0 | if (os_is_zero && is_zero != NULL) { |
416 | 0 | *is_zero = true; |
417 | 0 | mi_assert_expensive(mi_mem_is_zero(start, csize)); |
418 | 0 | } |
419 | | // note: the following seems required for asan (otherwise `mimalloc-test-stress` fails) |
420 | | #ifdef MI_TRACK_ASAN |
421 | | if (os_is_zero) { mi_track_mem_defined(start,csize); } |
422 | | else { mi_track_mem_undefined(start,csize); } |
423 | | #endif |
424 | 0 | return true; |
425 | 0 | } |
426 | | |
427 | 0 | static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, mi_stats_t* tld_stats) { |
428 | 0 | MI_UNUSED(tld_stats); |
429 | 0 | mi_stats_t* stats = &_mi_stats_main; |
430 | 0 | mi_assert_internal(needs_recommit!=NULL); |
431 | 0 | _mi_stat_decrease(&stats->committed, size); |
432 | | |
433 | | // page align |
434 | 0 | size_t csize; |
435 | 0 | void* start = mi_os_page_align_area_conservative(addr, size, &csize); |
436 | 0 | if (csize == 0) return true; |
437 | | |
438 | | // decommit |
439 | 0 | *needs_recommit = true; |
440 | 0 | int err = _mi_prim_decommit(start,csize,needs_recommit); |
441 | 0 | if (err != 0) { |
442 | 0 | _mi_warning_message("cannot decommit OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize); |
443 | 0 | } |
444 | 0 | mi_assert_internal(err == 0); |
445 | 0 | return (err == 0); |
446 | 0 | } |
447 | | |
448 | 0 | bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) { |
449 | 0 | bool needs_recommit; |
450 | 0 | return mi_os_decommit_ex(addr, size, &needs_recommit, tld_stats); |
451 | 0 | } |
452 | | |
453 | | |
454 | | // Signal to the OS that the address range is no longer in use |
455 | | // but may be used later again. This will release physical memory |
456 | | // pages and reduce swapping while keeping the memory committed. |
457 | | // We page align to a conservative area inside the range to reset. |
458 | 0 | bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { |
459 | | // page align conservatively within the range |
460 | 0 | size_t csize; |
461 | 0 | void* start = mi_os_page_align_area_conservative(addr, size, &csize); |
462 | 0 | if (csize == 0) return true; // || _mi_os_is_huge_reserved(addr) |
463 | 0 | _mi_stat_increase(&stats->reset, csize); |
464 | 0 | _mi_stat_counter_increase(&stats->reset_calls, 1); |
465 | |
466 | | #if (MI_DEBUG>1) && !MI_SECURE && !MI_TRACK_ENABLED // && !MI_TSAN |
467 | | memset(start, 0, csize); // pretend it is eagerly reset |
468 | | #endif |
469 | |
470 | 0 | int err = _mi_prim_reset(start, csize); |
471 | 0 | if (err != 0) { |
472 | 0 | _mi_warning_message("cannot reset OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize); |
473 | 0 | } |
474 | 0 | return (err == 0); |
475 | 0 | } |
476 | | |
477 | | |
478 | | // either resets or decommits memory, returns true if the memory needs |
479 | | // to be recommitted if it is to be re-used later on. |
480 | | bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats) |
481 | 0 | { |
482 | 0 | if (mi_option_get(mi_option_purge_delay) < 0) return false; // is purging allowed? |
483 | 0 | _mi_stat_counter_increase(&stats->purge_calls, 1); |
484 | 0 | _mi_stat_increase(&stats->purged, size); |
485 | |
486 | 0 | if (mi_option_is_enabled(mi_option_purge_decommits) && // should decommit? |
487 | 0 | !_mi_preloading()) // don't decommit during preloading (unsafe) |
488 | 0 | { |
489 | 0 | bool needs_recommit = true; |
490 | 0 | mi_os_decommit_ex(p, size, &needs_recommit, stats); |
491 | 0 | return needs_recommit; |
492 | 0 | } |
493 | 0 | else { |
494 | 0 | if (allow_reset) { // resetting is sometimes not allowed if the range is not fully committed |
495 | 0 | _mi_os_reset(p, size, stats); |
496 | 0 | } |
497 | 0 | return false; // needs no recommit |
498 | 0 | } |
499 | 0 | } |
500 | | |
501 | | // either resets or decommits memory, returns true if the memory needs |
502 | | // to be recommitted if it is to be re-used later on. |
503 | 0 | bool _mi_os_purge(void* p, size_t size, mi_stats_t * stats) { |
504 | 0 | return _mi_os_purge_ex(p, size, true, stats); |
505 | 0 | } |
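/* [editor's note] A minimal caller-side sketch of the purge contract:

     bool needs_recommit = _mi_os_purge(p, size, stats);
     // ... later, before reusing the memory:
     if (needs_recommit) { _mi_os_commit(p, size, NULL, stats); }

   If mi_option_purge_decommits is enabled (the MIMALLOC_PURGE_DECOMMITS
   environment option, assuming mimalloc's usual option naming) the range is
   decommitted and must be recommitted before reuse; otherwise it is only
   reset and can be reused directly. */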
506 | | |
507 | | // Protect a region in memory to be not accessible. |
508 | 0 | static bool mi_os_protectx(void* addr, size_t size, bool protect) { |
509 | | // page align conservatively within the range |
510 | 0 | size_t csize = 0; |
511 | 0 | void* start = mi_os_page_align_area_conservative(addr, size, &csize); |
512 | 0 | if (csize == 0) return false; |
513 | | /* |
514 | | if (_mi_os_is_huge_reserved(addr)) { |
515 | | _mi_warning_message("cannot mprotect memory allocated in huge OS pages\n"); |
516 | | } |
517 | | */ |
518 | 0 | int err = _mi_prim_protect(start,csize,protect); |
519 | 0 | if (err != 0) { |
520 | 0 | _mi_warning_message("cannot %s OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", (protect ? "protect" : "unprotect"), err, err, start, csize); |
521 | 0 | } |
522 | 0 | return (err == 0); |
523 | 0 | } |
524 | | |
525 | 0 | bool _mi_os_protect(void* addr, size_t size) { |
526 | 0 | return mi_os_protectx(addr, size, true); |
527 | 0 | } |
528 | | |
529 | 0 | bool _mi_os_unprotect(void* addr, size_t size) { |
530 | 0 | return mi_os_protectx(addr, size, false); |
531 | 0 | } |
532 | | |
533 | | |
534 | | |
535 | | /* ---------------------------------------------------------------------------- |
536 | | Support for allocating huge OS pages (1GiB) that are reserved up-front |
537 | | and possibly associated with a specific NUMA node. (use `numa_node>=0`) |
538 | | -----------------------------------------------------------------------------*/ |
539 | 0 | #define MI_HUGE_OS_PAGE_SIZE (MI_GiB) |
540 | | |
541 | | |
542 | | #if (MI_INTPTR_SIZE >= 8) |
543 | | // To ensure proper alignment, use our own area for huge OS pages |
544 | | static mi_decl_cache_align _Atomic(uintptr_t) mi_huge_start; // = 0 |
545 | | |
546 | | // Claim an aligned address range for huge pages |
547 | 0 | static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { |
548 | 0 | if (total_size != NULL) *total_size = 0; |
549 | 0 | const size_t size = pages * MI_HUGE_OS_PAGE_SIZE; |
550 | |
551 | 0 | uintptr_t start = 0; |
552 | 0 | uintptr_t end = 0; |
553 | 0 | uintptr_t huge_start = mi_atomic_load_relaxed(&mi_huge_start); |
554 | 0 | do { |
555 | 0 | start = huge_start; |
556 | 0 | if (start == 0) { |
557 | | // Initialize the start address after the 32TiB area |
558 | 0 | start = ((uintptr_t)32 << 40); // 32TiB virtual start address |
559 | 0 | #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode |
560 | 0 | mi_heap_t* heap = mi_prim_get_default_heap(); |
561 | | // gh-123022: default heap may not be initialized in CPython in background threads |
562 | 0 | if (mi_heap_is_initialized(heap)) { |
563 | 0 | uintptr_t r = _mi_heap_random_next(heap); |
564 | 0 | start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x0FFF)); // (randomly 12bits)*1GiB == between 0 to 4TiB |
565 | 0 | } |
566 | 0 | #endif |
567 | 0 | } |
568 | 0 | end = start + size; |
569 | 0 | mi_assert_internal(end % MI_SEGMENT_SIZE == 0); |
570 | 0 | } while (!mi_atomic_cas_strong_acq_rel(&mi_huge_start, &huge_start, end)); |
571 | |
572 | 0 | if (total_size != NULL) *total_size = size; |
573 | 0 | return (uint8_t*)start; |
574 | 0 | } |
575 | | #else |
576 | | static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { |
577 | | MI_UNUSED(pages); |
578 | | if (total_size != NULL) *total_size = 0; |
579 | | return NULL; |
580 | | } |
581 | | #endif |
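/* [editor's note] Randomization arithmetic above: (r>>17) & 0x0FFF yields 12
   random bits (0..4095), so the first claim starts somewhere in
   [32TiB, 32TiB + 4095 GiB], i.e. roughly within a 4TiB window. Each claim
   then advances mi_huge_start by pages * 1GiB, so concurrent claims receive
   disjoint, MI_SEGMENT_SIZE-aligned ranges. */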
582 | | |
583 | | // Allocate MI_SEGMENT_SIZE aligned huge pages |
584 | 0 | void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid) { |
585 | 0 | *memid = _mi_memid_none(); |
586 | 0 | if (psize != NULL) *psize = 0; |
587 | 0 | if (pages_reserved != NULL) *pages_reserved = 0; |
588 | 0 | size_t size = 0; |
589 | 0 | uint8_t* start = mi_os_claim_huge_pages(pages, &size); |
590 | 0 | if (start == NULL) return NULL; // e.g. on 32-bit systems |
591 | | |
592 | | // Allocate one page at a time but try to place them contiguously |
593 | | // We allocate one page at a time to be able to abort if it takes too long |
594 | | // or to at least allocate as many as available on the system. |
595 | 0 | mi_msecs_t start_t = _mi_clock_start(); |
596 | 0 | size_t page = 0; |
597 | 0 | bool all_zero = true; |
598 | 0 | while (page < pages) { |
599 | | // allocate a page |
600 | 0 | bool is_zero = false; |
601 | 0 | void* addr = start + (page * MI_HUGE_OS_PAGE_SIZE); |
602 | 0 | void* p = NULL; |
603 | 0 | int err = _mi_prim_alloc_huge_os_pages(addr, MI_HUGE_OS_PAGE_SIZE, numa_node, &is_zero, &p); |
604 | 0 | if (!is_zero) { all_zero = false; } |
605 | 0 | if (err != 0) { |
606 | 0 | _mi_warning_message("unable to allocate huge OS page (error: %d (0x%x), address: %p, size: %zx bytes)\n", err, err, addr, MI_HUGE_OS_PAGE_SIZE); |
607 | 0 | break; |
608 | 0 | } |
609 | | |
610 | | // Did we succeed at a contiguous address? |
611 | 0 | if (p != addr) { |
612 | | // no success, issue a warning and break |
613 | 0 | if (p != NULL) { |
614 | 0 | _mi_warning_message("could not allocate contiguous huge OS page %zu at %p\n", page, addr); |
615 | 0 | mi_os_prim_free(p, MI_HUGE_OS_PAGE_SIZE, true, &_mi_stats_main); |
616 | 0 | } |
617 | 0 | break; |
618 | 0 | } |
619 | | |
620 | | // success, record it |
621 | 0 | page++; // increase before timeout check (see issue #711) |
622 | 0 | _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE); |
623 | 0 | _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE); |
624 | | |
625 | | // check for timeout |
626 | 0 | if (max_msecs > 0) { |
627 | 0 | mi_msecs_t elapsed = _mi_clock_end(start_t); |
628 | 0 | if (page >= 1) { |
629 | 0 | mi_msecs_t estimate = ((elapsed / (page+1)) * pages); |
630 | 0 | if (estimate > 2*max_msecs) { // seems like we are going to timeout, break |
631 | 0 | elapsed = max_msecs + 1; |
632 | 0 | } |
633 | 0 | } |
634 | 0 | if (elapsed > max_msecs) { |
635 | 0 | _mi_warning_message("huge OS page allocation timed out (after allocating %zu page(s))\n", page); |
636 | 0 | break; |
637 | 0 | } |
638 | 0 | } |
639 | 0 | } |
640 | 0 | mi_assert_internal(page*MI_HUGE_OS_PAGE_SIZE <= size); |
641 | 0 | if (pages_reserved != NULL) { *pages_reserved = page; } |
642 | 0 | if (psize != NULL) { *psize = page * MI_HUGE_OS_PAGE_SIZE; } |
643 | 0 | if (page != 0) { |
644 | 0 | mi_assert(start != NULL); |
645 | 0 | *memid = _mi_memid_create_os(true /* is committed */, all_zero, true /* is_large */); |
646 | 0 | memid->memkind = MI_MEM_OS_HUGE; |
647 | 0 | mi_assert(memid->is_pinned); |
648 | | #ifdef MI_TRACK_ASAN |
649 | | if (all_zero) { mi_track_mem_defined(start,size); } |
650 | | #endif |
651 | 0 | } |
652 | 0 | return (page == 0 ? NULL : start); |
653 | 0 | } |
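/* [editor's example] The timeout heuristic extrapolates linearly: if the first
   4 of 16 pages took 1000ms, then after page++ the estimate is
     (1000 / (4+1)) * 16 = 3200ms
   which exceeds 2 * max_msecs for max_msecs = 1500, so `elapsed` is forced
   past the limit and the loop stops early, keeping the pages acquired so far. */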
654 | | |
655 | | // free every huge page in a range individually (as we allocated per page) |
656 | | // note: needed with VirtualAlloc but could potentially be done in one go on mmap'd systems. |
657 | 0 | static void mi_os_free_huge_os_pages(void* p, size_t size, mi_stats_t* stats) { |
658 | 0 | if (p==NULL || size==0) return; |
659 | 0 | uint8_t* base = (uint8_t*)p; |
660 | 0 | while (size >= MI_HUGE_OS_PAGE_SIZE) { |
661 | 0 | mi_os_prim_free(base, MI_HUGE_OS_PAGE_SIZE, true, stats); |
662 | 0 | size -= MI_HUGE_OS_PAGE_SIZE; |
663 | 0 | base += MI_HUGE_OS_PAGE_SIZE; |
664 | 0 | } |
665 | 0 | } |
666 | | |
667 | | /* ---------------------------------------------------------------------------- |
668 | | Support NUMA aware allocation |
669 | | -----------------------------------------------------------------------------*/ |
670 | | |
671 | | _Atomic(size_t) _mi_numa_node_count; // = 0 // cache the node count |
672 | | |
673 | 0 | size_t _mi_os_numa_node_count_get(void) { |
674 | 0 | size_t count = mi_atomic_load_acquire(&_mi_numa_node_count); |
675 | 0 | if (count <= 0) { |
676 | 0 | long ncount = mi_option_get(mi_option_use_numa_nodes); // given explicitly? |
677 | 0 | if (ncount > 0) { |
678 | 0 | count = (size_t)ncount; |
679 | 0 | } |
680 | 0 | else { |
681 | 0 | count = _mi_prim_numa_node_count(); // or detect dynamically |
682 | 0 | if (count == 0) count = 1; |
683 | 0 | } |
684 | 0 | mi_atomic_store_release(&_mi_numa_node_count, count); // save it |
685 | 0 | _mi_verbose_message("using %zd numa regions\n", count); |
686 | 0 | } |
687 | 0 | return count; |
688 | 0 | } |
689 | | |
690 | 0 | int _mi_os_numa_node_get(mi_os_tld_t* tld) { |
691 | 0 | MI_UNUSED(tld); |
692 | 0 | size_t numa_count = _mi_os_numa_node_count(); |
693 | 0 | if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0 |
694 | | // never more than the node count and >= 0 |
695 | 0 | size_t numa_node = _mi_prim_numa_node(); |
696 | 0 | if (numa_node >= numa_count) { numa_node = numa_node % numa_count; } |
697 | 0 | return (int)numa_node; |
698 | 0 | } |