/src/cpython/Include/internal/mimalloc/mimalloc/prim.h

Source
/* ----------------------------------------------------------------------------
Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#pragma once
#ifndef MIMALLOC_PRIM_H
#define MIMALLOC_PRIM_H


// --------------------------------------------------------------------------
// This file specifies the primitive portability API.
// Each OS/host needs to implement these primitives, see `src/prim`
// for implementations on Windows, macOS, WASI, and Linux/Unix.
//
// note: on all primitive functions, we always have result parameters != NULL, and:
//  addr != NULL and page aligned
//  size > 0     and page aligned
//  the return value is an `int` error code where 0 means success.
// --------------------------------------------------------------------------

// OS memory configuration.
// Describes the memory-management properties of the host OS. A pointer to this struct is
// passed to `_mi_prim_mem_init` (declared below).
// NOTE(review): presumably the primitive fills in these fields -- confirm against `src/prim`.
// The sizes mentioned in the field comments are typical values, not guarantees.
typedef struct mi_os_mem_config_s {
  size_t  page_size;            // OS page size (typically 4KiB)
  size_t  large_page_size;      // size of a large OS page (typically 2MiB)
  size_t  alloc_granularity;    // smallest allocation size (on Windows 64KiB)
  bool    has_overcommit;       // can we reserve more memory than can be actually committed?
  bool    must_free_whole;      // must allocated blocks be freed as a whole (false for mmap, true for VirtualAlloc)
  bool    has_virtual_reserve;  // supports virtual address space reservation? (if true we can reserve virtual address space without using commit or physical memory)
} mi_os_mem_config_t;

// Initialize the OS memory primitives.
// `config` is a result parameter and is never NULL.
// NOTE(review): presumably the implementation stores the host's memory configuration
// in `*config` -- confirm against the implementations in `src/prim`.
void _mi_prim_mem_init( mi_os_mem_config_t* config );

// Free OS memory.
// pre: `addr` != NULL and page aligned; `size` > 0 and page aligned.
// Returns an error code, or 0 on success.
int _mi_prim_free(void* addr, size_t size );

// Allocate OS memory.
// Returns an error code, or 0 on success; the allocated address is passed back through `addr`.
// NOTE(review): this comment used to say "Return NULL on error" although the function returns
// an `int`; presumably `*addr` is set to NULL on failure -- confirm against `src/prim`.
// The `try_alignment` is just a hint and the returned pointer does not have to be aligned.
// If `commit` is false, the virtual memory range only needs to be reserved (with no access)
// which will later be committed explicitly using `_mi_prim_commit`.
// `is_zero` is set to true if the memory was zero initialized (as on most OS's)
// NOTE(review): `is_large` presumably reports whether large OS pages were used -- confirm.
// pre: !commit => !allow_large
//      try_alignment >= _mi_os_page_size() and a power of 2
//      `is_large`, `is_zero` and `addr` are result parameters and never NULL.
int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr);

// Commit memory. Returns error code or 0 on success.
// For example, on Linux this would make the memory PROT_READ|PROT_WRITE.
// `is_zero` is set to true if the memory was zero initialized (e.g. on Windows)
// pre: `addr` != NULL and page aligned; `size` > 0 and page aligned; `is_zero` != NULL.
int _mi_prim_commit(void* addr, size_t size, bool* is_zero);

// Decommit memory. Returns error code or 0 on success. The `needs_recommit` result is true
// if the memory would need to be re-committed. For example, on Windows this is always true,
// but on Linux we could use MADV_DONTNEED to decommit which does not need a recommit.
// pre: needs_recommit != NULL
int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit);

// Reset memory. The range keeps being accessible but the content might be reset.
// Returns error code or 0 on success.
// pre: `addr` != NULL and page aligned; `size` > 0 and page aligned.
int _mi_prim_reset(void* addr, size_t size);

// Protect memory. Returns error code or 0 on success.
// NOTE(review): presumably `protect == true` removes access to the range and
// `protect == false` restores it -- confirm against the implementations in `src/prim`.
int _mi_prim_protect(void* addr, size_t size, bool protect);

// Allocate huge (1GiB) pages possibly associated with a NUMA node.
// Returns an error code, or 0 on success; the address is passed back through `addr`.
// `is_zero` is set to true if the memory was zero initialized (as on most OS's)
// NOTE(review): `hint_addr` is presumably a preferred (non-binding) start address -- confirm.
// pre: size > 0  and a multiple of 1GiB.
//      numa_node is either negative (don't care), or a numa node number.
int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr);

// Return the current NUMA node
size_t _mi_prim_numa_node(void);

// Return the number of logical NUMA nodes
size_t _mi_prim_numa_node_count(void);

// Clock ticks
// NOTE(review): the `mi_msecs_t` return type suggests a value in milliseconds -- confirm.
mi_msecs_t _mi_prim_clock_now(void);

// Return process information (only for statistics)
// NOTE(review): the field descriptions below are inferred from the field names only;
// the `mi_msecs_t` fields are presumably in milliseconds -- confirm against `src/prim`.
typedef struct mi_process_info_s {
  mi_msecs_t  elapsed;          // presumably elapsed (wall-clock) time
  mi_msecs_t  utime;            // presumably user-mode CPU time
  mi_msecs_t  stime;            // presumably system-mode CPU time
  size_t      current_rss;      // presumably the current resident set size
  size_t      peak_rss;         // presumably the peak resident set size
  size_t      current_commit;   // presumably the currently committed memory
  size_t      peak_commit;      // presumably the peak committed memory
  size_t      page_faults;      // presumably the number of page faults
} mi_process_info_t;

// Store the process information in `*pinfo` (a result parameter, never NULL).
void _mi_prim_process_info(mi_process_info_t* pinfo);

// Default stderr output. (only for warnings etc. with verbose enabled)
// pre: msg != NULL && _mi_strlen(msg) > 0
void _mi_prim_out_stderr( const char* msg );

// Get an environment variable. (only for options)
// pre: name != NULL, result != NULL, result_size >= 64
// NOTE(review): presumably returns true and writes the value into `result` when the
// variable is found, and false otherwise -- confirm against the implementations.
bool _mi_prim_getenv(const char* name, char* result, size_t result_size);


// Fill a buffer with strong randomness; return `false` on error or if
// there is no strong randomization available.
bool _mi_prim_random_buf(void* buf, size_t buf_len);

// Called on the first thread start, and should ensure `_mi_thread_done` is called on thread termination.
void _mi_prim_thread_init_auto_done(void);

// Called on process exit and may take action to clean up resources associated with the thread auto done.
void _mi_prim_thread_done_auto_done(void);

// Called when the default heap for a thread changes; `heap` is the new default heap.
void _mi_prim_thread_associate_default_heap(mi_heap_t* heap);


//-------------------------------------------------------------------
// Thread id: `_mi_prim_thread_id()`
//
// Getting the thread id should be performant as it is called in the
// fast path of `_mi_free` and we specialize for various platforms as
// inlined definitions. Regular code should call `init.c:_mi_thread_id()`.
// We only require _mi_prim_thread_id() to return a unique id
// for each thread (unequal to zero).
//-------------------------------------------------------------------

// defined in `init.c`; do not use these directly
extern mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from (declared with `mi_decl_thread`)
extern bool _mi_process_is_initialized;             // has mi_process_init been called?

// Forward declaration; the preprocessor conditional below provides the single matching definition.
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept;

#ifdef MI_PRIM_THREAD_ID

// Thread id as supplied by the build configuration through the `MI_PRIM_THREAD_ID` macro.
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
  const mi_threadid_t tid = MI_PRIM_THREAD_ID();
  return tid;
}

#elif defined(_WIN32)

// Request the trimmed-down Windows headers, but only if the including translation unit
// has not already defined the macro itself (an unconditional `#define` would trigger a
// macro-redefinition diagnostic when it was defined earlier with a different replacement list).
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
// Windows thread id: the pointer returned by `NtCurrentTeb()` converted to an integer.
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
  // Windows: works on Intel and ARM in both 32- and 64-bit
  return (uintptr_t)NtCurrentTeb();
}

// We use assembly for a fast thread id on the main platforms. The TLS layout depends on
// both the OS and libc implementation so we use specific tests for each main platform.
// If you test on another platform and it works please send a PR :-)
// see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register.
#elif defined(__GNUC__) && ( \
           (defined(__GLIBC__)   && (defined(__x86_64__) || defined(__i386__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__))) \
        || (defined(__APPLE__)   && (defined(__x86_64__) || defined(__aarch64__))) \
        || (defined(__BIONIC__)  && (defined(__x86_64__) || defined(__i386__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__))) \
        || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
        || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
      )

// Read the pointer stored in thread-local slot `slot` of the calling thread.
// On x86 the slot is read at byte offset `slot*sizeof(void*)` through the GS or FS segment;
// on ARM/AArch64 a base pointer (`tcb`) is first loaded from a system register and then
// indexed as an array of pointers.
// `res` is always assigned: the enclosing `#elif` only admits the architectures handled below.
static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept {
  void* res;
  const size_t ofs = (slot*sizeof(void*));   // byte offset of the slot (only used on the x86 paths)
  #if defined(__i386__)
    __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86 32-bit always uses GS
  #elif defined(__APPLE__) && defined(__x86_64__)
    __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 macOSX uses GS
  #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
    __asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x32 ABI
  #elif defined(__x86_64__)
    __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 Linux, BSD uses FS
  #elif defined(__arm__)
    void** tcb; MI_UNUSED(ofs);
    // load the base pointer via `mrc p15` and clear its two low bits (`bic ..., #3`)
    __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
    res = tcb[slot];
  #elif defined(__aarch64__)
    void** tcb; MI_UNUSED(ofs);
    #if defined(__APPLE__) // M1, issue #343
    // Apple: read `tpidrro_el0` and clear its three low bits (`bic ..., #7`)
    __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
    #else
    __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
    #endif
    res = tcb[slot];
  #endif
  return res;
}

// Store `value` in thread-local slot `slot` of the calling thread; the slot is addressed
// in the same way as in `mi_prim_tls_slot`.
// setting a tls slot is only used on macOS for now
static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexcept {
  const size_t ofs = (slot*sizeof(void*));   // byte offset of the slot (only used on the x86 paths)
  #if defined(__i386__)
    __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // 32-bit always uses GS
  #elif defined(__APPLE__) && defined(__x86_64__)
    __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 macOS uses GS
  #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
    __asm__("movl %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x32 ABI
  #elif defined(__x86_64__)
    __asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 Linux, BSD uses FS
  #elif defined(__arm__)
    void** tcb; MI_UNUSED(ofs);
    // load the base pointer via `mrc p15` and clear its two low bits (`bic ..., #3`)
    __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
    tcb[slot] = value;
  #elif defined(__aarch64__)
    void** tcb; MI_UNUSED(ofs);
    #if defined(__APPLE__) // M1, issue #343
    // Apple: read `tpidrro_el0` and clear its three low bits (`bic ..., #7`)
    __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
    #else
    __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
    #endif
    tcb[slot] = value;
  #endif
}

// Fast thread id: the value of the TLS slot that holds the thread id on this platform.
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
  #if defined(__BIONIC__)
    // Bionic libc (Android) keeps the thread id in slot 1 (issue #384, #495)
    // see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86
    const size_t tid_slot = 1;
  #else
    // every other supported target keeps the thread id in slot 0
    // glibc: https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/x86_64/nptl/tls.h
    // apple: https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L36
    const size_t tid_slot = 0;
  #endif
  return (uintptr_t)mi_prim_tls_slot(tid_slot);
}

#else

// Portable fallback: the address of the thread local `_mi_heap_default` differs per thread,
// so it serves as the thread id (this is still very fast on most platforms).
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
  mi_heap_t** const tls_addr = &_mi_heap_default;
  return (uintptr_t)tls_addr;
}

#endif



/* ----------------------------------------------------------------------------------------
The thread local default heap: `mi_prim_get_default_heap()`
This is inlined here as it is on the fast path for allocation functions.

On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a
__thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures
that the storage will always be available (allocated on the thread stacks).

On some platforms though we cannot use that when overriding `malloc` since the underlying
TLS implementation (or the loader) will itself call `malloc` on a first access and recurse.
We try to circumvent this in an efficient way:
- macOS  : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On macOS, the
           loader itself calls `malloc` even before the modules are initialized.
- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS).
- Android: we use a pthread key (MI_TLS_PTHREAD), see issue #381.
- DragonFly: defaults are working but seem slow compared to FreeBSD (see PR #323)
------------------------------------------------------------------------------------------- */

// Forward declaration; the preprocessor conditionals below select the single matching definition.
static inline mi_heap_t* mi_prim_get_default_heap(void);

// When overriding `malloc`, pick the platform specific storage for the default heap pointer.
// At most one of MI_TLS_SLOT, MI_TLS_PTHREAD_SLOT_OFS, or MI_TLS_PTHREAD is defined here.
#if defined(MI_MALLOC_OVERRIDE)
#if defined(__APPLE__) // macOS
  #define MI_TLS_SLOT               89  // seems unused?
  // #define MI_TLS_RECURSE_GUARD 1
  // other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89)
  // see <https://github.com/rweichler/substrate/blob/master/include/pthread_machdep.h>
#elif defined(__OpenBSD__)
  // use end bytes of a name; goes wrong if anyone uses names > 23 characters (pthread specifies 16)
  // see <https://github.com/openbsd/src/blob/master/lib/libc/include/thread_private.h#L371>
  // the value is a byte offset that is added to the `pthread_self()` pointer
  #define MI_TLS_PTHREAD_SLOT_OFS   (6*sizeof(int) + 4*sizeof(void*) + 24)
  // #elif defined(__DragonFly__)
  // #warning "mimalloc is not working correctly on DragonFly yet."
  // #define MI_TLS_PTHREAD_SLOT_OFS   (4 + 1*sizeof(void*))  // offset `uniqueid` (also used by gdb?) <https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libthread_xu/thread/thr_private.h#L458>
#elif defined(__ANDROID__)
  // See issue #381
  // use a pthread key (`_mi_heap_default_key`) to store the default heap
  #define MI_TLS_PTHREAD
#endif
#endif


#if defined(MI_TLS_SLOT)

// Default heap stored in the OS TLS slot `MI_TLS_SLOT` (in this file: macOS with `MI_MALLOC_OVERRIDE`).
// Returns the heap pointer found in the slot, or the address of `_mi_heap_empty`
// when the slot still reads as NULL.
static inline mi_heap_t* mi_prim_get_default_heap(void) {
  mi_heap_t* heap = (mi_heap_t*)mi_prim_tls_slot(MI_TLS_SLOT);
  if mi_unlikely(heap == NULL) {
    #ifdef __GNUC__
    // the empty asm statement must stay in front of the assignment below
    __asm(""); // prevent conditional load of the address of _mi_heap_empty
    #endif
    heap = (mi_heap_t*)&_mi_heap_empty;
  }
  return heap;
}

#elif defined(MI_TLS_PTHREAD_SLOT_OFS)

// Address of the default-heap slot inside the current thread's pthread block:
// `MI_TLS_PTHREAD_SLOT_OFS` bytes past the pointer returned by `pthread_self()`.
// On DragonFly, NULL is returned when `pthread_self()` itself is NULL.
static inline mi_heap_t** mi_prim_tls_pthread_heap_slot(void) {
  pthread_t thread = pthread_self();
  #if defined(__DragonFly__)
  if (thread == NULL) { return NULL; }
  #endif
  uint8_t* const base = (uint8_t*)thread;
  return (mi_heap_t**)(base + MI_TLS_PTHREAD_SLOT_OFS);
}

static inline mi_heap_t* mi_prim_get_default_heap(void) {
  mi_heap_t** pheap = mi_prim_tls_pthread_heap_slot();
  if mi_unlikely(pheap == NULL) return _mi_heap_main_get();
  mi_heap_t* heap = *pheap;
  if mi_unlikely(heap == NULL) return (mi_heap_t*)&_mi_heap_empty;
  return heap;
}

#elif defined(MI_TLS_PTHREAD)

// pthread key under which the default heap of each thread is stored
extern pthread_key_t _mi_heap_default_key;

// Default heap stored under a pthread key.
// While the key still equals `(pthread_key_t)(-1)` the main heap is used; a NULL result
// from either source is replaced by the address of `_mi_heap_empty`.
static inline mi_heap_t* mi_prim_get_default_heap(void) {
  mi_heap_t* heap;
  if (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1))) {
    heap = _mi_heap_main_get();
  }
  else {
    heap = (mi_heap_t*)pthread_getspecific(_mi_heap_default_key);
  }
  if (mi_unlikely(heap == NULL)) {
    heap = (mi_heap_t*)&_mi_heap_empty;
  }
  return heap;
}

#else // default using a thread local variable; used on most platforms.

// Default heap kept in the thread local variable `_mi_heap_default`.
static inline mi_heap_t* mi_prim_get_default_heap(void) {
  #if defined(MI_TLS_RECURSE_GUARD)
  // use the main heap as long as `_mi_process_is_initialized` is still false
  if (mi_unlikely(!_mi_process_is_initialized)) { return _mi_heap_main_get(); }
  #endif
  mi_heap_t* const heap = _mi_heap_default;
  return heap;
}

#endif  // mi_prim_get_default_heap()



#endif  // MIMALLOC_PRIM_H

Coverage Report

Created: 2026-04-20 06:11

Line	Count	Source
1		/* ----------------------------------------------------------------------------
2		Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
3		This is free software; you can redistribute it and/or modify it under the
4		terms of the MIT license. A copy of the license can be found in the file
5		"LICENSE" at the root of this distribution.
6		-----------------------------------------------------------------------------*/
7		#pragma once
8		#ifndef MIMALLOC_PRIM_H
9		#define MIMALLOC_PRIM_H
10
11
12		// --------------------------------------------------------------------------
13		// This file specifies the primitive portability API.
14		// Each OS/host needs to implement these primitives, see `src/prim`
15		// for implementations on Window, macOS, WASI, and Linux/Unix.
16		//
17		// note: on all primitive functions, we always have result parameters != NUL, and:
18		// addr != NULL and page aligned
19		// size > 0 and page aligned
20		// return value is an error code an int where 0 is success.
21		// --------------------------------------------------------------------------
22
23		// OS memory configuration
24		typedef struct mi_os_mem_config_s {
25		size_t page_size; // 4KiB
26		size_t large_page_size; // 2MiB
27		size_t alloc_granularity; // smallest allocation size (on Windows 64KiB)
28		bool has_overcommit; // can we reserve more memory than can be actually committed?
29		bool must_free_whole; // must allocated blocks be freed as a whole (false for mmap, true for VirtualAlloc)
30		bool has_virtual_reserve; // supports virtual address space reservation? (if true we can reserve virtual address space without using commit or physical memory)
31		} mi_os_mem_config_t;
32
33		// Initialize
34		void _mi_prim_mem_init( mi_os_mem_config_t* config );
35
36		// Free OS memory
37		int _mi_prim_free(void* addr, size_t size );
38
39		// Allocate OS memory. Return NULL on error.
40		// The `try_alignment` is just a hint and the returned pointer does not have to be aligned.
41		// If `commit` is false, the virtual memory range only needs to be reserved (with no access)
42		// which will later be committed explicitly using `_mi_prim_commit`.
43		// `is_zero` is set to true if the memory was zero initialized (as on most OS's)
44		// pre: !commit => !allow_large
45		// try_alignment >= _mi_os_page_size() and a power of 2
46		int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr);
47
48		// Commit memory. Returns error code or 0 on success.
49		// For example, on Linux this would make the memory PROT_READ|PROT_WRITE.
50		// `is_zero` is set to true if the memory was zero initialized (e.g. on Windows)
51		int _mi_prim_commit(void* addr, size_t size, bool* is_zero);
52
53		// Decommit memory. Returns error code or 0 on success. The `needs_recommit` result is true
54		// if the memory would need to be re-committed. For example, on Windows this is always true,
55		// but on Linux we could use MADV_DONTNEED to decommit which does not need a recommit.
56		// pre: needs_recommit != NULL
57		int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit);
58
59		// Reset memory. The range keeps being accessible but the content might be reset.
60		// Returns error code or 0 on success.
61		int _mi_prim_reset(void* addr, size_t size);
62
63		// Protect memory. Returns error code or 0 on success.
64		int _mi_prim_protect(void* addr, size_t size, bool protect);
65
66		// Allocate huge (1GiB) pages possibly associated with a NUMA node.
67		// `is_zero` is set to true if the memory was zero initialized (as on most OS's)
68		// pre: size > 0 and a multiple of 1GiB.
69		// numa_node is either negative (don't care), or a numa node number.
70		int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr);
71
72		// Return the current NUMA node
73		size_t _mi_prim_numa_node(void);
74
75		// Return the number of logical NUMA nodes
76		size_t _mi_prim_numa_node_count(void);
77
78		// Clock ticks
79		mi_msecs_t _mi_prim_clock_now(void);
80
81		// Return process information (only for statistics)
82		typedef struct mi_process_info_s {
83		mi_msecs_t elapsed;
84		mi_msecs_t utime;
85		mi_msecs_t stime;
86		size_t current_rss;
87		size_t peak_rss;
88		size_t current_commit;
89		size_t peak_commit;
90		size_t page_faults;
91		} mi_process_info_t;
92
93		void _mi_prim_process_info(mi_process_info_t* pinfo);
94
95		// Default stderr output. (only for warnings etc. with verbose enabled)
96		// msg != NULL && _mi_strlen(msg) > 0
97		void _mi_prim_out_stderr( const char* msg );
98
99		// Get an environment variable. (only for options)
100		// name != NULL, result != NULL, result_size >= 64
101		bool _mi_prim_getenv(const char* name, char* result, size_t result_size);
102
103
104		// Fill a buffer with strong randomness; return `false` on error or if
105		// there is no strong randomization available.
106		bool _mi_prim_random_buf(void* buf, size_t buf_len);
107
108		// Called on the first thread start, and should ensure `_mi_thread_done` is called on thread termination.
109		void _mi_prim_thread_init_auto_done(void);
110
111		// Called on process exit and may take action to clean up resources associated with the thread auto done.
112		void _mi_prim_thread_done_auto_done(void);
113
114		// Called when the default heap for a thread changes
115		void _mi_prim_thread_associate_default_heap(mi_heap_t* heap);
116
117
118		//-------------------------------------------------------------------
119		// Thread id: `_mi_prim_thread_id()`
120		//
121		// Getting the thread id should be performant as it is called in the
122		// fast path of `_mi_free` and we specialize for various platforms as
123		// inlined definitions. Regular code should call `init.c:_mi_thread_id()`.
124		// We only require _mi_prim_thread_id() to return a unique id
125		// for each thread (unequal to zero).
126		//-------------------------------------------------------------------
127
128		// defined in `init.c`; do not use these directly
129		extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from
130		extern bool _mi_process_is_initialized; // has mi_process_init been called?
131
132		static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept;
133
134		#ifdef MI_PRIM_THREAD_ID
135
136		static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
137		return MI_PRIM_THREAD_ID();
138		}
139
140		#elif defined(_WIN32)
141
142		#define WIN32_LEAN_AND_MEAN
143		#include <windows.h>
144		static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
145		// Windows: works on Intel and ARM in both 32- and 64-bit
146		return (uintptr_t)NtCurrentTeb();
147		}
148
149		// We use assembly for a fast thread id on the main platforms. The TLS layout depends on
150		// both the OS and libc implementation so we use specific tests for each main platform.
151		// If you test on another platform and it works please send a PR :-)
152		// see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register.
153		#elif defined(__GNUC__) && ( \
154		(defined(__GLIBC__) && (defined(__x86_64__) || defined(__i386__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__))) \
155		|| (defined(__APPLE__) && (defined(__x86_64__) || defined(__aarch64__))) \
156		|| (defined(__BIONIC__) && (defined(__x86_64__) || defined(__i386__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__))) \
157		|| (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
158		|| (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
159		)
160
161	72	static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept {
162	72	void* res;
163	72	const size_t ofs = (slot*sizeof(void*));
164		#if defined(__i386__)
165		__asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86 32-bit always uses GS
166		#elif defined(__APPLE__) && defined(__x86_64__)
167		__asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 macOSX uses GS
168		#elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
169		__asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x32 ABI
170		#elif defined(__x86_64__)
171		__asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 Linux, BSD uses FS
172		#elif defined(__arm__)
173		void** tcb; MI_UNUSED(ofs);
174		__asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
175		res = tcb[slot];
176		#elif defined(__aarch64__)
177		void** tcb; MI_UNUSED(ofs);
178		#if defined(__APPLE__) // M1, issue #343
179		__asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
180		#else
181		__asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
182		#endif
183		res = tcb[slot];
184		#endif
185	72	return res;
186	72	}
187
188		// setting a tls slot is only used on macOS for now
189	0	static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexcept {
190	0	const size_t ofs = (slot*sizeof(void*));
191	0	#if defined(__i386__)
192	0	__asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // 32-bit always uses GS
193	0	#elif defined(__APPLE__) && defined(__x86_64__)
194	0	__asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 macOS uses GS
195	0	#elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
196	0	__asm__("movl %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x32 ABI
197	0	#elif defined(__x86_64__)
198	0	__asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 Linux, BSD uses FS
199	0	#elif defined(__arm__)
200	0	void** tcb; MI_UNUSED(ofs);
201	0	__asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
202	0	tcb[slot] = value;
203	0	#elif defined(__aarch64__)
204	0	void** tcb; MI_UNUSED(ofs);
205	0	#if defined(__APPLE__) // M1, issue #343
206	0	__asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
207	0	#else
208	0	__asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
209	0	#endif
210	0	tcb[slot] = value;
211	0	#endif
212	0	}
213
214	72	static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
215		#if defined(__BIONIC__)
216		// issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id
217		// see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86
218		return (uintptr_t)mi_prim_tls_slot(1);
219		#else
220		// in all our other targets, slot 0 is the thread id
221		// glibc: https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/x86_64/nptl/tls.h
222		// apple: https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L36
223	72	return (uintptr_t)mi_prim_tls_slot(0);
224	72	#endif
225	72	}
226
227		#else
228
229		// otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms).
230		static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
231		return (uintptr_t)&_mi_heap_default;
232		}
233
234		#endif
235
236
237
238		/* ----------------------------------------------------------------------------------------
239		The thread local default heap: `_mi_prim_get_default_heap()`
240		This is inlined here as it is on the fast path for allocation functions.
241
242		On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a
243		__thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures
244		that the storage will always be available (allocated on the thread stacks).
245
246		On some platforms though we cannot use that when overriding `malloc` since the underlying
247		TLS implementation (or the loader) will call itself `malloc` on a first access and recurse.
248		We try to circumvent this in an efficient way:
249		- macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the
250		loader itself calls `malloc` even before the modules are initialized.
251		- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS).
252		- DragonFly: defaults are working but seem slow compared to freeBSD (see PR #323)
253		------------------------------------------------------------------------------------------- */
254
255		static inline mi_heap_t* mi_prim_get_default_heap(void);
256
257		#if defined(MI_MALLOC_OVERRIDE)
258		#if defined(__APPLE__) // macOS
259		#define MI_TLS_SLOT 89 // seems unused?
260		// #define MI_TLS_RECURSE_GUARD 1
261		// other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89)
262		// see <https://github.com/rweichler/substrate/blob/master/include/pthread_machdep.h>
263		#elif defined(__OpenBSD__)
264		// use end bytes of a name; goes wrong if anyone uses names > 23 characters (ptrhread specifies 16)
265		// see <https://github.com/openbsd/src/blob/master/lib/libc/include/thread_private.h#L371>
266		#define MI_TLS_PTHREAD_SLOT_OFS (6*sizeof(int) + 4*sizeof(void*) + 24)
267		// #elif defined(__DragonFly__)
268		// #warning "mimalloc is not working correctly on DragonFly yet."
269		// #define MI_TLS_PTHREAD_SLOT_OFS (4 + 1*sizeof(void*)) // offset `uniqueid` (also used by gdb?) <https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libthread_xu/thread/thr_private.h#L458>
270		#elif defined(__ANDROID__)
271		// See issue #381
272		#define MI_TLS_PTHREAD
273		#endif
274		#endif
275
276
277		#if defined(MI_TLS_SLOT)
278
279		static inline mi_heap_t* mi_prim_get_default_heap(void) {
280		mi_heap_t* heap = (mi_heap_t*)mi_prim_tls_slot(MI_TLS_SLOT);
281		if mi_unlikely(heap == NULL) {
282		#ifdef __GNUC__
283		__asm(""); // prevent conditional load of the address of _mi_heap_empty
284		#endif
285		heap = (mi_heap_t*)&_mi_heap_empty;
286		}
287		return heap;
288		}
289
290		#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
291
292		static inline mi_heap_t** mi_prim_tls_pthread_heap_slot(void) {
293		pthread_t self = pthread_self();
294		#if defined(__DragonFly__)
295		if (self==NULL) return NULL;
296		#endif
297		return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS);
298		}
299
300		static inline mi_heap_t* mi_prim_get_default_heap(void) {
301		mi_heap_t** pheap = mi_prim_tls_pthread_heap_slot();
302		if mi_unlikely(pheap == NULL) return _mi_heap_main_get();
303		mi_heap_t* heap = *pheap;
304		if mi_unlikely(heap == NULL) return (mi_heap_t*)&_mi_heap_empty;
305		return heap;
306		}
307
308		#elif defined(MI_TLS_PTHREAD)
309
310		extern pthread_key_t _mi_heap_default_key;
311		static inline mi_heap_t* mi_prim_get_default_heap(void) {
312		mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key));
313		return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
314		}
315
316		#else // default using a thread local variable; used on most platforms.
317
318	108	static inline mi_heap_t* mi_prim_get_default_heap(void) {
319		#if defined(MI_TLS_RECURSE_GUARD)
320		if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get();
321		#endif
322	108	return _mi_heap_default;
323	108	}
324
325		#endif // mi_prim_get_default_heap()
326
327
328
329		#endif // MIMALLOC_PRIM_H