1 /* ----------------------------------------------------------------------------
2 Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
3 This is free software; you can redistribute it and/or modify it under the
4 terms of the MIT license. A copy of the license can be found in the file
5 "LICENSE" at the root of this distribution.
6 -----------------------------------------------------------------------------*/
7 #pragma once
8 #ifndef MIMALLOC_PRIM_H
9 #define MIMALLOC_PRIM_H
10 
11 
12 // --------------------------------------------------------------------------
13 // This file specifies the primitive portability API.
14 // Each OS/host needs to implement these primitives, see `src/prim`
15 // for implementations on Windows, macOS, WASI, and Linux/Unix.
16 //
17 // note: on all primitive functions, we always have result parameters != NULL, and:
18 //  addr != NULL and page aligned
19 //  size > 0     and page aligned
20 //  the return value is an error code (an `int`) where 0 is success.
21 // --------------------------------------------------------------------------
22 
// Memory configuration of the OS (passed to `_mi_prim_mem_init`).
typedef struct mi_os_mem_config_s {
  // size of a regular OS page (4KiB)
  size_t page_size;
  // size of a large OS page (2MiB)
  size_t large_page_size;
  // smallest allocation size (on Windows 64KiB)
  size_t alloc_granularity;
  // can we reserve more memory than can be actually committed?
  bool has_overcommit;
  // must allocated blocks be freed as a whole? (false for mmap, true for VirtualAlloc)
  bool must_free_whole;
  // is virtual address space reservation supported? (if true we can reserve
  // virtual address space without using commit or physical memory)
  bool has_virtual_reserve;
} mi_os_mem_config_t;
32 
33 // Initialize the memory primitives.
// `config` is a result parameter (never NULL); presumably the implementation
// fills in the memory configuration of the host -- confirm in `src/prim`.
34 void _mi_prim_mem_init( mi_os_mem_config_t* config );
35 
36 // Free OS memory. Returns error code or 0 on success.
// pre: addr != NULL and size > 0, both page aligned.
37 int _mi_prim_free(void* addr, size_t size );
38 
39 // Allocate OS memory. Returns error code or 0 on success; the allocated address is
// returned through `addr` (NOTE(review): presumably `*addr` is set to NULL on error -- confirm).
40 // The `try_alignment` is just a hint and the returned pointer does not have to be aligned.
41 // If `commit` is false, the virtual memory range only needs to be reserved (with no access)
42 // which will later be committed explicitly using `_mi_prim_commit`.
43 // `is_zero` is set to true if the memory was zero initialized (as on most OS's)
// `is_large` is a result parameter as well (presumably set to true if large OS pages were used -- confirm).
44 // pre: !commit => !allow_large
45 //      try_alignment >= _mi_os_page_size() and a power of 2
46 int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr);
47 
48 // Commit memory. Returns error code or 0 on success.
49 // For example, on Linux this would make the memory PROT_READ|PROT_WRITE.
50 // `is_zero` is set to true if the memory was zero initialized (e.g. on Windows)
// pre: is_zero != NULL
51 int _mi_prim_commit(void* addr, size_t size, bool* is_zero);
52 
53 // Decommit memory. Returns error code or 0 on success. The `needs_recommit` result is set
54 // to true if the memory needs to be re-committed afterwards. For example, on Windows this is
55 // always true, but on Linux we could use MADV_DONTNEED to decommit which does not need a recommit.
56 // pre: needs_recommit != NULL
57 int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit);
58 
59 // Reset memory. The range stays accessible but its previous content might be lost.
60 // Returns error code or 0 on success.
61 int _mi_prim_reset(void* addr, size_t size);
62 
63 // Protect memory. Returns error code or 0 on success.
// NOTE(review): presumably `protect == true` removes access to the range and
// `protect == false` restores it -- confirm against the implementations.
64 int _mi_prim_protect(void* addr, size_t size, bool protect);
65 
66 // Allocate huge (1GiB) pages possibly associated with a NUMA node.
// Returns error code or 0 on success; the allocated address is returned through `addr`.
67 // `is_zero` is set to true if the memory was zero initialized (as on most OS's)
68 // pre: size > 0  and a multiple of 1GiB.
69 //      numa_node is either negative (don't care), or a numa node number.
70 int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr);
71 
72 // Return the current NUMA node number.
73 size_t _mi_prim_numa_node(void);
74 
75 // Return the number of logical NUMA nodes.
76 size_t _mi_prim_numa_node_count(void);
77 
78 // Clock ticks: return the current clock value as a `mi_msecs_t`
// (presumably in milliseconds, going by the type name -- confirm).
79 mi_msecs_t _mi_prim_clock_now(void);
80 
81 // Return process information (only for statistics)
82 typedef struct mi_process_info_s {
83   mi_msecs_t  elapsed;
84   mi_msecs_t  utime;
85   mi_msecs_t  stime;
86   size_t      current_rss;
87   size_t      peak_rss;
88   size_t      current_commit;
89   size_t      peak_commit;
90   size_t      page_faults;
91 } mi_process_info_t;
92 
// Return process information (only for statistics) through the result parameter `pinfo` (never NULL).
93 void _mi_prim_process_info(mi_process_info_t* pinfo);
94 
95 // Default stderr output. (only for warnings etc. with verbose enabled)
96 // pre: msg != NULL && _mi_strlen(msg) > 0
97 void _mi_prim_out_stderr( const char* msg );
98 
99 // Get an environment variable. (only for options)
// NOTE(review): presumably returns true if the variable was found and its value
// was stored in `result` -- confirm against the implementations.
100 // pre: name != NULL, result != NULL, result_size >= 64
101 bool _mi_prim_getenv(const char* name, char* result, size_t result_size);
102 
103 
104 // Fill the buffer `buf` of length `buf_len` with strong randomness; return `false`
105 // on error or if there is no strong randomization available.
106 bool _mi_prim_random_buf(void* buf, size_t buf_len);
107 
108 // Called on the first thread start; the implementation should ensure that
// `_mi_thread_done` is called on thread termination.
109 void _mi_prim_thread_init_auto_done(void);
110 
111 // Called on process exit; may take action to clean up the resources associated
// with the thread auto done (see `_mi_prim_thread_init_auto_done`).
112 void _mi_prim_thread_done_auto_done(void);
113 
114 // Called when the default heap for a thread changes
// (`heap` is presumably the new default heap -- confirm against the caller).
115 void _mi_prim_thread_associate_default_heap(mi_heap_t* heap);
116 
117 
118 //-------------------------------------------------------------------
119 // Thread id: `_mi_prim_thread_id()`
120 //
121 // Getting the thread id should be performant as it is called in the
122 // fast path of `_mi_free` and we specialize for various platforms as
123 // inlined definitions. Regular code should call `init.c:_mi_thread_id()`.
124 // We only require _mi_prim_thread_id() to return a unique id
125 // for each thread (unequal to zero).
126 //-------------------------------------------------------------------
127 
128 // defined in `init.c`; do not use these directly
129 extern mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from (a thread local variable)
130 extern bool _mi_process_is_initialized;             // has mi_process_init been called?
131 
// Forward declaration; one of the platform specific definitions below is selected by the preprocessor.
132 static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept;
133 
134 #ifdef MI_PRIM_THREAD_ID
135 
_mi_prim_thread_id(void)136 static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
137   return MI_PRIM_THREAD_ID();
138 }
139 
140 #elif defined(_WIN32)
141 
142 #define WIN32_LEAN_AND_MEAN
143 #include <windows.h>
_mi_prim_thread_id(void)144 static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
145   // Windows: works on Intel and ARM in both 32- and 64-bit
146   return (uintptr_t)NtCurrentTeb();
147 }
148 
149 // We use assembly for a fast thread id on the main platforms. The TLS layout depends on
150 // both the OS and libc implementation so we use specific tests for each main platform.
151 // If you test on another platform and it works please send a PR :-)
152 // see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register.
153 #elif defined(__GNUC__) && ( \
154            (defined(__GLIBC__)   && (defined(__x86_64__) || defined(__i386__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__))) \
155         || (defined(__APPLE__)   && (defined(__x86_64__) || defined(__aarch64__))) \
156         || (defined(__BIONIC__)  && (defined(__x86_64__) || defined(__i386__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__))) \
157         || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
158         || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
159       )
160 
mi_prim_tls_slot(size_t slot)161 static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept {
162   void* res;
163   const size_t ofs = (slot*sizeof(void*));
164   #if defined(__i386__)
165     __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86 32-bit always uses GS
166   #elif defined(__APPLE__) && defined(__x86_64__)
167     __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 macOSX uses GS
168   #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
169     __asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x32 ABI
170   #elif defined(__x86_64__)
171     __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 Linux, BSD uses FS
172   #elif defined(__arm__)
173     void** tcb; MI_UNUSED(ofs);
174     __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
175     res = tcb[slot];
176   #elif defined(__aarch64__)
177     void** tcb; MI_UNUSED(ofs);
178     #if defined(__APPLE__) // M1, issue #343
179     __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
180     #else
181     __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
182     #endif
183     res = tcb[slot];
184   #endif
185   return res;
186 }
187 
188 // setting a tls slot is only used on macOS for now
mi_prim_tls_slot_set(size_t slot,void * value)189 static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexcept {
190   const size_t ofs = (slot*sizeof(void*));
191   #if defined(__i386__)
192     __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // 32-bit always uses GS
193   #elif defined(__APPLE__) && defined(__x86_64__)
194     __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 macOS uses GS
195   #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
196     __asm__("movl %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x32 ABI
197   #elif defined(__x86_64__)
198     __asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 Linux, BSD uses FS
199   #elif defined(__arm__)
200     void** tcb; MI_UNUSED(ofs);
201     __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
202     tcb[slot] = value;
203   #elif defined(__aarch64__)
204     void** tcb; MI_UNUSED(ofs);
205     #if defined(__APPLE__) // M1, issue #343
206     __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
207     #else
208     __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
209     #endif
210     tcb[slot] = value;
211   #endif
212 }
213 
_mi_prim_thread_id(void)214 static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
215   #if defined(__BIONIC__)
216     // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id
217     // see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86
218     return (uintptr_t)mi_prim_tls_slot(1);
219   #else
220     // in all our other targets, slot 0 is the thread id
221     // glibc: https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/x86_64/nptl/tls.h
222     // apple: https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L36
223     return (uintptr_t)mi_prim_tls_slot(0);
224   #endif
225 }
226 
227 #else
228 
229 // otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms).
_mi_prim_thread_id(void)230 static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
231   return (uintptr_t)&_mi_heap_default;
232 }
233 
234 #endif
235 
236 
237 
238 /* ----------------------------------------------------------------------------------------
239 The thread local default heap: `mi_prim_get_default_heap()`
240 This is inlined here as it is on the fast path for allocation functions.
241 
242 On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a
243 __thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures
244 that the storage will always be available (allocated on the thread stacks).
245 
246 On some platforms though we cannot use that when overriding `malloc` since the underlying
247 TLS implementation (or the loader) will itself call `malloc` on a first access and recurse.
248 We try to circumvent this in an efficient way:
249 - macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the
250            loader itself calls `malloc` even before the modules are initialized.
251 - OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS).
252 - DragonFly: defaults are working but seem slow compared to FreeBSD (see PR #323)
253 ------------------------------------------------------------------------------------------- */
254 
// Forward declaration; the definition is selected further below, depending on
// which of the MI_TLS_xxx macros is defined here.
255 static inline mi_heap_t* mi_prim_get_default_heap(void);
256 
// The alternative TLS mechanisms are only configured when overriding `malloc`.
257 #if defined(MI_MALLOC_OVERRIDE)
258 #if defined(__APPLE__) // macOS
259   #define MI_TLS_SLOT               89  // seems unused?
260   // #define MI_TLS_RECURSE_GUARD 1
261   // other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89)
262   // see <https://github.com/rweichler/substrate/blob/master/include/pthread_machdep.h>
263 #elif defined(__OpenBSD__)
264   // use end bytes of a name; goes wrong if anyone uses names > 23 characters (pthread specifies 16)
265   // see <https://github.com/openbsd/src/blob/master/lib/libc/include/thread_private.h#L371>
266   #define MI_TLS_PTHREAD_SLOT_OFS   (6*sizeof(int) + 4*sizeof(void*) + 24)
267   // #elif defined(__DragonFly__)
268   // #warning "mimalloc is not working correctly on DragonFly yet."
269   // #define MI_TLS_PTHREAD_SLOT_OFS   (4 + 1*sizeof(void*))  // offset `uniqueid` (also used by gdb?) <https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libthread_xu/thread/thr_private.h#L458>
270 #elif defined(__ANDROID__)
271   // See issue #381
272   #define MI_TLS_PTHREAD
273 #endif
274 #endif
275 
276 
277 #if defined(MI_TLS_SLOT)
278 
mi_prim_get_default_heap(void)279 static inline mi_heap_t* mi_prim_get_default_heap(void) {
280   mi_heap_t* heap = (mi_heap_t*)mi_prim_tls_slot(MI_TLS_SLOT);
281   if mi_unlikely(heap == NULL) {
282     #ifdef __GNUC__
283     __asm(""); // prevent conditional load of the address of _mi_heap_empty
284     #endif
285     heap = (mi_heap_t*)&_mi_heap_empty;
286   }
287   return heap;
288 }
289 
290 #elif defined(MI_TLS_PTHREAD_SLOT_OFS)
291 
mi_prim_tls_pthread_heap_slot(void)292 static inline mi_heap_t** mi_prim_tls_pthread_heap_slot(void) {
293   pthread_t self = pthread_self();
294   #if defined(__DragonFly__)
295   if (self==NULL) return NULL;
296   #endif
297   return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS);
298 }
299 
mi_prim_get_default_heap(void)300 static inline mi_heap_t* mi_prim_get_default_heap(void) {
301   mi_heap_t** pheap = mi_prim_tls_pthread_heap_slot();
302   if mi_unlikely(pheap == NULL) return _mi_heap_main_get();
303   mi_heap_t* heap = *pheap;
304   if mi_unlikely(heap == NULL) return (mi_heap_t*)&_mi_heap_empty;
305   return heap;
306 }
307 
308 #elif defined(MI_TLS_PTHREAD)
309 
310 extern pthread_key_t _mi_heap_default_key;
mi_prim_get_default_heap(void)311 static inline mi_heap_t* mi_prim_get_default_heap(void) {
312   mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key));
313   return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
314 }
315 
316 #else // default using a thread local variable; used on most platforms.
317 
mi_prim_get_default_heap(void)318 static inline mi_heap_t* mi_prim_get_default_heap(void) {
319   #if defined(MI_TLS_RECURSE_GUARD)
320   if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get();
321   #endif
322   return _mi_heap_default;
323 }
324 
325 #endif  // mi_prim_get_default_heap()
326 
327 
328 
329 #endif  // MIMALLOC_PRIM_H
330