/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;

/* Option defaults. */
#ifdef JEMALLOC_PROF
# define PROF_PREFIX_DEFAULT "jeprof"
#else
# define PROF_PREFIX_DEFAULT ""
#endif
#define LG_PROF_SAMPLE_DEFAULT 19
#define LG_PROF_INTERVAL_DEFAULT -1

/*
 * Hard limit on stack backtrace depth.  The version of prof_backtrace() that
 * is based on __builtin_return_address() necessarily has a hard-coded number
 * of backtrace frame handlers, and should be kept in sync with this setting.
 */
#define PROF_BT_MAX 128

/* Maximum number of backtraces to store in each per thread LRU cache. */
#define PROF_TCMAX 1024

/* Initial hash table size. */
#define PROF_CKH_MINITEMS 64

/* Size of memory buffer to use when writing dump files. */
#define PROF_DUMP_BUFSIZE 65536

/* Size of stack-allocated buffer used by prof_printf(). */
#define PROF_PRINTF_BUFSIZE 128

/*
 * Number of mutexes shared among all ctx's.  No space is allocated for these
 * unless profiling is enabled, so it's okay to over-provision.
 */
#define PROF_NCTX_LOCKS 1024

/*
 * prof_tdata pointers close to NULL are used to encode state information that
 * is used for cleaning up during thread shutdown.
 */
#define PROF_TDATA_STATE_REINCARNATED ((prof_tdata_t *)(uintptr_t)1)
#define PROF_TDATA_STATE_PURGATORY ((prof_tdata_t *)(uintptr_t)2)
#define PROF_TDATA_STATE_MAX PROF_TDATA_STATE_PURGATORY
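
/*
 * Illustrative sketch (not part of the definitions above): code that consumes
 * the thread's prof_tdata pointer typically normalizes these near-NULL
 * sentinels to NULL before use, e.g.:
 *
 *	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
 *		prof_tdata = NULL;
 *
 * prof_sample_accum_update() below performs exactly this check.
 */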

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

struct prof_bt_s {
	/* Backtrace, stored as len program counters. */
	void **vec;
	unsigned len;
};

#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
	prof_bt_t *bt;
	unsigned max;
} prof_unwind_data_t;
#endif

struct prof_cnt_s {
	/*
	 * Profiling counters.  An allocation/deallocation pair can operate on
	 * different prof_thr_cnt_t objects that are linked into the same
	 * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
	 * negative.  In principle it is possible for the *bytes counters to
	 * overflow/underflow, but a general solution would require something
	 * like 128-bit counters; this implementation doesn't bother to solve
	 * that problem.
	 */
	int64_t curobjs;
	int64_t curbytes;
	uint64_t accumobjs;
	uint64_t accumbytes;
};

struct prof_thr_cnt_s {
	/* Linkage into prof_ctx_t's cnts_ql. */
	ql_elm(prof_thr_cnt_t) cnts_link;

	/* Linkage into thread's LRU. */
	ql_elm(prof_thr_cnt_t) lru_link;

	/*
	 * Associated context.  If a thread frees an object that it did not
	 * allocate, it is possible that the context is not cached in the
	 * thread's hash table, in which case it must be able to look up the
	 * context, insert a new prof_thr_cnt_t into the thread's hash table,
	 * and link it into the prof_ctx_t's cnts_ql.
	 */
	prof_ctx_t *ctx;
	/*
	 * Threads use memory barriers to update the counters.  Since there is
	 * only ever one writer, the only challenge is for the reader to get a
	 * consistent read of the counters.
	 *
	 * The writer uses this series of operations:
	 *
	 * 1) Increment epoch to an odd number.
	 * 2) Update counters.
	 * 3) Increment epoch to an even number.
	 *
	 * The reader must ensure 1) that the epoch is even while it reads the
	 * counters, and 2) that the epoch doesn't change between the time it
	 * starts and finishes reading the counters.  (See the reader sketch
	 * following this struct.)
	 */
	unsigned epoch;

	/* Profiling counters. */
	prof_cnt_t cnts;
};
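
/*
 * Illustrative sketch (not part of this header): one way a reader can obtain
 * a consistent snapshot of a prof_thr_cnt_t's counters under the epoch
 * protocol described above.  mb_read() is a hypothetical read barrier named
 * only for exposition (this header itself only uses mb_write()); the retry
 * loop is the essential part.
 *
 *	prof_cnt_t snap;
 *	unsigned start;
 *
 *	do {
 *		start = cnt->epoch;	// Snapshot is valid only if even.
 *		mb_read();
 *		snap = cnt->cnts;	// Copy counters.
 *		mb_read();
 *		// Retry if a write began or completed during the copy.
 *	} while ((start & 1U) != 0 || cnt->epoch != start);
 */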

struct prof_ctx_s {
	/* Associated backtrace. */
	prof_bt_t *bt;

	/* Protects nlimbo, cnt_merged, and cnts_ql. */
	malloc_mutex_t *lock;

	/*
	 * Number of threads that currently cause this ctx to be in a state of
	 * limbo due to one of:
	 * - Initializing per thread counters associated with this ctx.
	 * - Preparing to destroy this ctx.
	 * - Dumping a heap profile that includes this ctx.
	 * nlimbo must be 1 (single destroyer) in order to safely destroy the
	 * ctx.
	 */
	unsigned nlimbo;

	/* Temporary storage for summation during dump. */
	prof_cnt_t cnt_summed;

	/* When threads exit, they merge their stats into cnt_merged. */
	prof_cnt_t cnt_merged;

	/*
	 * List of profile counters, one for each thread that has allocated in
	 * this context.
	 */
	ql_head(prof_thr_cnt_t) cnts_ql;

	/* Linkage for list of contexts to be dumped. */
	ql_elm(prof_ctx_t) dump_link;
};
typedef ql_head(prof_ctx_t) prof_ctx_list_t;

struct prof_tdata_s {
	/*
	 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
	 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
	 * objects.  Other threads may read the prof_thr_cnt_t contents, but no
	 * others will ever write them.
	 *
	 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
	 * counter data into the associated prof_ctx_t objects, and unlink/free
	 * the prof_thr_cnt_t objects.
	 */
	ckh_t bt2cnt;

	/* LRU for contents of bt2cnt. */
	ql_head(prof_thr_cnt_t) lru_ql;

	/* Backtrace vector, used for calls to prof_backtrace(). */
	void **vec;

	/* Sampling state. */
	uint64_t prng_state;
	uint64_t bytes_until_sample;

	/* State used to avoid dumping while operating on prof internals. */
	bool enq;
	bool enq_idump;
	bool enq_gdump;
};

#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool opt_prof;
/*
 * Even if opt_prof is true, sampling can be temporarily disabled by setting
 * opt_prof_active to false.  No locking is used when updating opt_prof_active,
 * so there are no guarantees regarding how long it will take for all threads
 * to notice state changes.
 */
extern bool opt_prof_active;
extern size_t opt_lg_prof_sample;	/* lg(mean bytes between samples). */
extern ssize_t opt_lg_prof_interval;	/* lg(prof_interval). */
extern bool opt_prof_gdump;		/* High-water memory dumping. */
extern bool opt_prof_final;		/* Final profile dumping. */
extern bool opt_prof_leak;		/* Dump leak summary at exit. */
extern bool opt_prof_accum;		/* Report cumulative bytes. */
extern char opt_prof_prefix[
    /* Minimize memory bloat for non-prof builds. */
#ifdef JEMALLOC_PROF
    PATH_MAX +
#endif
    1];

/*
 * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 * profile dump when it reaches this threshold.  The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, and the interval will be a
 * maximum of approximately (prof_interval * narenas).
 */
extern uint64_t prof_interval;
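
/*
 * Worked example (illustrative, assuming prof_interval == 2^opt_lg_prof_interval
 * per the lg(prof_interval) relationship noted above): with
 * opt_lg_prof_interval == 30, prof_interval is 1 GiB, so each arena triggers a
 * dump after roughly 1 GiB of allocation, and with 4 arenas the worst-case gap
 * between dumps is about 4 GiB of total allocation.
 */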

void	bt_init(prof_bt_t *bt, void **vec);
void	prof_backtrace(prof_bt_t *bt);
prof_thr_cnt_t	*prof_lookup(prof_bt_t *bt);
#ifdef JEMALLOC_JET
size_t	prof_bt_count(void);
typedef int (prof_dump_open_t)(bool, const char *);
extern prof_dump_open_t *prof_dump_open;
#endif
void	prof_idump(void);
bool	prof_mdump(const char *filename);
void	prof_gdump(void);
prof_tdata_t	*prof_tdata_init(void);
void	prof_tdata_cleanup(void *arg);
void	prof_boot0(void);
void	prof_boot1(void);
bool	prof_boot2(void);
void	prof_prefork(void);
void	prof_postfork_parent(void);
void	prof_postfork_child(void);
void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);

#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

#define PROF_ALLOC_PREP(size, ret) do { \
	prof_tdata_t *prof_tdata; \
	prof_bt_t bt; \
\
	assert(size == s2u(size)); \
\
	if (!opt_prof_active || \
	    prof_sample_accum_update(size, false, &prof_tdata)) { \
		ret = (prof_thr_cnt_t *)(uintptr_t)1U; \
	} else { \
		bt_init(&bt, prof_tdata->vec); \
		prof_backtrace(&bt); \
		ret = prof_lookup(&bt); \
	} \
} while (0)
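
/*
 * Illustrative usage sketch (hedged; not jemalloc's actual allocation path):
 * a caller samples an allocation by pairing PROF_ALLOC_PREP() with
 * prof_malloc().  allocate() stands in for whatever routine actually obtains
 * the memory.
 *
 *	size_t usize = s2u(size);
 *	prof_thr_cnt_t *cnt;
 *	void *p;
 *
 *	PROF_ALLOC_PREP(usize, cnt);
 *	if (cnt == NULL)
 *		return (NULL);		// prof_lookup() failed (OOM).
 *	p = allocate(usize);		// Hypothetical allocation routine.
 *	if (p == NULL)
 *		return (NULL);
 *	prof_malloc(p, usize, cnt);	// Record (or skip) the sample.
 */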

#ifndef JEMALLOC_ENABLE_INLINE
malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)

prof_tdata_t *prof_tdata_get(bool create);
bool prof_sample_accum_update(size_t size, bool commit,
    prof_tdata_t **prof_tdata_out);
prof_ctx_t *prof_ctx_get(const void *ptr);
void prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
void prof_malloc_record_object(const void *ptr, size_t usize,
    prof_thr_cnt_t *cnt);
void prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt);
void prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
    size_t old_usize, prof_ctx_t *old_ctx);
void prof_free(const void *ptr, size_t size);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
malloc_tsd_externs(prof_tdata, prof_tdata_t *)
malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL,
    prof_tdata_cleanup)

JEMALLOC_INLINE prof_tdata_t *
prof_tdata_get(bool create)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);

	prof_tdata = *prof_tdata_tsd_get();
	if (create && prof_tdata == NULL)
		prof_tdata = prof_tdata_init();

	return (prof_tdata);
}

JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
	prof_ctx_t *ret;
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		ret = arena_prof_ctx_get(ptr);
	} else
		ret = huge_prof_ctx_get(ptr);

	return (ret);
}

JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
{
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		arena_prof_ctx_set(ptr, ctx);
	} else
		huge_prof_ctx_set(ptr, ctx);
}

JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size, bool commit,
    prof_tdata_t **prof_tdata_out)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);

	prof_tdata = prof_tdata_get(true);
	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
		prof_tdata = NULL;

	if (prof_tdata_out != NULL)
		*prof_tdata_out = prof_tdata;

	if (prof_tdata == NULL)
		return (true);

	if (prof_tdata->bytes_until_sample >= size) {
		if (commit)
			prof_tdata->bytes_until_sample -= size;
		return (true);
	} else {
		/* Compute new sample threshold. */
		if (commit)
			prof_sample_threshold_update(prof_tdata);
		return (false);
	}
}

JEMALLOC_INLINE void
prof_malloc_record_object(const void *ptr, size_t usize, prof_thr_cnt_t *cnt)
{

	prof_ctx_set(ptr, cnt->ctx);

	cnt->epoch++;
	/*********/
	mb_write();
	/*********/
	cnt->cnts.curobjs++;
	cnt->cnts.curbytes += usize;
	if (opt_prof_accum) {
		cnt->cnts.accumobjs++;
		cnt->cnts.accumbytes += usize;
	}
	/*********/
	mb_write();
	/*********/
	cnt->epoch++;
	/*********/
	mb_write();
	/*********/
}

JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt)
{

	cassert(config_prof);
	assert(ptr != NULL);
	assert(usize == isalloc(ptr, true));

	if (prof_sample_accum_update(usize, true, NULL)) {
		/*
		 * Don't sample.  For malloc()-like allocation, it is
		 * always possible to tell in advance how large an
		 * object's usable size will be, so there should never
		 * be a difference between the usize passed to
		 * PROF_ALLOC_PREP() and prof_malloc().
		 */
		assert((uintptr_t)cnt == (uintptr_t)1U);
	}

	if ((uintptr_t)cnt > (uintptr_t)1U)
		prof_malloc_record_object(ptr, usize, cnt);
	else
		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
}

JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
    size_t old_usize, prof_ctx_t *old_ctx)
{
	prof_thr_cnt_t *told_cnt;

	cassert(config_prof);
	assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

	if (ptr != NULL) {
		assert(usize == isalloc(ptr, true));
		if (prof_sample_accum_update(usize, true, NULL)) {
			/*
			 * Don't sample.  The usize passed to
			 * PROF_ALLOC_PREP() was larger than what
			 * actually got allocated, so a backtrace was
			 * captured for this allocation, even though
			 * its actual usize was insufficient to cross
			 * the sample threshold.
			 */
			cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
		}
	}

	if ((uintptr_t)old_ctx > (uintptr_t)1U) {
		told_cnt = prof_lookup(old_ctx->bt);
		if (told_cnt == NULL) {
			/*
			 * It's too late to propagate OOM for this realloc(),
			 * so operate directly on old_ctx->cnt_merged.
			 */
			malloc_mutex_lock(old_ctx->lock);
			old_ctx->cnt_merged.curobjs--;
			old_ctx->cnt_merged.curbytes -= old_usize;
			malloc_mutex_unlock(old_ctx->lock);
			told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
		}
	} else
		told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, cnt->ctx);
		cnt->epoch++;
	} else if (ptr != NULL)
		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U) {
		told_cnt->cnts.curobjs--;
		told_cnt->cnts.curbytes -= old_usize;
	}
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += usize;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += usize;
		}
	}
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U)
		cnt->epoch++;
	/*********/
	mb_write();	/* Not strictly necessary. */
}

JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
{
	prof_ctx_t *ctx = prof_ctx_get(ptr);

	cassert(config_prof);

	if ((uintptr_t)ctx > (uintptr_t)1) {
		prof_thr_cnt_t *tcnt;
		assert(size == isalloc(ptr, true));
		tcnt = prof_lookup(ctx->bt);

		if (tcnt != NULL) {
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
			tcnt->cnts.curobjs--;
			tcnt->cnts.curbytes -= size;
			/*********/
			mb_write();
			/*********/
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
		} else {
			/*
			 * OOM during free() cannot be propagated, so operate
			 * directly on ctx->cnt_merged.
			 */
			malloc_mutex_lock(ctx->lock);
			ctx->cnt_merged.curobjs--;
			ctx->cnt_merged.curbytes -= size;
			malloc_mutex_unlock(ctx->lock);
		}
	}
}
#endif

#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/
