1 /*
2
3 Perf trampoline instrumentation
4 ===============================
5
6 This file contains instrumentation to allow to associate
7 calls to the CPython eval loop back to the names of the Python
8 functions and filename being executed.
9
10 Many native performance profilers like the Linux perf tools are
11 only available to 'see' the C stack when sampling from the profiled
12 process. This means that if we have the following python code:
13
14 import time
15 def foo(n):
16 # Some CPU intensive code
17
18 def bar(n):
19 foo(n)
20
21 def baz(n):
22 bar(n)
23
24 baz(10000000)
25
26 A performance profiler that is only able to see native frames will
27 produce the following backtrace when sampling from foo():
28
29 _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
30 _PyEval_Vector
31 _PyFunction_Vectorcall
32 PyObject_Vectorcall
33 call_function
34
35 _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
36 _PyEval_EvalFrame
37 _PyEval_Vector
38 _PyFunction_Vectorcall
39 PyObject_Vectorcall
40 call_function
41
42 _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
43 _PyEval_EvalFrame
44 _PyEval_Vector
45 _PyFunction_Vectorcall
46 PyObject_Vectorcall
47 call_function
48
49 ...
50
51 Py_RunMain
52
53 Because the profiler is only able to see the native frames and the native
54 function that runs the evaluation loop is the same (_PyEval_EvalFrameDefault)
55 then the profiler and any reporter generated by it will not be able to
56 associate the names of the Python functions and the filenames associated with
57 those calls, rendering the results useless in the Python world.
58
59 To fix this problem, we introduce the concept of a trampoline frame. A
60 trampoline frame is a piece of code that is unique per Python code object that
61 is executed before entering the CPython eval loop. This piece of code just
62 calls the original Python evaluation function (_PyEval_EvalFrameDefault) and
63 forwards all the arguments received. In this way, when a profiler samples
frames from the previous example it will see:
65
66 _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
67 [Jit compiled code 3]
68 _PyEval_Vector
69 _PyFunction_Vectorcall
70 PyObject_Vectorcall
71 call_function
72
73 _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
74 [Jit compiled code 2]
75 _PyEval_EvalFrame
76 _PyEval_Vector
77 _PyFunction_Vectorcall
78 PyObject_Vectorcall
79 call_function
80
81 _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
82 [Jit compiled code 1]
83 _PyEval_EvalFrame
84 _PyEval_Vector
85 _PyFunction_Vectorcall
86 PyObject_Vectorcall
87 call_function
88
89 ...
90
91 Py_RunMain
92
93 When we generate every unique copy of the trampoline (what here we called "[Jit
94 compiled code N]") we write the relationship between the compiled code and the
95 Python function that is associated with it. Every profiler requires this
96 information in a different format. For example, the Linux "perf" profiler
97 requires a file in "/tmp/perf-PID.map" (name and location not configurable)
98 with the following format:
99
100 <compiled code address> <compiled code size> <name of the compiled code>
101
102 If this file is available when "perf" generates reports, it will automatically
103 associate every trampoline with the Python function that it is associated with
104 allowing it to generate reports that include Python information. These reports
105 then can also be filtered in a way that *only* Python information appears.
106
Notice that for this to work, there must be a unique copy of the trampoline
per Python code object even if the code in the trampoline is the same. To
achieve this we have an assembly template in Objects/asm_trampoline.S that is
compiled into the Python executable/shared library. This template generates a
symbol that marks the start of the assembly code and another that marks the end
of the assembly code for the trampoline. Then, every time we need a unique
trampoline for a Python code object, we copy the assembly code into a mmaped
area that has executable permissions and we return the start of that area as
our trampoline function.
116
Asking for a mmap-ed memory area for every trampoline is very wasteful so we
allocate big arenas of memory in a single mmap call, we populate the entire
arena with copies of the trampoline (this allows us to not have to invalidate
the icache for the instructions in the page) and then we return the next
available chunk every time someone asks for a new trampoline. We keep a linked
list of arenas in case the current memory arena is exhausted and another one is
needed.
124
For the best results, Python should be compiled with
CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" as this allows
profilers to unwind using only the frame pointer and not on DWARF debug
information (note that as trampolines are dynamically generated there won't be
any DWARF information available for them).
130 */
131
132 #include "Python.h"
133 #include "pycore_ceval.h" // _PyPerf_Callbacks
134 #include "pycore_frame.h"
135 #include "pycore_interp.h"
136
137
138 #ifdef PY_HAVE_PERF_TRAMPOLINE
139
140 #include <fcntl.h>
141 #include <stdio.h>
142 #include <stdlib.h>
143 #include <sys/mman.h> // mmap()
144 #include <sys/types.h>
145 #include <unistd.h> // sysconf()
146 #include <sys/time.h> // gettimeofday()
147
148
#if defined(__arm__) || defined(__arm64__) || defined(__aarch64__)
#define PY_HAVE_INVALIDATE_ICACHE

#if defined(__clang__) || defined(__GNUC__)
extern void __clear_cache(void *, void*);
#endif

/* Flush the instruction cache for the address range [begin, end).

   On ARM/AArch64 freshly written machine code must be explicitly
   invalidated before it can be executed.  Without compiler support for
   __clear_cache this is a no-op.

   Fix: the original body did `return __clear_cache(...)`, i.e. returned
   a void expression from a void function, which is a constraint
   violation in strict ISO C (accepted only as a GNU extension). */
static void invalidate_icache(char *begin, char *end) {
#if defined(__clang__) || defined(__GNUC__)
    __clear_cache(begin, end);
#else
    // No cache-maintenance primitive available; silence unused warnings.
    (void)begin;
    (void)end;
#endif
}
#endif
164
/* The function pointer is passed as last argument. The other three arguments
 * are passed in the same order as the function requires. This results in
 * shorter, more efficient ASM code for trampoline.
 */
typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *,
                                  int throwflag);
typedef PyObject *(*py_trampoline)(PyThreadState *, _PyInterpreterFrame *, int,
                                   py_evaluator);

extern void *_Py_trampoline_func_start;  // Start of the template of the
                                         // assembly trampoline
extern void *
    _Py_trampoline_func_end;  // End of the template of the assembly trampoline

// One mmap-ed, executable memory arena pre-filled with trampoline copies.
// Arenas form a singly linked list with the newest arena at the head.
struct code_arena_st {
    char *start_addr;    // Start of the memory arena
    char *current_addr;  // Address of the next free trampoline slot in the arena
    size_t size;         // Size of the memory arena
    size_t size_left;    // Remaining size of the memory arena
    size_t code_size;    // Size of the code of every trampoline in the arena
    struct code_arena_st
        *prev;  // Pointer to the previous arena or NULL if this is the first arena.
};

typedef struct code_arena_st code_arena_t;
typedef struct trampoline_api_st trampoline_api_t;

// How trampoline locations are published to the profiler (names suggest
// the plain /tmp/perf-PID.map format vs. the jitdump format).
enum perf_trampoline_type {
    PERF_TRAMPOLINE_UNSET = 0,
    PERF_TRAMPOLINE_TYPE_MAP = 1,
    PERF_TRAMPOLINE_TYPE_JITDUMP = 2,
};

// Convenience aliases for the perf-trampoline state stored in _PyRuntime.
#define perf_status _PyRuntime.ceval.perf.status
#define extra_code_index _PyRuntime.ceval.perf.extra_code_index
#define perf_code_arena _PyRuntime.ceval.perf.code_arena
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api
#define perf_map_file _PyRuntime.ceval.perf.map_file
#define persist_after_fork _PyRuntime.ceval.perf.persist_after_fork
#define perf_trampoline_type _PyRuntime.ceval.perf.perf_trampoline_type
205
206 static void
perf_map_write_entry(void * state,const void * code_addr,unsigned int code_size,PyCodeObject * co)207 perf_map_write_entry(void *state, const void *code_addr,
208 unsigned int code_size, PyCodeObject *co)
209 {
210 const char *entry = "";
211 if (co->co_qualname != NULL) {
212 entry = PyUnicode_AsUTF8(co->co_qualname);
213 }
214 const char *filename = "";
215 if (co->co_filename != NULL) {
216 filename = PyUnicode_AsUTF8(co->co_filename);
217 }
218 size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1;
219 char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size);
220 if (perf_map_entry == NULL) {
221 return;
222 }
223 snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename);
224 PyUnstable_WritePerfMapEntry(code_addr, code_size, perf_map_entry);
225 PyMem_RawFree(perf_map_entry);
226 }
227
228 static void*
perf_map_init_state(void)229 perf_map_init_state(void)
230 {
231 PyUnstable_PerfMapState_Init();
232 trampoline_api.code_padding = 0;
233 perf_trampoline_type = PERF_TRAMPOLINE_TYPE_MAP;
234 return NULL;
235 }
236
// Backend hook: tear down the perf-map writer.  `state` is ignored
// because perf_map_init_state() never allocates any.  Always succeeds.
static int
perf_map_free_state(void *state)
{
    (void)state;
    PyUnstable_PerfMapState_Fini();
    return 0;
}
243
// Default callback set: publish trampoline locations in the
// /tmp/perf-PID.map format understood by Linux perf (see the comment
// at the top of this file).
_PyPerf_Callbacks _Py_perfmap_callbacks = {
    &perf_map_init_state,
    &perf_map_write_entry,
    &perf_map_free_state,
};
249
250
/* Round `value` up to the next multiple of `multiple`.

   A `multiple` of 0, or a `value` that is already an exact multiple,
   returns `value` unchanged.  Callers pass non-negative values. */
static size_t
round_up(int64_t value, int64_t multiple)
{
    if (multiple == 0) {
        // Nothing to round to; also avoids division by zero below.
        return value;
    }
    int64_t remainder = value % multiple;
    if (remainder == 0) {
        // Already aligned.
        return value;
    }
    // Bump up by whatever is missing to reach the next multiple.
    return value + (multiple - remainder);
}
271
272 // TRAMPOLINE MANAGEMENT API
273
/* Allocate a new executable arena, pre-filled with copies of the
   trampoline template, and push it onto the front of the runtime's
   arena list (perf_code_arena).

   Returns 0 on success and -1 on failure; failures are reported as
   unraisable errors, and an mmap failure additionally sets
   perf_status = PERF_STATUS_FAILED. */
static int
new_code_arena(void)
{
    // non-trivial programs typically need 64 to 256 kiB.
    size_t mem_size = 4096 * 16;
    assert(mem_size % sysconf(_SC_PAGESIZE) == 0);
    char *memory =
        mmap(NULL,  // address
             mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
             -1,  // fd (not used here)
             0);  // offset (not used here)
    if (memory == MAP_FAILED) {
        PyErr_SetFromErrno(PyExc_OSError);
        PyErr_FormatUnraisable("Failed to create new mmap for perf trampoline");
        perf_status = PERF_STATUS_FAILED;
        return -1;
    }
    // Template boundaries exported by the assembly trampoline object file.
    void *start = &_Py_trampoline_func_start;
    void *end = &_Py_trampoline_func_end;
    size_t code_size = end - start;
    size_t chunk_size = round_up(code_size + trampoline_api.code_padding, 16);
    // TODO: Check the effect of alignment of the code chunks. Initial investigation
    // showed that this has no effect on performance in x86-64 or aarch64 and the current
    // version has the advantage that the unwinder in GDB can unwind across JIT-ed code.
    //
    // We should check the values in the future and see if there is a
    // measurable performance improvement by rounding trampolines up to 32-bit
    // or 64-bit alignment.

    // Fill the whole arena up front so the instruction cache only has to
    // be invalidated once per arena rather than once per trampoline.
    size_t n_copies = mem_size / chunk_size;
    for (size_t i = 0; i < n_copies; i++) {
        memcpy(memory + i * chunk_size, start, code_size * sizeof(char));
    }
    // Some systems may prevent us from creating executable code on the fly.
    int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC);
    if (res == -1) {
        PyErr_SetFromErrno(PyExc_OSError);
        munmap(memory, mem_size);
        PyErr_FormatUnraisable("Failed to set mmap for perf trampoline to "
                               "PROT_READ | PROT_EXEC");
        return -1;
    }

#ifdef PY_HAVE_INVALIDATE_ICACHE
    // Before the JIT can run a block of code that has been emitted it must invalidate
    // the instruction cache on some platforms like arm and aarch64.
    invalidate_icache(memory, memory + mem_size);
#endif

    code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t));
    if (new_arena == NULL) {
        PyErr_NoMemory();
        munmap(memory, mem_size);
        PyErr_FormatUnraisable("Failed to allocate new code arena struct for perf trampoline");
        return -1;
    }

    new_arena->start_addr = memory;
    new_arena->current_addr = memory;
    new_arena->size = mem_size;
    new_arena->size_left = mem_size;
    new_arena->code_size = code_size;
    new_arena->prev = perf_code_arena;  // newest arena becomes the list head
    perf_code_arena = new_arena;
    return 0;
}
340
341 static void
free_code_arenas(void)342 free_code_arenas(void)
343 {
344 code_arena_t *cur = perf_code_arena;
345 code_arena_t *prev;
346 perf_code_arena = NULL; // invalid static pointer
347 while (cur) {
348 munmap(cur->start_addr, cur->size);
349 prev = cur->prev;
350 PyMem_RawFree(cur);
351 cur = prev;
352 }
353 }
354
355 static inline py_trampoline
code_arena_new_code(code_arena_t * code_arena)356 code_arena_new_code(code_arena_t *code_arena)
357 {
358 py_trampoline trampoline = (py_trampoline)code_arena->current_addr;
359 size_t total_code_size = round_up(code_arena->code_size + trampoline_api.code_padding, 16);
360 code_arena->size_left -= total_code_size;
361 code_arena->current_addr += total_code_size;
362 return trampoline;
363 }
364
365 static inline py_trampoline
compile_trampoline(void)366 compile_trampoline(void)
367 {
368 size_t total_code_size = round_up(perf_code_arena->code_size + trampoline_api.code_padding, 16);
369 if ((perf_code_arena == NULL) ||
370 (perf_code_arena->size_left <= total_code_size)) {
371 if (new_code_arena() < 0) {
372 return NULL;
373 }
374 }
375 assert(perf_code_arena->size_left <= perf_code_arena->size);
376 return code_arena_new_code(perf_code_arena);
377 }
378
/* Frame evaluator installed while the perf trampoline is active.

   Routes execution through a per-code-object trampoline so native
   profilers can attribute samples to Python functions; falls back to
   _PyEval_EvalFrameDefault() whenever the trampoline machinery is not
   usable. */
static PyObject *
py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
                        int throw)
{
    if (perf_status == PERF_STATUS_FAILED ||
        perf_status == PERF_STATUS_NO_INIT) {
        goto default_eval;
    }
    PyCodeObject *co = _PyFrame_GetCode(frame);
    py_trampoline f = NULL;
    assert(extra_code_index != -1);
    // The trampoline for this code object (if any) is cached in the
    // code object's co_extra slot.
    int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
    if (ret != 0 || f == NULL) {
        // This is the first time we see this code object so we need
        // to compile a trampoline for it.
        py_trampoline new_trampoline = compile_trampoline();
        if (new_trampoline == NULL) {
            goto default_eval;
        }
        // Tell the active backend about the new code region.
        trampoline_api.write_state(trampoline_api.state, new_trampoline,
                                   perf_code_arena->code_size, co);
        // NOTE(review): the _PyCode_SetExtra() result is ignored here; if
        // it fails, a trampoline is simply recompiled on the next call.
        _PyCode_SetExtra((PyObject *)co, extra_code_index,
                         (void *)new_trampoline);
        f = new_trampoline;
    }
    assert(f != NULL);
    // The trampoline forwards our arguments to the real evaluator,
    // passed as its last parameter.
    return f(ts, frame, throw, _PyEval_EvalFrameDefault);
default_eval:
    // Something failed, fall back to the default evaluator.
    return _PyEval_EvalFrameDefault(ts, frame, throw);
}
410 #endif // PY_HAVE_PERF_TRAMPOLINE
411
/* Eagerly compile a trampoline for `co` (public unstable API).

   Unlike the lazy path in py_trampoline_evaluator(), this lets callers
   pre-register code objects.  Returns 0 on success (or when trampoline
   support is compiled out, or when arena allocation fails — this is
   best-effort); otherwise returns the _PyCode_SetExtra() result. */
int PyUnstable_PerfTrampoline_CompileCode(PyCodeObject *co)
{
#ifdef PY_HAVE_PERF_TRAMPOLINE
    py_trampoline f = NULL;
    assert(extra_code_index != -1);
    int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
    if (ret != 0 || f == NULL) {
        // No cached trampoline yet; compile and register one.
        py_trampoline new_trampoline = compile_trampoline();
        if (new_trampoline == NULL) {
            return 0;
        }
        trampoline_api.write_state(trampoline_api.state, new_trampoline,
                                   perf_code_arena->code_size, co);
        return _PyCode_SetExtra((PyObject *)co, extra_code_index,
                                (void *)new_trampoline);
    }
#endif // PY_HAVE_PERF_TRAMPOLINE
    return 0;
}
431
/* Report whether the current interpreter evaluates frames through the
   perf trampoline.  Always 0 when trampoline support is compiled out. */
int
_PyIsPerfTrampolineActive(void)
{
#ifdef PY_HAVE_PERF_TRAMPOLINE
    PyThreadState *tstate = _PyThreadState_GET();
    if (tstate->interp->eval_frame == py_trampoline_evaluator) {
        return 1;
    }
#endif
    return 0;
}
441
442 void
_PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks * callbacks)443 _PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks)
444 {
445 if (callbacks == NULL) {
446 return;
447 }
448 #ifdef PY_HAVE_PERF_TRAMPOLINE
449 callbacks->init_state = trampoline_api.init_state;
450 callbacks->write_state = trampoline_api.write_state;
451 callbacks->free_state = trampoline_api.free_state;
452 #endif
453 return;
454 }
455
456 int
_PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks * callbacks)457 _PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks)
458 {
459 if (callbacks == NULL) {
460 return -1;
461 }
462 #ifdef PY_HAVE_PERF_TRAMPOLINE
463 if (trampoline_api.state) {
464 _PyPerfTrampoline_Fini();
465 }
466 trampoline_api.init_state = callbacks->init_state;
467 trampoline_api.write_state = callbacks->write_state;
468 trampoline_api.free_state = callbacks->free_state;
469 trampoline_api.state = NULL;
470 #endif
471 return 0;
472 }
473
/* Install (activate != 0) or deactivate (activate == 0) the trampoline
   frame evaluator for the current interpreter.

   Returns 0 on success, -1 on error with a Python exception set. */
int
_PyPerfTrampoline_Init(int activate)
{
#ifdef PY_HAVE_PERF_TRAMPOLINE
    PyThreadState *tstate = _PyThreadState_GET();
    if (tstate->interp->eval_frame &&
        tstate->interp->eval_frame != py_trampoline_evaluator) {
        // Refuse to replace a foreign eval_frame hook.
        PyErr_SetString(PyExc_RuntimeError,
                        "Trampoline cannot be initialized as a custom eval "
                        "frame is already present");
        return -1;
    }
    if (!activate) {
        tstate->interp->eval_frame = NULL;
        perf_status = PERF_STATUS_NO_INIT;
    }
    else {
        tstate->interp->eval_frame = py_trampoline_evaluator;
        // Pre-allocate the first arena so the first compiled code object
        // does not pay the mmap cost inside the evaluator.
        if (new_code_arena() < 0) {
            return -1;
        }
        // Reserve the co_extra slot used to cache each code object's
        // trampoline.
        extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
        if (extra_code_index == -1) {
            return -1;
        }
        if (trampoline_api.state == NULL && trampoline_api.init_state != NULL) {
            // Let the backend set up its writer state exactly once.
            trampoline_api.state = trampoline_api.init_state();
        }
        perf_status = PERF_STATUS_OK;
    }
#endif
    return 0;
}
507
/* Tear down the perf trampoline: restore the default frame evaluator
   and release the backend writer state.  Arena memory is released
   separately by _PyPerfTrampoline_FreeArenas().  Always returns 0.

   Fix: the original re-checked `perf_status == PERF_STATUS_OK` after
   the early return already guaranteed it — dead condition removed.  A
   NULL guard on free_state protects against backends registered
   without one. */
int
_PyPerfTrampoline_Fini(void)
{
#ifdef PY_HAVE_PERF_TRAMPOLINE
    if (perf_status != PERF_STATUS_OK) {
        // Never initialized (or already torn down): nothing to do.
        return 0;
    }
    PyThreadState *tstate = _PyThreadState_GET();
    if (tstate->interp->eval_frame == py_trampoline_evaluator) {
        tstate->interp->eval_frame = NULL;
    }
    if (trampoline_api.free_state != NULL) {
        trampoline_api.free_state(trampoline_api.state);
    }
    perf_trampoline_type = PERF_TRAMPOLINE_UNSET;
    extra_code_index = -1;
    perf_status = PERF_STATUS_NO_INIT;
#endif
    return 0;
}
528
/* Release all mmap-ed trampoline arenas.  No-op when trampoline
   support is compiled out. */
void
_PyPerfTrampoline_FreeArenas(void)
{
#ifdef PY_HAVE_PERF_TRAMPOLINE
    free_code_arenas();
#endif
}
535
/* Set whether the perf map should be preserved across fork() (public
   unstable API).  Returns the newly stored setting, or 0 when
   trampoline support is compiled out. */
int
PyUnstable_PerfTrampoline_SetPersistAfterFork(int enable)
{
#ifdef PY_HAVE_PERF_TRAMPOLINE
    persist_after_fork = enable;
    return persist_after_fork;
#endif
    return 0;
}
544
/* Re-establish perf trampoline state in a freshly forked child.

   With persist_after_fork set, the parent's /tmp/perf-<ppid>.map file
   is copied for this process (only supported by the plain map format);
   otherwise the trampoline is torn down and, if it was active in the
   parent, re-initialized so the child writes its own map file. */
PyStatus
_PyPerfTrampoline_AfterFork_Child(void)
{
#ifdef PY_HAVE_PERF_TRAMPOLINE
    if (persist_after_fork) {
        if (perf_trampoline_type != PERF_TRAMPOLINE_TYPE_MAP) {
            return PyStatus_Error("Failed to copy perf map file as perf trampoline type is not type map.");
        }
        _PyPerfTrampoline_Fini();
        char filename[256];
        pid_t parent_pid = getppid();
        // The parent's map file follows the fixed perf naming scheme.
        snprintf(filename, sizeof(filename), "/tmp/perf-%d.map", parent_pid);
        if (PyUnstable_CopyPerfMapFile(filename) != 0) {
            return PyStatus_Error("Failed to copy perf map file.");
        }
    } else {
        // Restart the trampoline in the child so it gets its own map file.
        int was_active = _PyIsPerfTrampolineActive();
        _PyPerfTrampoline_Fini();
        if (was_active) {
            _PyPerfTrampoline_Init(1);
        }
    }
#endif
    return PyStatus_Ok();
}
571