/*

Perf trampoline instrumentation
===============================

This file contains instrumentation that allows associating calls to the
CPython eval loop back to the names and filenames of the Python functions
being executed.

Many native performance profilers like the Linux perf tools are only able
to 'see' the C stack when sampling the profiled process. This means that if
we have the following Python code:

    import time
    def foo(n):
        # Some CPU intensive code

    def bar(n):
        foo(n)

    def baz(n):
        bar(n)

    baz(10000000)

A performance profiler that is only able to see native frames will
produce the following backtrace when sampling from foo():

    _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
    _PyEval_Vector
    _PyFunction_Vectorcall
    PyObject_Vectorcall
    call_function

    _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
    _PyEval_EvalFrame
    _PyEval_Vector
    _PyFunction_Vectorcall
    PyObject_Vectorcall
    call_function

    _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
    _PyEval_EvalFrame
    _PyEval_Vector
    _PyFunction_Vectorcall
    PyObject_Vectorcall
    call_function

    ...

    Py_RunMain

Because the profiler can only see the native frames and the native function
that runs the evaluation loop is always the same (_PyEval_EvalFrameDefault),
the profiler and any report generated from its data will not be able to
associate the names of the Python functions and the filenames with those
calls, rendering the results useless from the Python point of view.

To fix this problem, we introduce the concept of a trampoline frame. A
trampoline frame is a piece of code, unique per Python code object, that is
executed before entering the CPython eval loop. This piece of code just
calls the original Python evaluation function (_PyEval_EvalFrameDefault) and
forwards all the arguments it received. In this way, when a profiler samples
frames from the previous example it will see:

    _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
    [Jit compiled code 3]
    _PyEval_Vector
    _PyFunction_Vectorcall
    PyObject_Vectorcall
    call_function

    _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
    [Jit compiled code 2]
    _PyEval_EvalFrame
    _PyEval_Vector
    _PyFunction_Vectorcall
    PyObject_Vectorcall
    call_function

    _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
    [Jit compiled code 1]
    _PyEval_EvalFrame
    _PyEval_Vector
    _PyFunction_Vectorcall
    PyObject_Vectorcall
    call_function

    ...

    Py_RunMain

Every time we generate a unique copy of the trampoline (what we call "[Jit
compiled code N]" here), we record the relationship between the compiled code
and the Python function that is associated with it. Every profiler requires
this information in a different format. For example, the Linux "perf"
profiler requires a file at "/tmp/perf-PID.map" (name and location are not
configurable) with the following format:

    <compiled code address> <compiled code size> <name of the compiled code>

If this file is available when "perf" generates reports, it will automatically
associate every trampoline with the Python function it belongs to, allowing it
to generate reports that include Python information. These reports can then
also be filtered so that *only* Python information appears.
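
For illustration only (the addresses and sizes below are made up, not real
output), the entries written by this file look like:

    7f8b2c3f1000 80 py::foo:/home/user/example.py
    7f8b2c3f1050 80 py::bar:/home/user/example.py
    7f8b2c3f10a0 80 py::baz:/home/user/example.py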

Notice that for this to work, there must be a unique copy of the trampoline
per Python code object even if the code in the trampoline is the same. To
achieve this we have an assembly template in Objects/asm_trampoline.S that is
compiled into the Python executable/shared library. This template generates a
symbol that marks the start of the assembly code and another that marks the
end of the assembly code for the trampoline. Then, every time we need a unique
trampoline for a Python code object, we copy the assembly code into an mmap-ed
area that has executable permissions and we return the start of that area as
our trampoline function.

Asking for an mmap-ed memory area per trampoline would be very wasteful, so we
allocate big arenas of memory in a single mmap call, populate the entire arena
with copies of the trampoline (this allows us to not have to invalidate the
icache for the instructions in the page more than once) and then return the
next available chunk every time someone asks for a new trampoline. We keep a
linked list of arenas in case the current memory arena is exhausted and
another one is needed.
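
The resulting layout of a single arena looks roughly like this (the number of
chunks shown is illustrative; the real values are computed in
new_code_arena() below, where every chunk is the size of the assembly
template plus padding, rounded up to a multiple of 16 bytes):

    start_addr
    |
    v
    +-------------+-------------+----- ... -----+-------------+
    | trampoline  | trampoline  |               | trampoline  |
    | copy 0      | copy 1      |               | copy N-1    |
    +-------------+-------------+----- ... -----+-------------+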

For the best results, Python should be compiled with
CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" as this allows
profilers to unwind using only the frame pointer instead of DWARF debug
information (note that as trampolines are dynamically generated there won't
be any DWARF information available for them).
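
As an example (the exact perf invocation is up to the user and not mandated
by this file), a typical profiling session could look like:

    $ ./configure CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer"
    $ make
    $ perf record -g ./python -X perf my_script.py
    $ perf report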
*/

#include "Python.h"
#include "pycore_ceval.h"         // _PyPerf_Callbacks
#include "pycore_frame.h"
#include "pycore_interp.h"


#ifdef PY_HAVE_PERF_TRAMPOLINE

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>             // mmap()
#include <sys/types.h>
#include <unistd.h>               // sysconf()
#include <sys/time.h>             // gettimeofday()


#if defined(__arm__) || defined(__arm64__) || defined(__aarch64__)
#define PY_HAVE_INVALIDATE_ICACHE

#if defined(__clang__) || defined(__GNUC__)
extern void __clear_cache(void *, void *);
#endif

static void invalidate_icache(char *begin, char *end) {
#if defined(__clang__) || defined(__GNUC__)
    return __clear_cache(begin, end);
#else
    return;
#endif
}
#endif

/* The function pointer is passed as the last argument. The other three
 * arguments are passed in the same order as the function requires. This
 * results in shorter, more efficient ASM code for the trampoline.
 */
typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *,
                                  int throwflag);
typedef PyObject *(*py_trampoline)(PyThreadState *, _PyInterpreterFrame *, int,
                                   py_evaluator);
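
/* Conceptually, every per-code-object trampoline behaves like the following
 * C function (illustrative sketch only; the real trampoline is the assembly
 * template in Objects/asm_trampoline.S, copied once per code object so that
 * profilers see a distinct symbol per Python function):
 *
 *     static PyObject *
 *     example_trampoline(PyThreadState *ts, _PyInterpreterFrame *frame,
 *                        int throwflag, py_evaluator evaluator)
 *     {
 *         return evaluator(ts, frame, throwflag);
 *     }
 */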

extern void *_Py_trampoline_func_start;  // Start of the template of the
                                         // assembly trampoline
extern void *_Py_trampoline_func_end;    // End of the template of the
                                         // assembly trampoline

struct code_arena_st {
    char *start_addr;    // Start of the memory arena
    char *current_addr;  // Address of the current trampoline within the arena
    size_t size;         // Size of the memory arena
    size_t size_left;    // Remaining size of the memory arena
    size_t code_size;    // Size of the code of every trampoline in the arena
    struct code_arena_st
        *prev;           // Pointer to the previous arena, or NULL if this is
                         // the first arena.
};

typedef struct code_arena_st code_arena_t;
typedef struct trampoline_api_st trampoline_api_t;

enum perf_trampoline_type {
    PERF_TRAMPOLINE_UNSET = 0,
    PERF_TRAMPOLINE_TYPE_MAP = 1,
    PERF_TRAMPOLINE_TYPE_JITDUMP = 2,
};

#define perf_status _PyRuntime.ceval.perf.status
#define extra_code_index _PyRuntime.ceval.perf.extra_code_index
#define perf_code_arena _PyRuntime.ceval.perf.code_arena
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api
#define perf_map_file _PyRuntime.ceval.perf.map_file
#define persist_after_fork _PyRuntime.ceval.perf.persist_after_fork
#define perf_trampoline_type _PyRuntime.ceval.perf.perf_trampoline_type

static void
perf_map_write_entry(void *state, const void *code_addr,
                     unsigned int code_size, PyCodeObject *co)
{
    const char *entry = "";
    if (co->co_qualname != NULL) {
        entry = PyUnicode_AsUTF8(co->co_qualname);
    }
    const char *filename = "";
    if (co->co_filename != NULL) {
        filename = PyUnicode_AsUTF8(co->co_filename);
    }
    size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1;
    char *perf_map_entry = (char *) PyMem_RawMalloc(perf_map_entry_size);
    if (perf_map_entry == NULL) {
        return;
    }
    snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename);
    PyUnstable_WritePerfMapEntry(code_addr, code_size, perf_map_entry);
    PyMem_RawFree(perf_map_entry);
}

static void*
perf_map_init_state(void)
{
    PyUnstable_PerfMapState_Init();
    trampoline_api.code_padding = 0;
    perf_trampoline_type = PERF_TRAMPOLINE_TYPE_MAP;
    return NULL;
}

static int
perf_map_free_state(void *state)
{
    PyUnstable_PerfMapState_Fini();
    return 0;
}

_PyPerf_Callbacks _Py_perfmap_callbacks = {
    &perf_map_init_state,
    &perf_map_write_entry,
    &perf_map_free_state,
};


static size_t round_up(int64_t value, int64_t multiple) {
    if (multiple == 0) {
        // Avoid division by zero
        return value;
    }

    int64_t remainder = value % multiple;
    if (remainder == 0) {
        // Value is already a multiple of 'multiple'
        return value;
    }

    // Calculate the difference to the next multiple
    int64_t difference = multiple - remainder;

    // Add the difference to the value
    int64_t rounded_up_value = value + difference;

    return rounded_up_value;
}

// TRAMPOLINE MANAGEMENT API

static int
new_code_arena(void)
{
    // non-trivial programs typically need 64 to 256 kiB.
    size_t mem_size = 4096 * 16;
    assert(mem_size % sysconf(_SC_PAGESIZE) == 0);
    char *memory =
        mmap(NULL,  // address
             mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
             -1,  // fd (not used here)
             0);  // offset (not used here)
    if (memory == MAP_FAILED) {
        PyErr_SetFromErrno(PyExc_OSError);
        PyErr_FormatUnraisable("Failed to create new mmap for perf trampoline");
        perf_status = PERF_STATUS_FAILED;
        return -1;
    }
    void *start = &_Py_trampoline_func_start;
    void *end = &_Py_trampoline_func_end;
    size_t code_size = end - start;
    size_t chunk_size = round_up(code_size + trampoline_api.code_padding, 16);
    // TODO: Check the effect of alignment of the code chunks. Initial investigation
    // showed that this has no effect on performance in x86-64 or aarch64 and the current
    // version has the advantage that the unwinder in GDB can unwind across JIT-ed code.
    //
    // We should check the values in the future and see if there is a
    // measurable performance improvement by rounding trampolines up to 32-bit
    // or 64-bit alignment.

    size_t n_copies = mem_size / chunk_size;
    for (size_t i = 0; i < n_copies; i++) {
        memcpy(memory + i * chunk_size, start, code_size * sizeof(char));
    }
    // Some systems may prevent us from creating executable code on the fly.
    int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC);
    if (res == -1) {
        PyErr_SetFromErrno(PyExc_OSError);
        munmap(memory, mem_size);
        PyErr_FormatUnraisable("Failed to set mmap for perf trampoline to "
                               "PROT_READ | PROT_EXEC");
        return -1;
    }

#ifdef PY_HAVE_INVALIDATE_ICACHE
    // Before the JIT can run a block of code that has been emitted it must invalidate
    // the instruction cache on some platforms like arm and aarch64.
    invalidate_icache(memory, memory + mem_size);
#endif

    code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t));
    if (new_arena == NULL) {
        PyErr_NoMemory();
        munmap(memory, mem_size);
        PyErr_FormatUnraisable("Failed to allocate new code arena struct for perf trampoline");
        return -1;
    }

    new_arena->start_addr = memory;
    new_arena->current_addr = memory;
    new_arena->size = mem_size;
    new_arena->size_left = mem_size;
    new_arena->code_size = code_size;
    new_arena->prev = perf_code_arena;
    perf_code_arena = new_arena;
    return 0;
}

static void
free_code_arenas(void)
{
    code_arena_t *cur = perf_code_arena;
    code_arena_t *prev;
    perf_code_arena = NULL;  // invalidate the static pointer
    while (cur) {
        munmap(cur->start_addr, cur->size);
        prev = cur->prev;
        PyMem_RawFree(cur);
        cur = prev;
    }
}

static inline py_trampoline
code_arena_new_code(code_arena_t *code_arena)
{
    py_trampoline trampoline = (py_trampoline)code_arena->current_addr;
    size_t total_code_size = round_up(code_arena->code_size + trampoline_api.code_padding, 16);
    code_arena->size_left -= total_code_size;
    code_arena->current_addr += total_code_size;
    return trampoline;
}

static inline py_trampoline
compile_trampoline(void)
{
    // Allocate a new arena if we don't have one yet or if the current one
    // does not have room for another trampoline chunk. The NULL check must
    // happen before perf_code_arena is dereferenced.
    if ((perf_code_arena == NULL) ||
        (perf_code_arena->size_left <=
         round_up(perf_code_arena->code_size + trampoline_api.code_padding, 16))) {
        if (new_code_arena() < 0) {
            return NULL;
        }
    }
    assert(perf_code_arena->size_left <= perf_code_arena->size);
    return code_arena_new_code(perf_code_arena);
}

static PyObject *
py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
                        int throw)
{
    if (perf_status == PERF_STATUS_FAILED ||
        perf_status == PERF_STATUS_NO_INIT) {
        goto default_eval;
    }
    PyCodeObject *co = _PyFrame_GetCode(frame);
    py_trampoline f = NULL;
    assert(extra_code_index != -1);
    int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
    if (ret != 0 || f == NULL) {
        // This is the first time we see this code object so we need
        // to compile a trampoline for it.
        py_trampoline new_trampoline = compile_trampoline();
        if (new_trampoline == NULL) {
            goto default_eval;
        }
        trampoline_api.write_state(trampoline_api.state, new_trampoline,
                                   perf_code_arena->code_size, co);
        _PyCode_SetExtra((PyObject *)co, extra_code_index,
                         (void *)new_trampoline);
        f = new_trampoline;
    }
    assert(f != NULL);
    return f(ts, frame, throw, _PyEval_EvalFrameDefault);
default_eval:
    // Something failed, fall back to the default evaluator.
    return _PyEval_EvalFrameDefault(ts, frame, throw);
}
#endif  // PY_HAVE_PERF_TRAMPOLINE

int PyUnstable_PerfTrampoline_CompileCode(PyCodeObject *co)
{
#ifdef PY_HAVE_PERF_TRAMPOLINE
    py_trampoline f = NULL;
    assert(extra_code_index != -1);
    int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
    if (ret != 0 || f == NULL) {
        py_trampoline new_trampoline = compile_trampoline();
        if (new_trampoline == NULL) {
            return 0;
        }
        trampoline_api.write_state(trampoline_api.state, new_trampoline,
                                   perf_code_arena->code_size, co);
        return _PyCode_SetExtra((PyObject *)co, extra_code_index,
                                (void *)new_trampoline);
    }
#endif // PY_HAVE_PERF_TRAMPOLINE
    return 0;
}

int
_PyIsPerfTrampolineActive(void)
{
#ifdef PY_HAVE_PERF_TRAMPOLINE
    PyThreadState *tstate = _PyThreadState_GET();
    return tstate->interp->eval_frame == py_trampoline_evaluator;
#endif
    return 0;
}

void
_PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks)
{
    if (callbacks == NULL) {
        return;
    }
#ifdef PY_HAVE_PERF_TRAMPOLINE
    callbacks->init_state = trampoline_api.init_state;
    callbacks->write_state = trampoline_api.write_state;
    callbacks->free_state = trampoline_api.free_state;
#endif
    return;
}

int
_PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks)
{
    if (callbacks == NULL) {
        return -1;
    }
#ifdef PY_HAVE_PERF_TRAMPOLINE
    if (trampoline_api.state) {
        _PyPerfTrampoline_Fini();
    }
    trampoline_api.init_state = callbacks->init_state;
    trampoline_api.write_state = callbacks->write_state;
    trampoline_api.free_state = callbacks->free_state;
    trampoline_api.state = NULL;
#endif
    return 0;
}

int
_PyPerfTrampoline_Init(int activate)
{
#ifdef PY_HAVE_PERF_TRAMPOLINE
    PyThreadState *tstate = _PyThreadState_GET();
    if (tstate->interp->eval_frame &&
        tstate->interp->eval_frame != py_trampoline_evaluator) {
        PyErr_SetString(PyExc_RuntimeError,
                        "Trampoline cannot be initialized as a custom eval "
                        "frame is already present");
        return -1;
    }
    if (!activate) {
        tstate->interp->eval_frame = NULL;
        perf_status = PERF_STATUS_NO_INIT;
    }
    else {
        tstate->interp->eval_frame = py_trampoline_evaluator;
        if (new_code_arena() < 0) {
            return -1;
        }
        extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
        if (extra_code_index == -1) {
            return -1;
        }
        if (trampoline_api.state == NULL && trampoline_api.init_state != NULL) {
            trampoline_api.state = trampoline_api.init_state();
        }
        perf_status = PERF_STATUS_OK;
    }
#endif
    return 0;
}

int
_PyPerfTrampoline_Fini(void)
{
#ifdef PY_HAVE_PERF_TRAMPOLINE
    if (perf_status != PERF_STATUS_OK) {
        return 0;
    }
    PyThreadState *tstate = _PyThreadState_GET();
    if (tstate->interp->eval_frame == py_trampoline_evaluator) {
        tstate->interp->eval_frame = NULL;
    }
    if (perf_status == PERF_STATUS_OK) {
        trampoline_api.free_state(trampoline_api.state);
        perf_trampoline_type = PERF_TRAMPOLINE_UNSET;
    }
    extra_code_index = -1;
    perf_status = PERF_STATUS_NO_INIT;
#endif
    return 0;
}

void _PyPerfTrampoline_FreeArenas(void) {
#ifdef PY_HAVE_PERF_TRAMPOLINE
    free_code_arenas();
#endif
    return;
}

int
PyUnstable_PerfTrampoline_SetPersistAfterFork(int enable)
{
#ifdef PY_HAVE_PERF_TRAMPOLINE
    persist_after_fork = enable;
    return persist_after_fork;
#endif
    return 0;
}

PyStatus
_PyPerfTrampoline_AfterFork_Child(void)
{
#ifdef PY_HAVE_PERF_TRAMPOLINE
    if (persist_after_fork) {
        if (perf_trampoline_type != PERF_TRAMPOLINE_TYPE_MAP) {
            return PyStatus_Error("Failed to copy perf map file as perf trampoline type is not type map.");
        }
        _PyPerfTrampoline_Fini();
        char filename[256];
        pid_t parent_pid = getppid();
        snprintf(filename, sizeof(filename), "/tmp/perf-%d.map", parent_pid);
        if (PyUnstable_CopyPerfMapFile(filename) != 0) {
            return PyStatus_Error("Failed to copy perf map file.");
        }
    } else {
        // Restart the trampoline in the child process.
        int was_active = _PyIsPerfTrampolineActive();
        _PyPerfTrampoline_Fini();
        if (was_active) {
            _PyPerfTrampoline_Init(1);
        }
    }
#endif
    return PyStatus_Ok();
}