/src/cpython/Python/perf_trampoline.c
Line | Count | Source |
1 | | /* |
2 | | |
3 | | Perf trampoline instrumentation |
4 | | =============================== |
5 | | |
6 | | This file contains instrumentation that makes it possible to associate |
7 | | calls to the CPython eval loop back to the names of the Python |
8 | | functions and filename being executed. |
9 | | |
10 | | Many native performance profilers like the Linux perf tools are |
11 | | only able to 'see' the C stack when sampling from the profiled |
12 | | process. This means that if we have the following python code: |
13 | | |
14 | | import time |
15 | | def foo(n): |
16 | | # Some CPU intensive code |
17 | | |
18 | | def bar(n): |
19 | | foo(n) |
20 | | |
21 | | def baz(n): |
22 | | bar(n) |
23 | | |
24 | | baz(10000000) |
25 | | |
26 | | A performance profiler that is only able to see native frames will |
27 | | produce the following backtrace when sampling from foo(): |
28 | | |
29 | | _PyEval_EvalFrameDefault -----> Evaluation frame of foo() |
30 | | _PyEval_Vector |
31 | | _PyFunction_Vectorcall |
32 | | PyObject_Vectorcall |
33 | | call_function |
34 | | |
35 | | _PyEval_EvalFrameDefault ------> Evaluation frame of bar() |
36 | | _PyEval_EvalFrame |
37 | | _PyEval_Vector |
38 | | _PyFunction_Vectorcall |
39 | | PyObject_Vectorcall |
40 | | call_function |
41 | | |
42 | | _PyEval_EvalFrameDefault -------> Evaluation frame of baz() |
43 | | _PyEval_EvalFrame |
44 | | _PyEval_Vector |
45 | | _PyFunction_Vectorcall |
46 | | PyObject_Vectorcall |
47 | | call_function |
48 | | |
49 | | ... |
50 | | |
51 | | Py_RunMain |
52 | | |
53 | | Because the profiler is only able to see the native frames and the native |
54 | | function that runs the evaluation loop is the same (_PyEval_EvalFrameDefault) |
55 | | then the profiler and any reporter generated by it will not be able to |
56 | | associate the names of the Python functions and the filenames associated with |
57 | | those calls, rendering the results useless in the Python world. |
58 | | |
59 | | To fix this problem, we introduce the concept of a trampoline frame. A |
60 | | trampoline frame is a piece of code that is unique per Python code object that |
61 | | is executed before entering the CPython eval loop. This piece of code just |
62 | | calls the original Python evaluation function (_PyEval_EvalFrameDefault) and |
63 | | forwards all the arguments received. In this way, when a profiler samples |
64 | | frames from the previous example it will see: |
65 | | |
66 | | _PyEval_EvalFrameDefault -----> Evaluation frame of foo() |
67 | | [Jit compiled code 3] |
68 | | _PyEval_Vector |
69 | | _PyFunction_Vectorcall |
70 | | PyObject_Vectorcall |
71 | | call_function |
72 | | |
73 | | _PyEval_EvalFrameDefault ------> Evaluation frame of bar() |
74 | | [Jit compiled code 2] |
75 | | _PyEval_EvalFrame |
76 | | _PyEval_Vector |
77 | | _PyFunction_Vectorcall |
78 | | PyObject_Vectorcall |
79 | | call_function |
80 | | |
81 | | _PyEval_EvalFrameDefault -------> Evaluation frame of baz() |
82 | | [Jit compiled code 1] |
83 | | _PyEval_EvalFrame |
84 | | _PyEval_Vector |
85 | | _PyFunction_Vectorcall |
86 | | PyObject_Vectorcall |
87 | | call_function |
88 | | |
89 | | ... |
90 | | |
91 | | Py_RunMain |
92 | | |
93 | | When we generate every unique copy of the trampoline (what here we called "[Jit |
94 | | compiled code N]") we write the relationship between the compiled code and the |
95 | | Python function that is associated with it. Every profiler requires this |
96 | | information in a different format. For example, the Linux "perf" profiler |
97 | | requires a file in "/tmp/perf-PID.map" (name and location not configurable) |
98 | | with the following format: |
99 | | |
100 | | <compiled code address> <compiled code size> <name of the compiled code> |
101 | | |
102 | | If this file is available when "perf" generates reports, it will automatically |
103 | | associate every trampoline with the Python function that it is associated with |
104 | | allowing it to generate reports that include Python information. These reports |
105 | | then can also be filtered in a way that *only* Python information appears. |
106 | | |
107 | | Notice that for this to work, there must be a unique copy of the trampoline |
108 | | per Python code object even if the code in the trampoline is the same. To |
109 | | achieve this we have an assembly template in Objects/asm_trampoline.S that is |
110 | | compiled into the Python executable/shared library. This template generates a |
111 | | symbol that maps the start of the assembly code and another that marks the end |
112 | | of the assembly code for the trampoline. Then, every time we need a unique |
113 | | trampoline for a Python code object, we copy the assembly code into a mmaped |
114 | | area that has executable permissions and we return the start of that area as |
115 | | our trampoline function. |
116 | | |
117 | | Asking for a mmap-ed memory area for every trampoline is very wasteful so we |
118 | | allocate big arenas of memory in a single mmap call, we populate the entire |
119 | | arena with copies of the trampoline (this allows us to not have to invalidate |
120 | | the icache for the instructions in the page) and then we return the next |
121 | | available chunk every time someone asks for a new trampoline. We keep a linked |
122 | | list of arenas in case the current memory arena is exhausted and another one is |
123 | | needed. |
124 | | |
125 | | For the best results, Python should be compiled with |
126 | | CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" as this allows |
127 | | profilers to unwind using only the frame pointer and not on DWARF debug |
128 | | information (note that as trampolines are dynamically generated there won't be |
129 | | any DWARF information available for them). |
130 | | */ |
131 | | |
132 | | #include "Python.h" |
133 | | #include "pycore_ceval.h" // _PyPerf_Callbacks |
134 | | #include "pycore_interpframe.h" // _PyFrame_GetCode() |
135 | | #include "pycore_mmap.h" // _PyAnnotateMemoryMap() |
136 | | #include "pycore_runtime.h" // _PyRuntime |
137 | | |
138 | | |
139 | | #ifdef PY_HAVE_PERF_TRAMPOLINE |
140 | | |
141 | | #include <fcntl.h> |
142 | | #include <stdio.h> |
143 | | #include <stdlib.h> |
144 | | #include <sys/mman.h> // mmap() |
145 | | #include <sys/types.h> |
146 | | #include <unistd.h> // sysconf() |
147 | | #include <sys/time.h> // gettimeofday() |
148 | | |
149 | | |
150 | | #if defined(__arm__) || defined(__arm64__) || defined(__aarch64__) |
151 | | #define PY_HAVE_INVALIDATE_ICACHE |
152 | | |
#if defined(__clang__) || defined(__GNUC__)
extern void __clear_cache(void *, void *);
#endif

/* Invalidate the instruction cache for the byte range [begin, end).
 *
 * With GCC or Clang this forwards to the compiler-provided __clear_cache();
 * with any other compiler it is a no-op.
 */
static void
invalidate_icache(char *begin, char *end)
{
#if defined(__clang__) || defined(__GNUC__)
    // Plain call: ISO C does not allow `return <expression>;` in a function
    // returning void, even when the expression itself has type void.
    __clear_cache(begin, end);
#else
    (void)begin;
    (void)end;
#endif
}
164 | | #endif |
165 | | |
166 | | /* The function pointer is passed as last argument. The other three arguments |
167 | | * are passed in the same order as the function requires. This results in |
168 | | * shorter, more efficient ASM code for trampoline. |
169 | | */ |
170 | | typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *, |
171 | | int throwflag); |
172 | | typedef PyObject *(*py_trampoline)(PyThreadState *, _PyInterpreterFrame *, int, |
173 | | py_evaluator); |
174 | | |
// Symbols delimiting the assembly trampoline template. Only their addresses
// are used: the template's bytes live between the two.
extern void *_Py_trampoline_func_start;  // Start of the template
extern void *_Py_trampoline_func_end;    // End of the template
179 | | |
/* One mmap-ed region holding trampoline copies. Arenas form a singly linked
 * list through `prev`, newest first. */
struct code_arena_st {
    /* Start of the memory arena */
    char *start_addr;
    /* Address of the current (next to hand out) trampoline within the arena */
    char *current_addr;
    /* Size of the memory arena */
    size_t size;
    /* Remaining size of the memory arena */
    size_t size_left;
    /* Size of the code of every trampoline in the arena */
    size_t code_size;
    /* Pointer to the previous arena, or NULL if this is the first arena */
    struct code_arena_st *prev;
};

typedef struct code_arena_st code_arena_t;
typedef struct trampoline_api_st trampoline_api_t;
192 | | |
/* Which backend is currently recording trampoline information. */
enum perf_trampoline_type {
    PERF_TRAMPOLINE_UNSET = 0,         /* no backend selected */
    PERF_TRAMPOLINE_TYPE_MAP = 1,      /* perf map file backend */
    PERF_TRAMPOLINE_TYPE_JITDUMP = 2,  /* jitdump backend */
};
198 | | |
// Short aliases for the perf trampoline fields stored in
// _PyRuntime.ceval.perf.
#define perf_status           _PyRuntime.ceval.perf.status
#define extra_code_index      _PyRuntime.ceval.perf.extra_code_index
#define perf_code_arena       _PyRuntime.ceval.perf.code_arena
#define trampoline_api        _PyRuntime.ceval.perf.trampoline_api
#define perf_map_file         _PyRuntime.ceval.perf.map_file
#define persist_after_fork    _PyRuntime.ceval.perf.persist_after_fork
#define perf_trampoline_type  _PyRuntime.ceval.perf.perf_trampoline_type
#define prev_eval_frame       _PyRuntime.ceval.perf.prev_eval_frame
#define trampoline_refcount   _PyRuntime.ceval.perf.trampoline_refcount
#define code_watcher_id       _PyRuntime.ceval.perf.code_watcher_id
209 | | |
210 | | static void free_code_arenas(void); |
211 | | |
212 | | static void |
213 | | perf_trampoline_reset_state(void) |
214 | 0 | { |
215 | 0 | free_code_arenas(); |
216 | 0 | if (code_watcher_id >= 0) { |
217 | 0 | PyCode_ClearWatcher(code_watcher_id); |
218 | 0 | code_watcher_id = -1; |
219 | 0 | } |
220 | 0 | extra_code_index = -1; |
221 | 0 | } |
222 | | |
223 | | static int |
224 | | perf_trampoline_code_watcher(PyCodeEvent event, PyCodeObject *co) |
225 | 0 | { |
226 | 0 | if (event != PY_CODE_EVENT_DESTROY) { |
227 | 0 | return 0; |
228 | 0 | } |
229 | 0 | if (extra_code_index == -1) { |
230 | 0 | return 0; |
231 | 0 | } |
232 | 0 | py_trampoline f = NULL; |
233 | 0 | int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f); |
234 | 0 | if (ret != 0 || f == NULL) { |
235 | 0 | return 0; |
236 | 0 | } |
237 | 0 | trampoline_refcount--; |
238 | 0 | if (trampoline_refcount == 0) { |
239 | 0 | perf_trampoline_reset_state(); |
240 | 0 | } |
241 | 0 | return 0; |
242 | 0 | } |
243 | | |
244 | | static void |
245 | | perf_map_write_entry(void *state, const void *code_addr, |
246 | | unsigned int code_size, PyCodeObject *co) |
247 | 0 | { |
248 | 0 | const char *entry = ""; |
249 | 0 | if (co->co_qualname != NULL) { |
250 | 0 | entry = PyUnicode_AsUTF8(co->co_qualname); |
251 | 0 | } |
252 | 0 | const char *filename = ""; |
253 | 0 | if (co->co_filename != NULL) { |
254 | 0 | filename = PyUnicode_AsUTF8(co->co_filename); |
255 | 0 | } |
256 | 0 | size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1; |
257 | 0 | char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size); |
258 | 0 | if (perf_map_entry == NULL) { |
259 | 0 | return; |
260 | 0 | } |
261 | 0 | snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename); |
262 | 0 | PyUnstable_WritePerfMapEntry(code_addr, code_size, perf_map_entry); |
263 | 0 | PyMem_RawFree(perf_map_entry); |
264 | 0 | } |
265 | | |
266 | | static void* |
267 | | perf_map_init_state(void) |
268 | 0 | { |
269 | 0 | PyUnstable_PerfMapState_Init(); |
270 | 0 | trampoline_api.code_padding = 0; |
271 | 0 | trampoline_api.code_alignment = 32; |
272 | 0 | perf_trampoline_type = PERF_TRAMPOLINE_TYPE_MAP; |
273 | 0 | return NULL; |
274 | 0 | } |
275 | | |
/* free_state callback for the perf map backend: finalises the perf map
 * state. `state` is unused. Always returns 0. */
static int
perf_map_free_state(void *state)
{
    (void)state;
    PyUnstable_PerfMapState_Fini();
    return 0;
}
282 | | |
283 | | _PyPerf_Callbacks _Py_perfmap_callbacks = { |
284 | | &perf_map_init_state, |
285 | | &perf_map_write_entry, |
286 | | &perf_map_free_state, |
287 | | }; |
288 | | |
289 | | |
/* Round `value` up to the next multiple of `multiple`.
 *
 * A `multiple` of zero leaves `value` unchanged (and avoids dividing by
 * zero); a `value` that is already a multiple is returned as is.
 */
static size_t
round_up(int64_t value, int64_t multiple)
{
    if (multiple != 0) {
        int64_t excess = value % multiple;
        if (excess != 0) {
            // Bump by whatever is missing to reach the next multiple.
            return value + (multiple - excess);
        }
    }
    return value;
}
310 | | |
311 | | // TRAMPOLINE MANAGEMENT API |
312 | | |
313 | | static int |
314 | | new_code_arena(void) |
315 | 0 | { |
316 | | // non-trivial programs typically need 64 to 256 kiB. |
317 | 0 | size_t mem_size = 4096 * 16; |
318 | 0 | assert(mem_size % sysconf(_SC_PAGESIZE) == 0); |
319 | 0 | char *memory = |
320 | 0 | mmap(NULL, // address |
321 | 0 | mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, |
322 | 0 | -1, // fd (not used here) |
323 | 0 | 0); // offset (not used here) |
324 | 0 | if (memory == MAP_FAILED) { |
325 | 0 | PyErr_SetFromErrno(PyExc_OSError); |
326 | 0 | PyErr_FormatUnraisable("Failed to create new mmap for perf trampoline"); |
327 | 0 | perf_status = PERF_STATUS_FAILED; |
328 | 0 | return -1; |
329 | 0 | } |
330 | 0 | (void)_PyAnnotateMemoryMap(memory, mem_size, "cpython:perf_trampoline"); |
331 | 0 | void *start = &_Py_trampoline_func_start; |
332 | 0 | void *end = &_Py_trampoline_func_end; |
333 | 0 | size_t code_size = end - start; |
334 | 0 | size_t unaligned_size = code_size + trampoline_api.code_padding; |
335 | 0 | size_t chunk_size = round_up(unaligned_size, trampoline_api.code_alignment); |
336 | 0 | assert(chunk_size % trampoline_api.code_alignment == 0); |
337 | | // TODO: Check the effect of alignment of the code chunks. Initial investigation |
338 | | // showed that this has no effect on performance in x86-64 or aarch64 and the current |
339 | | // version has the advantage that the unwinder in GDB can unwind across JIT-ed code. |
340 | | // |
341 | | // We should check the values in the future and see if there is a |
342 | | // measurable performance improvement by rounding trampolines up to 32-bit |
343 | | // or 64-bit alignment. |
344 | |
|
345 | 0 | size_t n_copies = mem_size / chunk_size; |
346 | 0 | for (size_t i = 0; i < n_copies; i++) { |
347 | 0 | memcpy(memory + i * chunk_size, start, code_size * sizeof(char)); |
348 | 0 | } |
349 | | // Some systems may prevent us from creating executable code on the fly. |
350 | 0 | int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC); |
351 | 0 | if (res == -1) { |
352 | 0 | PyErr_SetFromErrno(PyExc_OSError); |
353 | 0 | munmap(memory, mem_size); |
354 | 0 | PyErr_FormatUnraisable("Failed to set mmap for perf trampoline to " |
355 | 0 | "PROT_READ | PROT_EXEC"); |
356 | 0 | return -1; |
357 | 0 | } |
358 | | |
359 | | #ifdef PY_HAVE_INVALIDATE_ICACHE |
360 | | // Before the JIT can run a block of code that has been emitted it must invalidate |
361 | | // the instruction cache on some platforms like arm and aarch64. |
362 | | invalidate_icache(memory, memory + mem_size); |
363 | | #endif |
364 | | |
365 | 0 | code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t)); |
366 | 0 | if (new_arena == NULL) { |
367 | 0 | PyErr_NoMemory(); |
368 | 0 | munmap(memory, mem_size); |
369 | 0 | PyErr_FormatUnraisable("Failed to allocate new code arena struct for perf trampoline"); |
370 | 0 | return -1; |
371 | 0 | } |
372 | | |
373 | 0 | new_arena->start_addr = memory; |
374 | 0 | new_arena->current_addr = memory; |
375 | 0 | new_arena->size = mem_size; |
376 | 0 | new_arena->size_left = mem_size; |
377 | 0 | new_arena->code_size = code_size; |
378 | 0 | new_arena->prev = perf_code_arena; |
379 | 0 | perf_code_arena = new_arena; |
380 | 0 | return 0; |
381 | 0 | } |
382 | | |
383 | | static void |
384 | | free_code_arenas(void) |
385 | 0 | { |
386 | 0 | code_arena_t *cur = perf_code_arena; |
387 | 0 | code_arena_t *prev; |
388 | 0 | perf_code_arena = NULL; // invalid static pointer |
389 | 0 | while (cur) { |
390 | 0 | munmap(cur->start_addr, cur->size); |
391 | 0 | prev = cur->prev; |
392 | 0 | PyMem_RawFree(cur); |
393 | 0 | cur = prev; |
394 | 0 | } |
395 | 0 | } |
396 | | |
397 | | static inline py_trampoline |
398 | | code_arena_new_code(code_arena_t *code_arena) |
399 | 0 | { |
400 | 0 | py_trampoline trampoline = (py_trampoline)code_arena->current_addr; |
401 | 0 | size_t total_code_size = round_up(code_arena->code_size + trampoline_api.code_padding, |
402 | 0 | trampoline_api.code_alignment); |
403 | 0 | assert(total_code_size % trampoline_api.code_alignment == 0); |
404 | 0 | code_arena->size_left -= total_code_size; |
405 | 0 | code_arena->current_addr += total_code_size; |
406 | 0 | return trampoline; |
407 | 0 | } |
408 | | |
409 | | static inline py_trampoline |
410 | | compile_trampoline(void) |
411 | 0 | { |
412 | 0 | size_t total_code_size = round_up(perf_code_arena->code_size + trampoline_api.code_padding, 16); |
413 | 0 | if ((perf_code_arena == NULL) || |
414 | 0 | (perf_code_arena->size_left <= total_code_size)) { |
415 | 0 | if (new_code_arena() < 0) { |
416 | 0 | return NULL; |
417 | 0 | } |
418 | 0 | } |
419 | 0 | assert(perf_code_arena->size_left <= perf_code_arena->size); |
420 | 0 | return code_arena_new_code(perf_code_arena); |
421 | 0 | } |
422 | | |
423 | | static PyObject * |
424 | | py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame, |
425 | | int throw) |
426 | 0 | { |
427 | 0 | if (perf_status == PERF_STATUS_FAILED || |
428 | 0 | perf_status == PERF_STATUS_NO_INIT) { |
429 | 0 | goto default_eval; |
430 | 0 | } |
431 | 0 | PyCodeObject *co = _PyFrame_GetCode(frame); |
432 | 0 | py_trampoline f = NULL; |
433 | 0 | assert(extra_code_index != -1); |
434 | 0 | int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f); |
435 | 0 | if (ret != 0 || f == NULL) { |
436 | | // This is the first time we see this code object so we need |
437 | | // to compile a trampoline for it. |
438 | 0 | py_trampoline new_trampoline = compile_trampoline(); |
439 | 0 | if (new_trampoline == NULL) { |
440 | 0 | goto default_eval; |
441 | 0 | } |
442 | 0 | trampoline_api.write_state(trampoline_api.state, new_trampoline, |
443 | 0 | perf_code_arena->code_size, co); |
444 | 0 | _PyCode_SetExtra((PyObject *)co, extra_code_index, |
445 | 0 | (void *)new_trampoline); |
446 | 0 | trampoline_refcount++; |
447 | 0 | f = new_trampoline; |
448 | 0 | } |
449 | 0 | assert(f != NULL); |
450 | 0 | return f(ts, frame, throw, prev_eval_frame != NULL ? prev_eval_frame : _PyEval_EvalFrameDefault); |
451 | 0 | default_eval: |
452 | | // Something failed, fall back to the default evaluator. |
453 | 0 | if (prev_eval_frame) { |
454 | 0 | return prev_eval_frame(ts, frame, throw); |
455 | 0 | } |
456 | 0 | return _PyEval_EvalFrameDefault(ts, frame, throw); |
457 | 0 | } |
458 | | #endif // PY_HAVE_PERF_TRAMPOLINE |
459 | | |
460 | | int PyUnstable_PerfTrampoline_CompileCode(PyCodeObject *co) |
461 | 0 | { |
462 | 0 | #ifdef PY_HAVE_PERF_TRAMPOLINE |
463 | 0 | py_trampoline f = NULL; |
464 | 0 | assert(extra_code_index != -1); |
465 | 0 | int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f); |
466 | 0 | if (ret != 0 || f == NULL) { |
467 | 0 | py_trampoline new_trampoline = compile_trampoline(); |
468 | 0 | if (new_trampoline == NULL) { |
469 | 0 | return 0; |
470 | 0 | } |
471 | 0 | trampoline_api.write_state(trampoline_api.state, new_trampoline, |
472 | 0 | perf_code_arena->code_size, co); |
473 | 0 | trampoline_refcount++; |
474 | 0 | return _PyCode_SetExtra((PyObject *)co, extra_code_index, |
475 | 0 | (void *)new_trampoline); |
476 | 0 | } |
477 | 0 | #endif // PY_HAVE_PERF_TRAMPOLINE |
478 | 0 | return 0; |
479 | 0 | } |
480 | | |
/* Return non-zero when the current interpreter's frame evaluation function
 * is the perf trampoline evaluator, and 0 otherwise (always 0 when
 * trampoline support is compiled out). */
int
_PyIsPerfTrampolineActive(void)
{
#ifdef PY_HAVE_PERF_TRAMPOLINE
    PyThreadState *tstate = _PyThreadState_GET();
    int active = (tstate->interp->eval_frame == py_trampoline_evaluator);
    return active;
#else
    return 0;
#endif
}
490 | | |
491 | | void |
492 | | _PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks) |
493 | 0 | { |
494 | 0 | if (callbacks == NULL) { |
495 | 0 | return; |
496 | 0 | } |
497 | 0 | #ifdef PY_HAVE_PERF_TRAMPOLINE |
498 | 0 | callbacks->init_state = trampoline_api.init_state; |
499 | 0 | callbacks->write_state = trampoline_api.write_state; |
500 | 0 | callbacks->free_state = trampoline_api.free_state; |
501 | 0 | #endif |
502 | 0 | return; |
503 | 0 | } |
504 | | |
505 | | int |
506 | | _PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks) |
507 | 0 | { |
508 | 0 | if (callbacks == NULL) { |
509 | 0 | return -1; |
510 | 0 | } |
511 | 0 | #ifdef PY_HAVE_PERF_TRAMPOLINE |
512 | 0 | if (trampoline_api.state) { |
513 | 0 | _PyPerfTrampoline_Fini(); |
514 | 0 | } |
515 | 0 | trampoline_api.init_state = callbacks->init_state; |
516 | 0 | trampoline_api.write_state = callbacks->write_state; |
517 | 0 | trampoline_api.free_state = callbacks->free_state; |
518 | 0 | trampoline_api.state = NULL; |
519 | 0 | #endif |
520 | 0 | return 0; |
521 | 0 | } |
522 | | |
/* Activate (`activate` non-zero) or deactivate (`activate` zero) the perf
 * trampoline for the current interpreter.
 *
 * Deactivating reinstalls the previously saved evaluator and sets the status
 * to PERF_STATUS_NO_INIT. Activating is a no-op when the trampoline
 * evaluator is already installed; otherwise it installs the evaluator,
 * requests a code-extra index, initialises the backend state if needed,
 * allocates the first arena and registers the code watcher.
 *
 * Returns 0 on success (or when support is compiled out) and -1 on failure.
 */
int
_PyPerfTrampoline_Init(int activate)
{
#ifdef PY_HAVE_PERF_TRAMPOLINE
    PyThreadState *tstate = _PyThreadState_GET();

    if (code_watcher_id == 0) {
        // Initialize to -1 since 0 is a valid watcher ID
        code_watcher_id = -1;
    }

    if (!activate) {
        _PyInterpreterState_SetEvalFrameFunc(tstate->interp, prev_eval_frame);
        perf_status = PERF_STATUS_NO_INIT;
        return 0;
    }

    if (tstate->interp->eval_frame == py_trampoline_evaluator) {
        // Already active: nothing to set up.
        return 0;
    }

    // Remember whatever evaluator was installed so the trampoline can chain
    // to it, then take over frame evaluation.
    prev_eval_frame = _PyInterpreterState_GetEvalFrameFunc(tstate->interp);
    _PyInterpreterState_SetEvalFrameFunc(tstate->interp, py_trampoline_evaluator);

    extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
    if (extra_code_index == -1) {
        return -1;
    }

    if (trampoline_api.state == NULL && trampoline_api.init_state != NULL) {
        trampoline_api.state = trampoline_api.init_state();
    }

    if (new_code_arena() < 0) {
        return -1;
    }

    code_watcher_id = PyCode_AddWatcher(perf_trampoline_code_watcher);
    if (code_watcher_id < 0) {
        PyErr_FormatUnraisable("Failed to register code watcher for perf trampoline");
        free_code_arenas();
        return -1;
    }

    trampoline_refcount = 1;  // Base refcount held by the system
    perf_status = PERF_STATUS_OK;
#endif
    return 0;
}
561 | | |
/* Shut the perf trampoline down. Does nothing unless the status is
 * PERF_STATUS_OK. Always returns 0.
 *
 * Uninstalls the trampoline evaluator if it is the current one, releases the
 * backend state, marks the status as PERF_STATUS_NO_INIT and drops the base
 * reference on the trampoline memory.
 */
int
_PyPerfTrampoline_Fini(void)
{
#ifdef PY_HAVE_PERF_TRAMPOLINE
    if (perf_status == PERF_STATUS_OK) {
        PyThreadState *tstate = _PyThreadState_GET();
        if (tstate->interp->eval_frame == py_trampoline_evaluator) {
            _PyInterpreterState_SetEvalFrameFunc(tstate->interp, NULL);
        }

        // Status is re-checked here exactly as the original code does.
        if (perf_status == PERF_STATUS_OK) {
            trampoline_api.free_state(trampoline_api.state);
            perf_trampoline_type = PERF_TRAMPOLINE_UNSET;
        }

        // Prevent new trampolines from being created
        perf_status = PERF_STATUS_NO_INIT;

        // Decrement base refcount. If refcount reaches 0, all code objects are
        // already dead so clean up now. Otherwise, watcher remains active to
        // clean up when last code object dies; extra_code_index stays valid so
        // watcher can identify them.
        trampoline_refcount -= 1;
        if (trampoline_refcount == 0) {
            perf_trampoline_reset_state();
        }
    }
#endif
    return 0;
}
591 | | |
/* Store `enable` as the persist-after-fork setting and return the stored
 * value. Returns 0 without storing anything when trampoline support is
 * compiled out. */
int
PyUnstable_PerfTrampoline_SetPersistAfterFork(int enable)
{
#ifdef PY_HAVE_PERF_TRAMPOLINE
    persist_after_fork = enable;
    return persist_after_fork;
#else
    (void)enable;
    return 0;
#endif
}
600 | | |
601 | | PyStatus |
602 | | _PyPerfTrampoline_AfterFork_Child(void) |
603 | 0 | { |
604 | 0 | #ifdef PY_HAVE_PERF_TRAMPOLINE |
605 | 0 | if (persist_after_fork) { |
606 | 0 | if (perf_trampoline_type != PERF_TRAMPOLINE_TYPE_MAP) { |
607 | 0 | return PyStatus_Error("Failed to copy perf map file as perf trampoline type is not type map."); |
608 | 0 | } |
609 | 0 | _PyPerfTrampoline_Fini(); |
610 | 0 | char filename[256]; |
611 | 0 | pid_t parent_pid = getppid(); |
612 | 0 | snprintf(filename, sizeof(filename), "/tmp/perf-%d.map", parent_pid); |
613 | 0 | if (PyUnstable_CopyPerfMapFile(filename) != 0) { |
614 | 0 | return PyStatus_Error("Failed to copy perf map file."); |
615 | 0 | } |
616 | 0 | } else { |
617 | | // Restart trampoline in file in child. |
618 | 0 | int was_active = _PyIsPerfTrampolineActive(); |
619 | 0 | _PyPerfTrampoline_Fini(); |
620 | 0 | if (was_active) { |
621 | 0 | _PyPerfTrampoline_Init(1); |
622 | 0 | } |
623 | 0 | } |
624 | 0 | #endif |
625 | 0 | return PyStatus_Ok(); |
626 | 0 | } |