Coverage Report

Created: 2025-12-14 07:06

/src/cpython/Python/perf_trampoline.c
Line
Count
Source
1
/*
2
3
Perf trampoline instrumentation
4
===============================
5
6
This file contains instrumentation that allows associating
7
calls to the CPython eval loop back to the names of the Python
8
functions and filenames being executed.
9
10
Many native performance profilers like the Linux perf tools are
11
only able to 'see' the C stack when sampling from the profiled
12
process. This means that if we have the following Python code:
13
14
    import time
15
    def foo(n):
16
        # Some CPU intensive code
17
18
    def bar(n):
19
        foo(n)
20
21
    def baz(n):
22
        bar(n)
23
24
    baz(10000000)
25
26
A performance profiler that is only able to see native frames will
27
produce the following backtrace when sampling from foo():
28
29
    _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
30
    _PyEval_Vector
31
    _PyFunction_Vectorcall
32
    PyObject_Vectorcall
33
    call_function
34
35
    _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
36
    _PyEval_EvalFrame
37
    _PyEval_Vector
38
    _PyFunction_Vectorcall
39
    PyObject_Vectorcall
40
    call_function
41
42
    _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
43
    _PyEval_EvalFrame
44
    _PyEval_Vector
45
    _PyFunction_Vectorcall
46
    PyObject_Vectorcall
47
    call_function
48
49
    ...
50
51
    Py_RunMain
52
53
Because the profiler can only see native frames, and the native
54
function that runs the evaluation loop is always the same
55
(_PyEval_EvalFrameDefault), neither the profiler nor any report it
56
produces can associate the names of the Python functions and filenames
57
with those calls, rendering the results useless in the Python world.
58
59
To fix this problem, we introduce the concept of a trampoline frame. A
60
trampoline frame is a piece of code that is unique per Python code object that
61
is executed before entering the CPython eval loop. This piece of code just
62
calls the original Python evaluation function (_PyEval_EvalFrameDefault) and
63
forwards all the arguments received. In this way, when a profiler samples
64
frames from the previous example, it will see:
65
66
    _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
67
    [Jit compiled code 3]
68
    _PyEval_Vector
69
    _PyFunction_Vectorcall
70
    PyObject_Vectorcall
71
    call_function
72
73
    _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
74
    [Jit compiled code 2]
75
    _PyEval_EvalFrame
76
    _PyEval_Vector
77
    _PyFunction_Vectorcall
78
    PyObject_Vectorcall
79
    call_function
80
81
    _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
82
    [Jit compiled code 1]
83
    _PyEval_EvalFrame
84
    _PyEval_Vector
85
    _PyFunction_Vectorcall
86
    PyObject_Vectorcall
87
    call_function
88
89
    ...
90
91
    Py_RunMain
92
93
When we generate each unique copy of the trampoline (what we call "[Jit
94
compiled code N]" here), we record the relationship between the compiled
95
code and the Python function associated with it. Every profiler requires this
96
information in a different format. For example, the Linux "perf" profiler
97
requires a file in "/tmp/perf-PID.map" (name and location not configurable)
98
with the following format:
99
100
    <compiled code address> <compiled code size> <name of the compiled code>
101
102
If this file is available when "perf" generates reports, it will automatically
103
associate every trampoline with its corresponding Python function,
104
allowing it to generate reports that include Python information. These reports
105
can then also be filtered so that *only* Python information appears.
106
107
Notice that for this to work, there must be a unique copy of the trampoline
108
per Python code object, even if the code in the trampoline is the same. To
109
achieve this we have an assembly template in Objects/asm_trampoline.S that is
110
compiled into the Python executable/shared library. This template generates a
111
symbol that marks the start of the assembly code and another that marks the end
112
of the assembly code for the trampoline. Then, every time we need a unique
113
trampoline for a Python code object, we copy the assembly code into a mmap-ed
114
area that has executable permissions and we return the start of that area as
115
our trampoline function.
116
117
Asking for an mmap-ed memory area per trampoline would be very wasteful, so we
118
allocate big arenas of memory in a single mmap call and populate the entire
119
arena with copies of the trampoline (this way the icache for the instructions
120
in the page only has to be invalidated once); we then return the next
121
available chunk every time someone asks for a new trampoline. We keep a linked
122
list of arenas in case the current memory arena is exhausted and another one is
123
needed.
124
125
For the best results, Python should be compiled with
126
CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" as this allows
127
profilers to unwind using only the frame pointer rather than DWARF debug
128
information (note that as trampolines are dynamically generated there won't be
129
any DWARF information available for them).
130
*/
131
132
#include "Python.h"
133
#include "pycore_ceval.h"         // _PyPerf_Callbacks
134
#include "pycore_interpframe.h"   // _PyFrame_GetCode()
135
#include "pycore_mmap.h"          // _PyAnnotateMemoryMap()
136
#include "pycore_runtime.h"       // _PyRuntime
137
138
139
#ifdef PY_HAVE_PERF_TRAMPOLINE
140
141
#include <fcntl.h>
142
#include <stdio.h>
143
#include <stdlib.h>
144
#include <sys/mman.h>             // mmap()
145
#include <sys/types.h>
146
#include <unistd.h>               // sysconf()
147
#include <sys/time.h>             // gettimeofday()
148
149
150
#if defined(__arm__) || defined(__arm64__) || defined(__aarch64__)
151
#define PY_HAVE_INVALIDATE_ICACHE
152
153
#if defined(__clang__) || defined(__GNUC__)
154
extern void __clear_cache(void *, void*);
155
#endif
156
157
static void invalidate_icache(char *begin, char *end) {
158
#if defined(__clang__) || defined(__GNUC__)
159
    return __clear_cache(begin, end);
160
#else
161
    return;
162
#endif
163
}
164
#endif
165
166
/* The function pointer is passed as last argument. The other three arguments
167
 * are passed in the same order as the function requires. This results in
168
 * shorter, more efficient ASM code for the trampoline.
169
 */
170
typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *,
171
                                  int throwflag);
172
typedef PyObject *(*py_trampoline)(PyThreadState *, _PyInterpreterFrame *, int,
173
                                   py_evaluator);
174
175
extern void *_Py_trampoline_func_start;  // Start of the template of the
176
                                         // assembly trampoline
177
extern void *
178
    _Py_trampoline_func_end;  // End of the template of the assembly trampoline
179
180
struct code_arena_st {
181
    char *start_addr;    // Start of the memory arena
182
    char *current_addr;  // Address of the current trampoline within the arena
183
    size_t size;         // Size of the memory arena
184
    size_t size_left;    // Remaining size of the memory arena
185
    size_t code_size;    // Size of the code of every trampoline in the arena
186
    struct code_arena_st
187
        *prev;  // Pointer to the previous arena, or NULL if this is the first arena.
188
};
189
190
typedef struct code_arena_st code_arena_t;
191
typedef struct trampoline_api_st trampoline_api_t;
192
193
enum perf_trampoline_type {
194
    PERF_TRAMPOLINE_UNSET = 0,
195
    PERF_TRAMPOLINE_TYPE_MAP = 1,
196
    PERF_TRAMPOLINE_TYPE_JITDUMP = 2,
197
};
198
199
0
#define perf_status _PyRuntime.ceval.perf.status
200
0
#define extra_code_index _PyRuntime.ceval.perf.extra_code_index
201
0
#define perf_code_arena _PyRuntime.ceval.perf.code_arena
202
0
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api
203
#define perf_map_file _PyRuntime.ceval.perf.map_file
204
0
#define persist_after_fork _PyRuntime.ceval.perf.persist_after_fork
205
0
#define perf_trampoline_type _PyRuntime.ceval.perf.perf_trampoline_type
206
0
#define prev_eval_frame _PyRuntime.ceval.perf.prev_eval_frame
207
208
static void
209
perf_map_write_entry(void *state, const void *code_addr,
210
                         unsigned int code_size, PyCodeObject *co)
211
0
{
212
0
    const char *entry = "";
213
0
    if (co->co_qualname != NULL) {
214
0
        entry = PyUnicode_AsUTF8(co->co_qualname);
215
0
    }
216
0
    const char *filename = "";
217
0
    if (co->co_filename != NULL) {
218
0
        filename = PyUnicode_AsUTF8(co->co_filename);
219
0
    }
220
0
    size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1;
221
0
    char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size);
222
0
    if (perf_map_entry == NULL) {
223
0
        return;
224
0
    }
225
0
    snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename);
226
0
    PyUnstable_WritePerfMapEntry(code_addr, code_size, perf_map_entry);
227
0
    PyMem_RawFree(perf_map_entry);
228
0
}
229
230
static void*
231
perf_map_init_state(void)
232
0
{
233
0
    PyUnstable_PerfMapState_Init();
234
0
    trampoline_api.code_padding = 0;
235
0
    trampoline_api.code_alignment = 32;
236
0
    perf_trampoline_type = PERF_TRAMPOLINE_TYPE_MAP;
237
0
    return NULL;
238
0
}
239
240
static int
241
perf_map_free_state(void *state)
242
0
{
243
0
    PyUnstable_PerfMapState_Fini();
244
0
    return 0;
245
0
}
246
247
_PyPerf_Callbacks _Py_perfmap_callbacks = {
248
    &perf_map_init_state,
249
    &perf_map_write_entry,
250
    &perf_map_free_state,
251
};
252
253
254
0
static size_t round_up(int64_t value, int64_t multiple) {
255
0
    if (multiple == 0) {
256
        // Avoid division by zero
257
0
        return value;
258
0
    }
259
260
0
    int64_t remainder = value % multiple;
261
0
    if (remainder == 0) {
262
        // Value is already a multiple of 'multiple'
263
0
        return value;
264
0
    }
265
266
    // Calculate the difference to the next multiple
267
0
    int64_t difference = multiple - remainder;
268
269
    // Add the difference to the value
270
0
    int64_t rounded_up_value = value + difference;
271
272
0
    return rounded_up_value;
273
0
}
274
275
// TRAMPOLINE MANAGEMENT API
276
277
static int
278
new_code_arena(void)
279
0
{
280
    // non-trivial programs typically need 64 to 256 kiB.
281
0
    size_t mem_size = 4096 * 16;
282
0
    assert(mem_size % sysconf(_SC_PAGESIZE) == 0);
283
0
    char *memory =
284
0
        mmap(NULL,  // address
285
0
             mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
286
0
             -1,  // fd (not used here)
287
0
             0);  // offset (not used here)
288
0
    if (memory == MAP_FAILED) {
289
0
        PyErr_SetFromErrno(PyExc_OSError);
290
0
        PyErr_FormatUnraisable("Failed to create new mmap for perf trampoline");
291
0
        perf_status = PERF_STATUS_FAILED;
292
0
        return -1;
293
0
    }
294
0
    _PyAnnotateMemoryMap(memory, mem_size, "cpython:perf_trampoline");
295
0
    void *start = &_Py_trampoline_func_start;
296
0
    void *end = &_Py_trampoline_func_end;
297
0
    size_t code_size = end - start;
298
0
    size_t unaligned_size = code_size + trampoline_api.code_padding;
299
0
    size_t chunk_size = round_up(unaligned_size, trampoline_api.code_alignment);
300
0
    assert(chunk_size % trampoline_api.code_alignment == 0);
301
    // TODO: Check the effect of alignment of the code chunks. Initial investigation
302
    // showed that this has no effect on performance in x86-64 or aarch64 and the current
303
    // version has the advantage that the unwinder in GDB can unwind across JIT-ed code.
304
    //
305
    // We should check the values in the future and see if there is a
306
    // measurable performance improvement by rounding trampolines up to 32-byte
307
    // or 64-byte alignment.
308
309
0
    size_t n_copies = mem_size / chunk_size;
310
0
    for (size_t i = 0; i < n_copies; i++) {
311
0
        memcpy(memory + i * chunk_size, start, code_size * sizeof(char));
312
0
    }
313
    // Some systems may prevent us from creating executable code on the fly.
314
0
    int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC);
315
0
    if (res == -1) {
316
0
        PyErr_SetFromErrno(PyExc_OSError);
317
0
        munmap(memory, mem_size);
318
0
        PyErr_FormatUnraisable("Failed to set mmap for perf trampoline to "
319
0
                               "PROT_READ | PROT_EXEC");
320
0
        return -1;
321
0
    }
322
323
#ifdef PY_HAVE_INVALIDATE_ICACHE
324
    // Before the JIT can run a block of code that has been emitted, it must
325
    // invalidate the instruction cache on some platforms, such as arm and aarch64.
326
    invalidate_icache(memory, memory + mem_size);
327
#endif
328
329
0
    code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t));
330
0
    if (new_arena == NULL) {
331
0
        PyErr_NoMemory();
332
0
        munmap(memory, mem_size);
333
0
        PyErr_FormatUnraisable("Failed to allocate new code arena struct for perf trampoline");
334
0
        return -1;
335
0
    }
336
337
0
    new_arena->start_addr = memory;
338
0
    new_arena->current_addr = memory;
339
0
    new_arena->size = mem_size;
340
0
    new_arena->size_left = mem_size;
341
0
    new_arena->code_size = code_size;
342
0
    new_arena->prev = perf_code_arena;
343
0
    perf_code_arena = new_arena;
344
0
    return 0;
345
0
}
346
347
static void
348
free_code_arenas(void)
349
0
{
350
0
    code_arena_t *cur = perf_code_arena;
351
0
    code_arena_t *prev;
352
0
    perf_code_arena = NULL;  // invalidate static pointer
353
0
    while (cur) {
354
0
        munmap(cur->start_addr, cur->size);
355
0
        prev = cur->prev;
356
0
        PyMem_RawFree(cur);
357
0
        cur = prev;
358
0
    }
359
0
}
360
361
static inline py_trampoline
362
code_arena_new_code(code_arena_t *code_arena)
363
0
{
364
0
    py_trampoline trampoline = (py_trampoline)code_arena->current_addr;
365
0
    size_t total_code_size = round_up(code_arena->code_size + trampoline_api.code_padding,
366
0
                                  trampoline_api.code_alignment);
367
0
    assert(total_code_size % trampoline_api.code_alignment == 0);
368
0
    code_arena->size_left -= total_code_size;
369
0
    code_arena->current_addr += total_code_size;
370
0
    return trampoline;
371
0
}
372
373
static inline py_trampoline
374
compile_trampoline(void)
375
0
{
376
0
    if ((perf_code_arena == NULL) ||
377
0
        (perf_code_arena->size_left <= round_up(perf_code_arena->code_size +
378
0
             trampoline_api.code_padding, trampoline_api.code_alignment))) {
379
0
        if (new_code_arena() < 0) {
380
0
            return NULL;
381
0
        }
382
0
    }
383
0
    assert(perf_code_arena->size_left <= perf_code_arena->size);
384
0
    return code_arena_new_code(perf_code_arena);
385
0
}
386
387
static PyObject *
388
py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
389
                        int throw)
390
0
{
391
0
    if (perf_status == PERF_STATUS_FAILED ||
392
0
        perf_status == PERF_STATUS_NO_INIT) {
393
0
        goto default_eval;
394
0
    }
395
0
    PyCodeObject *co = _PyFrame_GetCode(frame);
396
0
    py_trampoline f = NULL;
397
0
    assert(extra_code_index != -1);
398
0
    int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
399
0
    if (ret != 0 || f == NULL) {
400
        // This is the first time we see this code object so we need
401
        // to compile a trampoline for it.
402
0
        py_trampoline new_trampoline = compile_trampoline();
403
0
        if (new_trampoline == NULL) {
404
0
            goto default_eval;
405
0
        }
406
0
        trampoline_api.write_state(trampoline_api.state, new_trampoline,
407
0
                                   perf_code_arena->code_size, co);
408
0
        _PyCode_SetExtra((PyObject *)co, extra_code_index,
409
0
                         (void *)new_trampoline);
410
0
        f = new_trampoline;
411
0
    }
412
0
    assert(f != NULL);
413
0
    return f(ts, frame, throw, prev_eval_frame != NULL ? prev_eval_frame : _PyEval_EvalFrameDefault);
414
0
default_eval:
415
    // Something failed, fall back to the default evaluator.
416
0
    if (prev_eval_frame) {
417
0
        return prev_eval_frame(ts, frame, throw);
418
0
    }
419
0
    return _PyEval_EvalFrameDefault(ts, frame, throw);
420
0
}
421
#endif  // PY_HAVE_PERF_TRAMPOLINE
422
423
int PyUnstable_PerfTrampoline_CompileCode(PyCodeObject *co)
424
0
{
425
0
#ifdef PY_HAVE_PERF_TRAMPOLINE
426
0
    py_trampoline f = NULL;
427
0
    assert(extra_code_index != -1);
428
0
    int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
429
0
    if (ret != 0 || f == NULL) {
430
0
        py_trampoline new_trampoline = compile_trampoline();
431
0
        if (new_trampoline == NULL) {
432
0
            return 0;
433
0
        }
434
0
        trampoline_api.write_state(trampoline_api.state, new_trampoline,
435
0
                                   perf_code_arena->code_size, co);
436
0
        return _PyCode_SetExtra((PyObject *)co, extra_code_index,
437
0
                         (void *)new_trampoline);
438
0
    }
439
0
#endif // PY_HAVE_PERF_TRAMPOLINE
440
0
    return 0;
441
0
}
442
443
int
444
_PyIsPerfTrampolineActive(void)
445
0
{
446
0
#ifdef PY_HAVE_PERF_TRAMPOLINE
447
0
    PyThreadState *tstate = _PyThreadState_GET();
448
0
    return tstate->interp->eval_frame == py_trampoline_evaluator;
449
0
#endif
450
0
    return 0;
451
0
}
452
453
void
454
_PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks)
455
0
{
456
0
    if (callbacks == NULL) {
457
0
        return;
458
0
    }
459
0
#ifdef PY_HAVE_PERF_TRAMPOLINE
460
0
    callbacks->init_state = trampoline_api.init_state;
461
0
    callbacks->write_state = trampoline_api.write_state;
462
0
    callbacks->free_state = trampoline_api.free_state;
463
0
#endif
464
0
    return;
465
0
}
466
467
int
468
_PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks)
469
0
{
470
0
    if (callbacks == NULL) {
471
0
        return -1;
472
0
    }
473
0
#ifdef PY_HAVE_PERF_TRAMPOLINE
474
0
    if (trampoline_api.state) {
475
0
        _PyPerfTrampoline_Fini();
476
0
    }
477
0
    trampoline_api.init_state = callbacks->init_state;
478
0
    trampoline_api.write_state = callbacks->write_state;
479
0
    trampoline_api.free_state = callbacks->free_state;
480
0
    trampoline_api.state = NULL;
481
0
#endif
482
0
    return 0;
483
0
}
484
485
int
486
_PyPerfTrampoline_Init(int activate)
487
0
{
488
0
#ifdef PY_HAVE_PERF_TRAMPOLINE
489
0
    PyThreadState *tstate = _PyThreadState_GET();
490
0
    if (!activate) {
491
0
        _PyInterpreterState_SetEvalFrameFunc(tstate->interp, prev_eval_frame);
492
0
        perf_status = PERF_STATUS_NO_INIT;
493
0
    }
494
0
    else if (tstate->interp->eval_frame != py_trampoline_evaluator) {
495
0
        prev_eval_frame = _PyInterpreterState_GetEvalFrameFunc(tstate->interp);
496
0
        _PyInterpreterState_SetEvalFrameFunc(tstate->interp, py_trampoline_evaluator);
497
0
        extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
498
0
        if (extra_code_index == -1) {
499
0
            return -1;
500
0
        }
501
0
        if (trampoline_api.state == NULL && trampoline_api.init_state != NULL) {
502
0
            trampoline_api.state = trampoline_api.init_state();
503
0
        }
504
0
        if (new_code_arena() < 0) {
505
0
            return -1;
506
0
        }
507
0
        perf_status = PERF_STATUS_OK;
508
0
    }
509
0
#endif
510
0
    return 0;
511
0
}
512
513
int
514
_PyPerfTrampoline_Fini(void)
515
0
{
516
0
#ifdef PY_HAVE_PERF_TRAMPOLINE
517
0
    if (perf_status != PERF_STATUS_OK) {
518
0
        return 0;
519
0
    }
520
0
    PyThreadState *tstate = _PyThreadState_GET();
521
0
    if (tstate->interp->eval_frame == py_trampoline_evaluator) {
522
0
        _PyInterpreterState_SetEvalFrameFunc(tstate->interp, NULL);
523
0
    }
524
0
    if (perf_status == PERF_STATUS_OK) {
525
0
        trampoline_api.free_state(trampoline_api.state);
526
0
        perf_trampoline_type = PERF_TRAMPOLINE_UNSET;
527
0
    }
528
0
    extra_code_index = -1;
529
0
    perf_status = PERF_STATUS_NO_INIT;
530
0
#endif
531
0
    return 0;
532
0
}
533
534
0
void _PyPerfTrampoline_FreeArenas(void) {
535
0
#ifdef PY_HAVE_PERF_TRAMPOLINE
536
0
    free_code_arenas();
537
0
#endif
538
0
    return;
539
0
}
540
541
int
542
0
PyUnstable_PerfTrampoline_SetPersistAfterFork(int enable) {
543
0
#ifdef PY_HAVE_PERF_TRAMPOLINE
544
0
    persist_after_fork = enable;
545
0
    return persist_after_fork;
546
0
#endif
547
0
    return 0;
548
0
}
549
550
PyStatus
551
_PyPerfTrampoline_AfterFork_Child(void)
552
0
{
553
0
#ifdef PY_HAVE_PERF_TRAMPOLINE
554
0
    if (persist_after_fork) {
555
0
        if (perf_trampoline_type != PERF_TRAMPOLINE_TYPE_MAP) {
556
0
            return PyStatus_Error("Failed to copy perf map file as perf trampoline type is not type map.");
557
0
        }
558
0
        _PyPerfTrampoline_Fini();
559
0
        char filename[256];
560
0
        pid_t parent_pid = getppid();
561
0
        snprintf(filename, sizeof(filename), "/tmp/perf-%d.map", parent_pid);
562
0
        if (PyUnstable_CopyPerfMapFile(filename) != 0) {
563
0
            return PyStatus_Error("Failed to copy perf map file.");
564
0
        }
565
0
    } else {
566
        // Restart the trampoline in the child process.
567
0
        int was_active = _PyIsPerfTrampolineActive();
568
0
        _PyPerfTrampoline_Fini();
569
0
        if (was_active) {
570
0
            _PyPerfTrampoline_Init(1);
571
0
        }
572
0
    }
573
0
#endif
574
0
    return PyStatus_Ok();
575
0
}