Coverage Report

Created: 2026-05-30 06:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Python/perf_jit_trampoline.c
Line
Count
Source
1
/*
2
 * Python Perf Trampoline Support - JIT Dump Implementation
3
 *
4
 * This file implements the perf jitdump API for Python's performance profiling
5
 * integration. It allows perf (Linux performance analysis tool) to understand
6
 * and profile dynamically generated Python bytecode by creating JIT dump files
7
 * that perf can inject into its analysis.
8
 *
9
 *
10
 * IMPORTANT: This file exports specific callback functions that are part of
11
 * Python's internal API. Do not modify the function signatures or behavior
12
 * of exported functions without coordinating with the Python core team.
13
 *
14
 * Usually the binary and libraries are mapped in separate regions like below:
15
 *
16
 *   address ->
17
 *    --+---------------------+--//--+---------------------+--
18
 *      | .text | .data | ... |      | .text | .data | ... |
19
 *    --+---------------------+--//--+---------------------+--
20
 *          myprog                      libc.so
21
 *
22
 * So it'd be easy and straight-forward to find a mapped binary or library from an
23
 * address.
24
 *
25
 * But for JIT code, the code arena only cares about the code section. But the
26
 * resulting DSOs (which are generated by perf inject -j) contain ELF headers and
27
 * unwind info too. Then it'd generate following address space with synthesized
28
 * MMAP events. Let's say it has a sample between address B and C.
29
 *
30
 *                                                sample
31
 *                                                  |
32
 *   address ->                         A       B   v   C
33
 *   ---------------------------------------------------------------------------------------------------
34
 *   /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
35
 *   /tmp/jitted-PID-1.so           | (headers) | .text | unwind info |
36
 *   /tmp/jitted-PID-2.so                   | (headers) | .text | unwind info |
37
 *     ...
38
 *   ---------------------------------------------------------------------------------------------------
39
 *
40
 * If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see
41
 * the unwind info. If it maps both .text section and unwind sections, the sample
42
 * could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing
43
 * which one is right. So to make perf happy we have non-overlapping ranges for each
44
 * DSO:
45
 *
46
 *   address ->
47
 *   -------------------------------------------------------------------------------------------------------
48
 *   /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
49
 *   /tmp/jitted-PID-1.so                         | (headers) | .text | unwind info |
50
 *   /tmp/jitted-PID-2.so                                               | (headers) | .text | unwind info |
51
 *     ...
52
 *   -------------------------------------------------------------------------------------------------------
53
 *
54
 * As the trampolines are constant, we add a constant padding but in general the padding needs to have the
55
 * size of the unwind info rounded to 16 bytes. In general, for our trampolines this is 0x50
56
 */
57
58
59
60
#include "Python.h"
61
#include "pycore_ceval.h"         // _PyPerf_Callbacks
62
#include "pycore_frame.h"
63
#include "pycore_interp.h"
64
#include "pycore_mmap.h"          // _PyAnnotateMemoryMap()
65
#include "pycore_jit_unwind.h"
66
#include "pycore_runtime.h"       // _PyRuntime
67
68
#ifdef PY_HAVE_PERF_TRAMPOLINE
69
70
/* Standard library includes for perf jitdump implementation */
71
#if defined(__linux__)
72
#  include <elf.h>                // ELF architecture constants
73
#endif
74
#include <fcntl.h>                // File control operations
75
#include <stdio.h>                // Standard I/O operations
76
#include <stdlib.h>               // Standard library functions
77
#include <string.h>               // memcpy, strlen
78
#include <sys/mman.h>             // Memory mapping functions (mmap)
79
#include <sys/types.h>            // System data types
80
#include <unistd.h>               // System calls (sysconf, getpid)
81
#include <sys/time.h>             // Time functions (gettimeofday)
82
#if defined(__linux__)
83
#  include <sys/syscall.h>        // System call interface
84
#endif
85
86
// =============================================================================
87
//                           CONSTANTS AND CONFIGURATION
88
// =============================================================================
89
90
/*
91
 * Memory layout considerations for perf jitdump:
92
 *
93
 * Perf expects non-overlapping memory regions for each JIT-compiled function.
94
 * When perf processes the jitdump file, it creates synthetic DSO (Dynamic
95
 * Shared Object) files that contain:
96
 * - ELF headers
97
 * - .text section (actual machine code)
98
 * - Unwind information (for stack traces)
99
 *
100
 * To ensure proper address space layout, we add padding between code regions.
101
 * This prevents address conflicts when perf maps the synthesized DSOs.
102
 *
103
 * Memory layout example:
104
 * /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding]
105
 * /tmp/jitted-PID-1.so:                                       [headers][.text][unwind_info][padding]
106
 *
107
 * The padding size is now calculated automatically during initialization
108
 * based on the actual unwind information requirements.
109
 */
110
111
112
/* These constants are defined inside <elf.h>, which we can't use outside of linux. */
113
#if !defined(__linux__)
/*
 * <elf.h> is only included on Linux (see the includes above), so on other
 * hosts define the single ELF e_machine value matching the build target.
 */
#  if defined(__i386__) || defined(_M_IX86)
#    define EM_386      3      /* 32-bit x86 */
#  elif defined(__arm__) || defined(_M_ARM)
#    define EM_ARM      40     /* 32-bit ARM */
#  elif defined(__x86_64__) || defined(_M_X64)
#    define EM_X86_64   62     /* x86-64 */
#  elif defined(__aarch64__)
#    define EM_AARCH64  183    /* 64-bit ARM */
#  elif defined(__riscv)
#    define EM_RISCV    243    /* RISC-V */
#  endif
#endif
126
127
/* Convenient access to the global trampoline API state */
128
0
/* Shorthand for the runtime-wide perf trampoline API state. */
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api

/* Local type aliases */
typedef uint64_t uword;              /* 64-bit unsigned; used in this file for code addresses and sizes */
typedef const char* CodeComments;    /* pointer to read-only character data */

/* Size unit: one mebibyte, used when sizing the stdio buffer of the dump file */
#define MB (1024 * 1024)
136
137
// =============================================================================
138
//                        ARCHITECTURE-SPECIFIC DEFINITIONS
139
// =============================================================================
140
141
/*
142
 * Returns the ELF machine architecture constant for the current platform.
143
 * This is required for the jitdump header to correctly identify the target
144
 * architecture for perf processing.
145
 *
146
 */
147
0
static uint64_t GetElfMachineArchitecture(void) {
    /*
     * Select the ELF e_machine constant for the build target. Exactly one
     * preprocessor branch survives; the value ends up in the jitdump header
     * (see perf_map_jit_write_header).
     */
#if defined(__x86_64__) || defined(_M_X64)
    const uint64_t machine = EM_X86_64;
#elif defined(__i386__) || defined(_M_IX86)
    const uint64_t machine = EM_386;
#elif defined(__aarch64__)
    const uint64_t machine = EM_AARCH64;
#elif defined(__arm__) || defined(_M_ARM)
    const uint64_t machine = EM_ARM;
#elif defined(__riscv)
    const uint64_t machine = EM_RISCV;
#else
    Py_UNREACHABLE();  // No known architecture macro matched
    const uint64_t machine = 0;
#endif
    return machine;
}
163
164
// =============================================================================
165
//                           PERF JITDUMP DATA STRUCTURES
166
// =============================================================================
167
168
/*
169
 * Perf jitdump file format structures
170
 *
171
 * These structures define the binary format that perf expects for JIT dump files.
172
 * The format is documented in the Linux perf tools source code and must match
173
 * exactly for proper perf integration.
174
 */
175
176
/*
177
 * Jitdump file header - written once at the beginning of each jitdump file
178
 * Contains metadata about the process and jitdump format version
179
 */
180
typedef struct {
    uint32_t magic;              // File signature; the writer stores 0x4A695444 ("JiTD")
    uint32_t version;            // Format version; the writer stores 1
    uint32_t size;               // sizeof(Header)
    uint32_t elf_mach_target;    // ELF machine id from GetElfMachineArchitecture()
    uint32_t reserved;           // Written as 0
    uint32_t process_id;         // PID of the process producing the dump
    uint64_t time_stamp;         // Wall-clock creation time (microseconds)
    uint64_t flags;              // Written as 0
} Header;
190
191
/*
192
 * Perf event types supported by the jitdump format
193
 * Each event type has a corresponding structure format
194
 */
195
enum PerfEvent {
    /* Record kinds, numbered consecutively from 0. */
    PerfLoad = 0,           // New code region (written by this file)
    PerfMove,               // 1: code relocated
    PerfDebugInfo,          // 2: debug information
    PerfClose,              // 3: end of JIT session
    PerfUnwindingInfo       // 4: unwind data (written by this file)
};
202
203
/*
204
 * Base event structure - common header for all perf events
205
 * Every event in the jitdump file starts with this structure
206
 */
207
struct BaseEvent {
    uint32_t event;         // One of enum PerfEvent
    uint32_t size;          // Byte length of the whole record, this prefix included
    uint64_t time_stamp;    // Monotonic nanoseconds at the time the record is built
};
212
213
/*
214
 * Code load event - indicates a new JIT-compiled function is available
215
 * This is the most important event type for Python profiling
216
 */
217
typedef struct {
218
    struct BaseEvent base;   // Common event header
219
    uint32_t process_id;     // Process ID where code was generated
220
#if defined(__APPLE__)
221
    uint64_t thread_id;      // Thread ID where code was generated
222
#else
223
    uint32_t thread_id;      // Thread ID where code was generated
224
#endif
225
    uint64_t vma;            // Virtual memory address where code is loaded
226
    uint64_t code_address;   // Address of the actual machine code
227
    uint64_t code_size;      // Size of the machine code in bytes
228
    uint64_t code_id;        // Unique identifier for this code region
229
    /* Followed by:
230
     * - null-terminated function name string
231
     * - raw machine code bytes
232
     */
233
} CodeLoadEvent;
234
235
/*
236
 * Code unwinding information event - provides DWARF data for stack traces
237
 * Essential for proper stack unwinding during profiling
238
 */
239
typedef struct {
240
    struct BaseEvent base;      // Common event header
241
    uint64_t unwind_data_size;  // Size of the unwinding data
242
    uint64_t eh_frame_hdr_size; // Size of the EH frame header
243
    uint64_t mapped_size;       // Total mapped size (with padding)
244
    /* Followed by:
245
     * - EH frame header
246
     * - DWARF unwinding information
247
     * - Padding to alignment boundary
248
     */
249
} CodeUnwindingInfoEvent;
250
251
/*
252
 * EH Frame Header structure for DWARF unwinding
253
 *
254
 * This header provides metadata about the .eh_frame data that follows.
255
 * It uses PC-relative and data-relative encodings to keep the synthesized
256
 * DSO self-contained when perf injects it.
257
 */
258
typedef struct __attribute__((packed)) {
    uint8_t version;            // The writer stores 1
    uint8_t eh_frame_ptr_enc;   // Encoding of eh_frame_ptr (sdata4 | pcrel)
    uint8_t fde_count_enc;      // Encoding of eh_fde_count (udata4)
    uint8_t table_enc;          // Encoding of the table entry (sdata4 | datarel)
    int32_t eh_frame_ptr;       // Signed offset back to the start of the .eh_frame bytes
    uint32_t eh_fde_count;      // Number of table entries; the writer stores 1
    int32_t from;               // Single table entry, first half -- presumably the code start; confirm against .eh_frame_hdr layout
    int32_t to;                 // Single table entry, second half -- offset of the FDE (the bytes after the CIE)
} EhFrameHeader;
/* Packed layout: 4 one-byte fields followed by 4 four-byte fields. */
_Static_assert(sizeof(EhFrameHeader) == 20, "EhFrameHeader layout mismatch");
269
270
// =============================================================================
271
//                              GLOBAL STATE MANAGEMENT
272
// =============================================================================
273
274
/*
275
 * Global state for the perf jitdump implementation
276
 *
277
 * This structure maintains all the state needed for generating jitdump files.
278
 * It's designed as a singleton since there's typically only one jitdump file
279
 * per Python process.
280
 */
281
typedef struct {
282
    FILE* perf_map;          // File handle for the jitdump file
283
    PyMutex map_lock;        // Thread synchronization lock
284
    void* mapped_buffer;     // Memory-mapped region (signals perf we're active)
285
    size_t mapped_size;      // Size of the mapped region
286
    uint32_t code_id;        // Counter for unique code region identifiers
287
    uint64_t build_id_salt;  // Per-process salt for unique synthetic DSOs
288
} PerfMapJitState;
289
290
/* Global singleton instance */
291
static PerfMapJitState perf_jit_map_state;
292
293
// =============================================================================
294
//                              TIME UTILITIES
295
// =============================================================================
296
297
/* Time conversion constant */
298
/* Number of nanoseconds in one second (1e9); scales tv_sec to nanoseconds. */
static const intptr_t nanoseconds_per_second = 1000000000;
299
300
/*
301
 * Get current monotonic time in nanoseconds
302
 *
303
 * Monotonic time is preferred for event timestamps because it's not affected
304
 * by system clock adjustments. This ensures consistent timing relationships
305
 * between events even if the system clock is changed.
306
 *
307
 * Returns: Current monotonic time in nanoseconds since an arbitrary epoch
308
 */
309
0
static int64_t get_current_monotonic_ticks(void) {
310
0
    struct timespec ts;
311
0
    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
312
0
        Py_UNREACHABLE();  // Should never fail on supported systems
313
0
        return 0;
314
0
    }
315
316
    /* Convert to nanoseconds for maximum precision */
317
0
    int64_t result = ts.tv_sec;
318
0
    result *= nanoseconds_per_second;
319
0
    result += ts.tv_nsec;
320
0
    return result;
321
0
}
322
323
/*
324
 * Get current wall clock time in microseconds
325
 *
326
 * Used for the jitdump file header timestamp. Unlike monotonic time,
327
 * this represents actual wall clock time that can be correlated with
328
 * other system events.
329
 *
330
 * Returns: Current time in microseconds since Unix epoch
331
 */
332
0
static int64_t get_current_time_microseconds(void) {
    /*
     * Return the wall-clock time from gettimeofday() in microseconds.
     *
     * Returns 0 if gettimeofday() reports an error. A failing system call
     * is an exceptional-but-reachable condition, so it must not be marked
     * with Py_UNREACHABLE() (undefined behaviour in release builds).
     */
    struct timeval tv;
    if (gettimeofday(&tv, NULL) < 0) {
        return 0;
    }
    /* Widen seconds to 64 bits before scaling to microseconds. */
    return ((int64_t)(tv.tv_sec) * 1000000) + tv.tv_usec;
}
340
341
// =============================================================================
342
//                              FILE I/O UTILITIES
343
// =============================================================================
344
345
/*
346
 * Write data to the jitdump file with error handling
347
 *
348
 * This function ensures that all data is written to the file, handling
349
 * partial writes that can occur with large buffers or when the system
350
 * is under load.
351
 *
352
 * Args:
353
 *   buffer: Pointer to data to write
354
 *   size: Number of bytes to write
355
 */
356
0
static void perf_map_jit_write_fully(const void* buffer, size_t size) {
357
0
    FILE* out_file = perf_jit_map_state.perf_map;
358
0
    const char* ptr = (const char*)(buffer);
359
360
0
    while (size > 0) {
361
0
        const size_t written = fwrite(ptr, 1, size, out_file);
362
0
        if (written == 0) {
363
0
            Py_UNREACHABLE();  // Write failure - should be very rare
364
0
            break;
365
0
        }
366
0
        size -= written;
367
0
        ptr += written;
368
0
    }
369
0
}
370
371
/*
372
 * Write the jitdump file header
373
 *
374
 * The header must be written exactly once at the beginning of each jitdump
375
 * file. It provides metadata that perf uses to parse the rest of the file.
376
 *
377
 * Args:
378
 *   pid: Process ID to include in the header
379
 *   out_file: File handle to write to (currently unused, uses global state)
380
 */
381
0
static void perf_map_jit_write_header(int pid, FILE* out_file) {
382
0
    Header header;
383
384
    /* Initialize header with required values */
385
0
    header.magic = 0x4A695444;                    // "JiTD" magic number
386
0
    header.version = 1;                           // Current jitdump version
387
0
    header.size = sizeof(Header);                 // Header size for validation
388
0
    header.elf_mach_target = GetElfMachineArchitecture();  // Target architecture
389
0
    header.reserved = 0;                          // padding reserved for future use
390
0
    header.process_id = pid;                      // Process identifier
391
0
    header.time_stamp = get_current_time_microseconds();   // Creation time
392
0
    header.flags = 0;                             // No special flags currently used
393
394
0
    perf_map_jit_write_fully(&header, sizeof(header));
395
0
}
396
397
// =============================================================================
398
//                              JITDUMP INITIALIZATION
399
// =============================================================================
400
401
/*
402
 * Initialize the perf jitdump interface
403
 *
404
 * This function sets up everything needed to generate jitdump files:
405
 * 1. Creates the jitdump file with a unique name
406
 * 2. Maps the first page to signal perf that we're using the interface
407
 * 3. Writes the jitdump header
408
 * 4. Initializes synchronization primitives
409
 *
410
 * The memory mapping is crucial - perf detects jitdump files by scanning
411
 * for processes that have mapped files matching the pattern /tmp/jit-*.dump
412
 *
413
 * Returns: Pointer to initialized state, or NULL on failure
414
 */
415
0
static void* perf_map_jit_init(void) {
416
0
    PyMutex_Lock(&perf_jit_map_state.map_lock);
417
0
    if (perf_jit_map_state.perf_map != NULL) {
418
0
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
419
0
        return &perf_jit_map_state;
420
0
    }
421
422
0
    char filename[100];
423
0
    int pid = getpid();
424
425
    /* Create unique filename based on process ID */
426
0
    snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid);
427
428
    /* Create/open the jitdump file with appropriate permissions */
429
0
    const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666);
430
0
    if (fd == -1) {
431
0
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
432
0
        return NULL;  // Failed to create file
433
0
    }
434
435
    /* Get system page size for memory mapping */
436
0
    const long page_size = sysconf(_SC_PAGESIZE);
437
0
    if (page_size == -1) {
438
0
        close(fd);
439
0
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
440
0
        return NULL;  // Failed to get page size
441
0
    }
442
443
#if defined(__APPLE__)
444
    // On macOS, samply uses a preload to find jitdumps and this mmap can be slow.
445
    perf_jit_map_state.mapped_buffer = NULL;
446
#else
447
    /*
448
     * Map the first page of the jitdump file
449
     *
450
     * This memory mapping serves as a signal to perf that this process
451
     * is generating JIT code. Perf scans /proc/.../maps looking for mapped
452
     * files that match the jitdump naming pattern.
453
     *
454
     * The mapping must be PROT_READ | PROT_EXEC to be detected by perf.
455
     */
456
0
    perf_jit_map_state.mapped_buffer = mmap(
457
0
        NULL,                    // Let kernel choose address
458
0
        page_size,               // Map one page
459
0
        PROT_READ | PROT_EXEC,   // Read and execute permissions (required by perf)
460
0
        MAP_PRIVATE,             // Private mapping
461
0
        fd,                      // File descriptor
462
0
        0                        // Offset 0 (first page)
463
0
    );
464
465
0
    if (perf_jit_map_state.mapped_buffer == MAP_FAILED) {
466
0
        perf_jit_map_state.mapped_buffer = NULL;
467
0
        close(fd);
468
0
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
469
0
        return NULL;  // Memory mapping failed
470
0
    }
471
0
    (void)_PyAnnotateMemoryMap(perf_jit_map_state.mapped_buffer, page_size,
472
0
                               "cpython:perf_jit_trampoline");
473
0
#endif
474
475
0
    perf_jit_map_state.mapped_size = page_size;
476
477
    /* Convert file descriptor to FILE* for easier I/O operations */
478
0
    perf_jit_map_state.perf_map = fdopen(fd, "w+");
479
0
    if (perf_jit_map_state.perf_map == NULL) {
480
0
        close(fd);
481
0
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
482
0
        return NULL;  // Failed to create FILE*
483
0
    }
484
485
    /*
486
     * Set up file buffering for better performance
487
     *
488
     * We use a large buffer (2MB) because jitdump files can be written
489
     * frequently during program execution. Buffering reduces system call
490
     * overhead and improves overall performance.
491
     */
492
0
    setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB);
493
494
    /* Write the jitdump file header */
495
0
    perf_map_jit_write_header(pid, perf_jit_map_state.perf_map);
496
497
    /* Initialize code ID counter */
498
0
    perf_jit_map_state.code_id = 0;
499
0
    perf_jit_map_state.build_id_salt =
500
0
        ((uint64_t)pid << 32) ^ (uint64_t)get_current_monotonic_ticks();
501
502
    /* Calculate padding size based on actual unwind info requirements */
503
0
    size_t eh_frame_size = _PyJitUnwind_EhFrameSize(0);
504
0
    size_t unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
505
0
    trampoline_api.code_padding = _Py_SIZE_ROUND_UP(unwind_data_size, 16);
506
0
    trampoline_api.code_alignment = 32;
507
508
0
    PyMutex_Unlock(&perf_jit_map_state.map_lock);
509
0
    return &perf_jit_map_state;
510
0
}
511
512
// =============================================================================
513
//                              MAIN JITDUMP ENTRY WRITING
514
// =============================================================================
515
516
/*
517
 * Write a complete jitdump entry for a code region with a provided name.
518
 *
519
 * This shares the same implementation as the trampoline callback, but
520
 * allows callers that don't have a PyCodeObject to reuse the jitdump
521
 * infrastructure.
522
 */
523
static void perf_map_jit_write_entry_with_name(
524
    void *state,
525
    const void *code_addr,
526
    size_t code_size,
527
    const char *entry,
528
    const char *filename
529
)
530
0
{
531
    /* Initialize jitdump system on first use */
532
0
    void* ret = perf_map_jit_init();
533
0
    if (ret == NULL) {
534
0
        return;  // Initialization failed, silently abort
535
0
    }
536
537
0
    if (entry == NULL) {
538
0
        entry = "";
539
0
    }
540
0
    if (filename == NULL) {
541
0
        filename = "";
542
0
    }
543
544
    /*
545
     * Create formatted function name for perf display
546
     *
547
     * Format: "py::<function_name>:<filename>"
548
     * The "py::" prefix helps identify Python functions in mixed-language
549
     * profiles (e.g., when profiling C extensions alongside Python code).
550
     */
551
0
    size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1;
552
0
    char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size);
553
0
    if (perf_map_entry == NULL) {
554
0
        return;  // Memory allocation failed
555
0
    }
556
0
    snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename);
557
558
0
    const size_t name_length = strlen(perf_map_entry);
559
0
    uword base = (uword)code_addr;
560
0
    uword size = code_size;
561
562
    /*
563
     * Generate DWARF unwinding information
564
     *
565
     * DWARF data is essential for proper stack unwinding during profiling.
566
     * Without it, perf cannot generate accurate call graphs, especially
567
     * in optimized code where frame pointers may be omitted.
568
     */
569
0
    uint8_t buffer[1024];  // Buffer for DWARF data (1KB should be sufficient)
570
0
    size_t eh_frame_size = _PyJitUnwind_BuildEhFrame(
571
0
        buffer, sizeof(buffer), code_addr, code_size, 0);
572
0
    if (eh_frame_size == 0) {
573
0
        PyMem_RawFree(perf_map_entry);
574
0
        return;
575
0
    }
576
577
    /*
578
     * A logical jitdump entry is written as multiple records and also consumes
579
     * a process-global code_id. Serialize the whole sequence so concurrent JIT
580
     * compilation cannot interleave records or reuse an ID.
581
     */
582
0
    PyMutex_Lock(&perf_jit_map_state.map_lock);
583
584
    /*
585
     * Write Code Unwinding Information Event
586
     *
587
     * This event must be written before the code load event to ensure
588
     * perf has the unwinding information available when it processes
589
     * the code region.
590
     */
591
0
    CodeUnwindingInfoEvent ev2;
592
0
    ev2.base.event = PerfUnwindingInfo;
593
0
    ev2.base.time_stamp = get_current_monotonic_ticks();
594
0
    ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
595
596
    /* Verify we don't exceed our padding budget */
597
0
    assert(ev2.unwind_data_size <= (uint64_t)trampoline_api.code_padding);
598
599
0
    ev2.eh_frame_hdr_size = sizeof(EhFrameHeader);
600
0
    ev2.mapped_size = _Py_SIZE_ROUND_UP(ev2.unwind_data_size, 16);  // 16-byte alignment
601
602
    /* Calculate total event size with padding */
603
0
    int content_size = (int)(sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size);
604
0
    int padding_size = (int)_Py_SIZE_ROUND_UP((size_t)content_size, 8) - content_size;  // 8-byte align
605
0
    ev2.base.size = (uint32_t)(content_size + padding_size);
606
607
    /* Write the unwinding info event header */
608
0
    perf_map_jit_write_fully(&ev2, sizeof(ev2));
609
610
    /*
611
     * Write EH Frame Header
612
     *
613
     * The EH frame header provides metadata about the DWARF unwinding
614
     * information that follows. It includes pointers and counts that
615
     * help perf navigate the unwinding data efficiently.
616
     */
617
0
    EhFrameHeader f;
618
0
    f.version = 1;
619
0
    f.eh_frame_ptr_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_pcrel;
620
0
    f.fde_count_enc = DWRF_EH_PE_udata4;
621
0
    f.table_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_datarel;
622
623
    /* Calculate relative offsets for EH frame navigation */
624
0
    f.eh_frame_ptr = -(int32_t)(eh_frame_size + 4 * sizeof(unsigned char));
625
0
    f.eh_fde_count = 1;  // We generate exactly one FDE per function
626
0
    f.from = -(int32_t)(_Py_SIZE_ROUND_UP(code_size, 8) + eh_frame_size);
627
0
    uint32_t cie_payload_size;
628
0
    memcpy(&cie_payload_size, buffer, sizeof(cie_payload_size));
629
0
    int cie_size = (int)(sizeof(cie_payload_size) + cie_payload_size);
630
0
    f.to = -(int32_t)(eh_frame_size - cie_size);
631
632
    /* Write EH frame data and header */
633
0
    perf_map_jit_write_fully(buffer, eh_frame_size);
634
0
    perf_map_jit_write_fully(&f, sizeof(f));
635
636
    /* Write padding to maintain alignment */
637
0
    char padding_bytes[] = "\0\0\0\0\0\0\0\0";
638
0
    perf_map_jit_write_fully(&padding_bytes, padding_size);
639
640
    /*
641
     * Write Code Load Event
642
     *
643
     * This event tells perf about the new code region. It includes:
644
     * - Memory addresses and sizes
645
     * - Process and thread identification
646
     * - Function name for symbol resolution
647
     * - The actual machine code bytes
648
     */
649
0
    CodeLoadEvent ev;
650
0
    ev.base.event = PerfLoad;
651
0
    ev.base.size = sizeof(ev) + (name_length+1) + size;
652
0
    ev.base.time_stamp = get_current_monotonic_ticks();
653
0
    ev.process_id = getpid();
654
#if defined(__APPLE__)
655
    pthread_threadid_np(NULL, &ev.thread_id);
656
#else
657
0
    ev.thread_id = syscall(SYS_gettid);  // Get thread ID via system call
658
0
#endif
659
0
    ev.vma = base;                       // Virtual memory address
660
0
    ev.code_address = base;              // Same as VMA for our use case
661
0
    ev.code_size = size;
662
663
    /* Assign unique code ID and increment counter */
664
0
    perf_jit_map_state.code_id += 1;
665
0
    ev.code_id = perf_jit_map_state.code_id;
666
667
    /* Write code load event and associated data */
668
0
    perf_map_jit_write_fully(&ev, sizeof(ev));
669
0
    perf_map_jit_write_fully(perf_map_entry, name_length+1);  // Include null terminator
670
    /*
671
     * Ensure each synthetic DSO has unique .text bytes.
672
     *
673
     * perf merges DSOs that share a build-id. Since trampolines can share
674
     * identical code and unwind bytes, perf may resolve all JIT frames to
675
     * the first symbol it saw (including entries from previous runs when
676
     * build-id caching is enabled). Patch a small marker in the emitted
677
     * bytes to make the build-id depend on a per-process salt and code id
678
     * without modifying the live code.
679
     */
680
0
    uint64_t marker = perf_jit_map_state.build_id_salt ^
681
0
        ((uint64_t)perf_jit_map_state.code_id << 32) ^
682
0
        (uint64_t)code_size;
683
0
    if (size >= sizeof(marker)) {
684
0
        size_t prefix = size - sizeof(marker);
685
0
        perf_map_jit_write_fully((void *)(base), prefix);
686
0
        perf_map_jit_write_fully(&marker, sizeof(marker));
687
0
    }
688
0
    else if (size > 0) {
689
0
        uint8_t tmp[sizeof(marker)];
690
0
        memcpy(tmp, (void *)(base), size);
691
0
        for (size_t i = 0; i < size; i++) {
692
0
            tmp[i] ^= (uint8_t)(marker >> (i * 8));
693
0
        }
694
0
        perf_map_jit_write_fully(tmp, size);
695
0
    }
696
697
    /* Clean up allocated memory */
698
0
    PyMutex_Unlock(&perf_jit_map_state.map_lock);
699
0
    PyMem_RawFree(perf_map_entry);
700
0
}
701
702
/*
703
 * Write a complete jitdump entry for a Python function
704
 *
705
 * This is the main function called by Python's trampoline system whenever
706
 * a new piece of JIT-compiled code needs to be recorded. It writes both
707
 * the unwinding information and the code load event to the jitdump file.
708
 *
709
 * The function performs these steps:
710
 * 1. Initialize jitdump system if not already done
711
 * 2. Extract function name and filename from Python code object
712
 * 3. Generate DWARF unwinding information
713
 * 4. Write unwinding info event to jitdump file
714
 * 5. Write code load event to jitdump file
715
 *
716
 * Args:
717
 *   state: Jitdump state (currently unused, uses global state)
718
 *   code_addr: Address where the compiled code resides
719
 *   code_size: Size of the compiled code in bytes
720
 *   co: Python code object containing metadata
721
 *
722
 * IMPORTANT: This function signature is part of Python's internal API
723
 * and must not be changed without coordinating with core Python development.
724
 */
725
static void perf_map_jit_write_entry(void *state, const void *code_addr,
                                     size_t code_size, PyCodeObject *co)
{
    /*
     * Trampoline callback: derive the symbol name parts from the code
     * object and delegate to perf_map_jit_write_entry_with_name().
     * A missing code object or missing attribute yields "".
     *
     * NOTE(review): PyUnicode_AsUTF8() can presumably return NULL on
     * failure; the callee maps NULL to "", but whether an exception is left
     * pending here should be confirmed.
     */
    PyObject *qualname = (co != NULL) ? co->co_qualname : NULL;
    PyObject *source_path = (co != NULL) ? co->co_filename : NULL;

    const char *entry = (qualname != NULL) ? PyUnicode_AsUTF8(qualname) : "";
    const char *filename =
        (source_path != NULL) ? PyUnicode_AsUTF8(source_path) : "";

    perf_map_jit_write_entry_with_name(state, code_addr, code_size,
                                       entry, filename);
}
741
742
/*
 * Record a code region in the jitdump under an explicit name, for callers
 * that have no PyCodeObject. Forwards to the shared writer with no state.
 */
void
_PyPerfJit_WriteNamedCode(const void *code_addr, size_t code_size,
                          const char *entry, const char *filename)
{
    void *const no_state = NULL;  // The writer uses the global state instead
    perf_map_jit_write_entry_with_name(no_state, code_addr, code_size,
                                       entry, filename);
}
749
750
// =============================================================================
751
//                              CLEANUP AND FINALIZATION
752
// =============================================================================
753
754
/*
755
 * Finalize and cleanup the perf jitdump system
756
 *
757
 * This function is called when Python is shutting down or when the
758
 * perf trampoline system is being disabled. It ensures all resources
759
 * are properly released and all buffered data is flushed to disk.
760
 *
761
 * Args:
762
 *   state: Jitdump state (currently unused, uses global state)
763
 *
764
 * Returns: 0 on success
765
 *
766
 * IMPORTANT: This function signature is part of Python's internal API
767
 * and must not be changed without coordinating with core Python development.
768
 */
769
0
static int perf_map_jit_fini(void* state) {
770
    /*
771
     * Close jitdump file with proper synchronization
772
     *
773
     * We need to acquire the lock to ensure no other threads are
774
     * writing to the file when we close it. This prevents corruption
775
     * and ensures all data is properly flushed.
776
     */
777
0
    PyMutex_Lock(&perf_jit_map_state.map_lock);
778
0
    if (perf_jit_map_state.perf_map != NULL) {
779
0
        fclose(perf_jit_map_state.perf_map);  // This also flushes buffers
780
0
        perf_jit_map_state.perf_map = NULL;
781
0
    }
782
0
    PyMutex_Unlock(&perf_jit_map_state.map_lock);
783
784
    /*
785
     * Unmap the memory region
786
     *
787
     * This removes the signal to perf that we were generating JIT code.
788
     * After this point, perf will no longer detect this process as
789
     * having JIT capabilities.
790
     */
791
0
    if (perf_jit_map_state.mapped_buffer != NULL) {
792
0
        munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size);
793
0
        perf_jit_map_state.mapped_buffer = NULL;
794
0
    }
795
796
    /* Clear global state reference */
797
0
    trampoline_api.state = NULL;
798
799
0
    return 0;  // Success
800
0
}
801
802
// =============================================================================
803
//                              PUBLIC API EXPORT
804
// =============================================================================
805
806
/*
807
 * Python Perf Callbacks Structure
808
 *
809
 * This structure defines the callback interface that Python's trampoline
810
 * system uses to integrate with perf profiling. It contains function
811
 * pointers for initialization, event writing, and cleanup.
812
 *
813
 * CRITICAL: This structure and its contents are part of Python's internal
814
 * API. The function signatures and behavior must remain stable to maintain
815
 * compatibility with the Python interpreter's perf integration system.
816
 *
817
 * Used by: Python's _PyPerf_Callbacks system in pycore_ceval.h
818
 */
819
/* Callback table for the jitdump backend, in positional order:
 * init, write-entry, fini. */
_PyPerf_Callbacks _Py_perfmap_jit_callbacks = {
    perf_map_jit_init,
    perf_map_jit_write_entry,
    perf_map_jit_fini,
};
824
825
#endif /* PY_HAVE_PERF_TRAMPOLINE */