Coverage Report

Created: 2026-06-21 06:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Python/perf_jit_trampoline.c
Line
Count
Source
1
/*
2
 * Python Perf Trampoline Support - JIT Dump Implementation
3
 *
4
 * This file implements the perf jitdump API for Python's performance profiling
5
 * integration. It allows perf (Linux performance analysis tool) to understand
6
 * and profile dynamically generated Python bytecode by creating JIT dump files
7
 * that perf can inject into its analysis.
8
 *
9
 *
10
 * IMPORTANT: This file exports specific callback functions that are part of
11
 * Python's internal API. Do not modify the function signatures or behavior
12
 * of exported functions without coordinating with the Python core team.
13
 *
14
 * Usually the binary and libraries are mapped in separate region like below:
15
 *
16
 *   address ->
17
 *    --+---------------------+--//--+---------------------+--
18
 *      | .text | .data | ... |      | .text | .data | ... |
19
 *    --+---------------------+--//--+---------------------+--
20
 *          myprog                      libc.so
21
 *
22
 * So it'd be easy and straight-forward to find a mapped binary or library from an
23
 * address.
24
 *
25
 * But for JIT code, the code arena only cares about the code section. But the
26
 * resulting DSOs (which is generated by perf inject -j) contain ELF headers and
27
 * unwind info too. Then it'd generate following address space with synthesized
28
 * MMAP events. Let's say it has a sample between address B and C.
29
 *
30
 *                                                sample
31
 *                                                  |
32
 *   address ->                         A       B   v   C
33
 *   ---------------------------------------------------------------------------------------------------
34
 *   /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
35
 *   /tmp/jitted-PID-1.so           | (headers) | .text | unwind info |
36
 *   /tmp/jitted-PID-2.so                   | (headers) | .text | unwind info |
37
 *     ...
38
 *   ---------------------------------------------------------------------------------------------------
39
 *
40
 * If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see
41
 * the unwind info. If it maps both .text section and unwind sections, the sample
42
 * could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing
43
 * which one is right. So to make perf happy we have non-overlapping ranges for each
44
 * DSO:
45
 *
46
 *   address ->
47
 *   -------------------------------------------------------------------------------------------------------
48
 *   /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
49
 *   /tmp/jitted-PID-1.so                         | (headers) | .text | unwind info |
50
 *   /tmp/jitted-PID-2.so                                               | (headers) | .text | unwind info |
51
 *     ...
52
 *   -------------------------------------------------------------------------------------------------------
53
 *
54
 * As the trampolines are constant, we add a constant padding but in general the padding needs to have the
55
 * size of the unwind info rounded to 16 bytes. For our trampolines this is typically 0x50.
56
 */
57
58
59
60
#include "Python.h"
61
#include "pycore_ceval.h"         // _PyPerf_Callbacks
62
#include "pycore_frame.h"
63
#include "pycore_interp.h"
64
#include "pycore_mmap.h"          // _PyAnnotateMemoryMap()
65
#include "pycore_jit_unwind.h"
66
#include "pycore_runtime.h"       // _PyRuntime
67
68
#ifdef PY_HAVE_PERF_TRAMPOLINE
69
70
/* Standard library includes for perf jitdump implementation */
71
#if defined(__linux__)
72
#  include <elf.h>                // ELF architecture constants
73
#endif
74
#include <fcntl.h>                // File control operations
75
#include <stdio.h>                // Standard I/O operations
76
#include <stdlib.h>               // Standard library functions
77
#include <string.h>               // memcpy, strlen
78
#include <sys/mman.h>             // Memory mapping functions (mmap)
79
#include <sys/types.h>            // System data types
80
#include <unistd.h>               // System calls (sysconf, getpid)
81
#include <sys/time.h>             // Time functions (gettimeofday)
82
#if defined(__linux__)
83
#  include <sys/syscall.h>        // System call interface
84
#endif
85
#if defined(__APPLE__)
86
#  include <mach/mach_time.h>     // mach_absolute_time, mach_timebase_info
87
#endif
88
89
// =============================================================================
90
//                           CONSTANTS AND CONFIGURATION
91
// =============================================================================
92
93
/*
94
 * Memory layout considerations for perf jitdump:
95
 *
96
 * Perf expects non-overlapping memory regions for each JIT-compiled function.
97
 * When perf processes the jitdump file, it creates synthetic DSO (Dynamic
98
 * Shared Object) files that contain:
99
 * - ELF headers
100
 * - .text section (actual machine code)
101
 * - Unwind information (for stack traces)
102
 *
103
 * To ensure proper address space layout, we add padding between code regions.
104
 * This prevents address conflicts when perf maps the synthesized DSOs.
105
 *
106
 * Memory layout example:
107
 * /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding]
108
 * /tmp/jitted-PID-1.so:                                       [headers][.text][unwind_info][padding]
109
 *
110
 * The padding size is now calculated automatically during initialization
111
 * based on the actual unwind information requirements.
112
 */
113
114
115
/* These constants are defined in <elf.h>, which is only available on Linux. */
116
#if !defined(__linux__)
117
#  if defined(__i386__) || defined(_M_IX86)
118
#    define EM_386      3
119
#  elif defined(__arm__) || defined(_M_ARM)
120
#    define EM_ARM      40
121
#  elif defined(__x86_64__) || defined(_M_X64)
122
#    define EM_X86_64   62
123
#  elif defined(__aarch64__)
124
#    define EM_AARCH64  183
125
#  elif defined(__riscv)
126
#    define EM_RISCV    243
127
#  endif
128
#endif
129
130
/* Convenient access to the global trampoline API state */
131
0
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api
132
133
/* Type aliases for clarity and portability */
134
typedef uint64_t uword;            /* 64-bit unsigned value; holds code addresses and sizes */
typedef const char *CodeComments;  /* Alias for a pointer to read-only character data */
136
137
/* Memory size constants */
138
0
#define MB (1024 * 1024)                   // 1 Megabyte for buffer sizing
139
140
// =============================================================================
141
//                        ARCHITECTURE-SPECIFIC DEFINITIONS
142
// =============================================================================
143
144
/*
145
 * Returns the ELF machine architecture constant for the current platform.
146
 * This is required for the jitdump header to correctly identify the target
147
 * architecture for perf processing.
148
 *
149
 */
150
0
static uint64_t GetElfMachineArchitecture(void) {
    /*
     * Exactly one preprocessor branch survives compilation. The selected
     * EM_* value is what the jitdump header stores as its target machine.
     */
    uint64_t machine;
#if defined(__x86_64__) || defined(_M_X64)
    machine = EM_X86_64;
#elif defined(__i386__) || defined(_M_IX86)
    machine = EM_386;
#elif defined(__aarch64__)
    machine = EM_AARCH64;
#elif defined(__arm__) || defined(_M_ARM)
    machine = EM_ARM;
#elif defined(__riscv)
    machine = EM_RISCV;
#else
    Py_UNREACHABLE();  // No EM_* constant is known for this architecture
    machine = 0;
#endif
    return machine;
}
166
167
// =============================================================================
168
//                           PERF JITDUMP DATA STRUCTURES
169
// =============================================================================
170
171
/*
172
 * Perf jitdump file format structures
173
 *
174
 * These structures define the binary format that perf expects for JIT dump files.
175
 * The format is documented in the Linux perf tools source code and must match
176
 * exactly for proper perf integration.
177
 */
178
179
/*
180
 * Jitdump file header - written once at the beginning of each jitdump file
181
 * Contains metadata about the process and jitdump format version
182
 */
183
typedef struct {
    uint32_t magic;            /* File magic: 0x4A695444 ("JiTD") */
    uint32_t version;          /* Jitdump format version; this file writes 1 */
    uint32_t size;             /* Size in bytes of this header structure */
    uint32_t elf_mach_target;  /* ELF machine constant of the host architecture */
    uint32_t reserved;         /* Always written as 0 */
    uint32_t process_id;       /* PID of the process producing the dump */
    uint64_t time_stamp;       /* Time at which the dump was created */
    uint64_t flags;            /* Feature flags; always written as 0 */
} Header;
193
194
/*
195
 * Perf event types supported by the jitdump format
196
 * Each event type has a corresponding structure format
197
 */
198
enum PerfEvent {
    PerfLoad = 0,          /* A new code region was loaded */
    PerfMove = 1,          /* A code region was relocated */
    PerfDebugInfo = 2,     /* Debug information record */
    PerfClose = 3,         /* End of the JIT session */
    PerfUnwindingInfo = 4  /* Unwind data for the code region that follows */
};
205
206
/*
207
 * Base event structure - common header for all perf events
208
 * Every event in the jitdump file starts with this structure
209
 */
210
struct BaseEvent {
    uint32_t event;       /* Record type; one of the PerfEvent values */
    uint32_t size;        /* Size in bytes of the whole record, payload included */
    uint64_t time_stamp;  /* Time at which the record was produced */
};
215
216
/*
217
 * Code load event - indicates a new JIT-compiled function is available
218
 * This is the most important event type for Python profiling
219
 */
220
typedef struct {
221
    struct BaseEvent base;   // Common event header
222
    uint32_t process_id;     // Process ID where code was generated
223
    uint32_t thread_id;      // Thread ID where code was generated
224
    uint64_t vma;            // Virtual memory address where code is loaded
225
    uint64_t code_address;   // Address of the actual machine code
226
    uint64_t code_size;      // Size of the machine code in bytes
227
    uint64_t code_id;        // Unique identifier for this code region
228
    /* Followed by:
229
     * - null-terminated function name string
230
     * - raw machine code bytes
231
     */
232
} CodeLoadEvent;
233
234
/*
235
 * Code unwinding information event - provides DWARF data for stack traces
236
 * Essential for proper stack unwinding during profiling
237
 */
238
typedef struct {
239
    struct BaseEvent base;      // Common event header
240
    uint64_t unwind_data_size;  // Size of the unwinding data
241
    uint64_t eh_frame_hdr_size; // Size of the EH frame header
242
    uint64_t mapped_size;       // Total mapped size (with padding)
243
    /* Followed by:
244
     * - EH frame header
245
     * - DWARF unwinding information
246
     * - Padding to alignment boundary
247
     */
248
} CodeUnwindingInfoEvent;
249
250
/*
251
 * EH Frame Header structure for DWARF unwinding
252
 *
253
 * This header provides metadata about the .eh_frame data that follows.
254
 * It uses PC-relative and data-relative encodings to keep the synthesized
255
 * DSO self-contained when perf injects it.
256
 */
257
typedef struct __attribute__((packed)) {
    uint8_t version;           /* Header format version */
    uint8_t eh_frame_ptr_enc;  /* Pointer encoding used by eh_frame_ptr */
    uint8_t fde_count_enc;     /* Encoding used by eh_fde_count */
    uint8_t table_enc;         /* Encoding used by the lookup-table entries */
    int32_t eh_frame_ptr;      /* Encoded offset to the .eh_frame data */
    uint32_t eh_fde_count;     /* Number of entries in the lookup table */
    int32_t from;              /* Table entry: encoded start location of the code */
    int32_t to;                /* Table entry: encoded location of the matching FDE */
} EhFrameHeader;

/* The structure is written to the file verbatim, so its size must not drift. */
_Static_assert(sizeof(EhFrameHeader) == 20, "EhFrameHeader layout mismatch");
268
269
// =============================================================================
270
//                              GLOBAL STATE MANAGEMENT
271
// =============================================================================
272
273
/*
274
 * Global state for the perf jitdump implementation
275
 *
276
 * This structure maintains all the state needed for generating jitdump files.
277
 * It's designed as a singleton since there's typically only one jitdump file
278
 * per Python process.
279
 */
280
typedef struct {
281
    FILE* perf_map;          // File handle for the jitdump file
282
    PyMutex map_lock;        // Thread synchronization lock
283
    void* mapped_buffer;     // Memory-mapped region (signals perf we're active)
284
    size_t mapped_size;      // Size of the mapped region
285
    uint32_t code_id;        // Counter for unique code region identifiers
286
    uint64_t build_id_salt;  // Per-process salt for unique synthetic DSOs
287
} PerfMapJitState;
288
289
/* Global singleton instance */
290
static PerfMapJitState perf_jit_map_state;
291
292
// =============================================================================
293
//                              TIME UTILITIES
294
// =============================================================================
295
296
/* Time conversion constant */
297
#if !defined(__APPLE__)
/* Scale factor applied to timespec seconds when converting to nanoseconds. */
static const intptr_t nanoseconds_per_second = 1000000000;
#endif
300
301
/*
302
 * Get current monotonic time in nanoseconds
303
 *
304
 * Monotonic time is preferred for event timestamps because it's not affected
305
 * by system clock adjustments. This ensures consistent timing relationships
306
 * between events even if the system clock is changed.
307
 *
308
 * Returns: Current monotonic time in nanoseconds since an arbitrary epoch
309
 */
310
0
static int64_t get_current_monotonic_ticks(void) {
311
#if defined(__APPLE__)
312
    // On macOS the jitdump file is consumed by profilers (such as samply) that
313
    // timestamp their samples using mach_absolute_time(). The jitdump event
314
    // timestamps must use the same clock domain, otherwise the JIT code
315
    // mappings cannot be lined up with the samples.
316
    static mach_timebase_info_data_t timebase = {0, 0};
317
    if (timebase.denom == 0) {
318
        (void)mach_timebase_info(&timebase);
319
    }
320
    uint64_t ticks = mach_absolute_time();
321
    return (int64_t)(ticks * timebase.numer / timebase.denom);
322
#else
323
0
    struct timespec ts;
324
0
    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
325
0
        Py_UNREACHABLE();  // Should never fail on supported systems
326
0
        return 0;
327
0
    }
328
329
    /* Convert to nanoseconds for maximum precision */
330
0
    int64_t result = ts.tv_sec;
331
0
    result *= nanoseconds_per_second;
332
0
    result += ts.tv_nsec;
333
0
    return result;
334
0
#endif
335
0
}
336
337
/*
338
 * Get current wall clock time in microseconds
339
 *
340
 * Used for the jitdump file header timestamp. Unlike monotonic time,
341
 * this represents actual wall clock time that can be correlated with
342
 * other system events.
343
 *
344
 * Returns: Current time in microseconds since Unix epoch
345
 */
346
0
static int64_t get_current_time_microseconds(void) {
    struct timeval now;
    if (gettimeofday(&now, NULL) < 0) {
        Py_UNREACHABLE();  // Not expected to fail on supported systems
        return 0;
    }
    /* Widen before multiplying so the seconds term is computed in 64 bits. */
    const int64_t seconds_as_us = (int64_t)(now.tv_sec) * 1000000;
    return seconds_as_us + now.tv_usec;
}
354
355
// =============================================================================
356
//                              FILE I/O UTILITIES
357
// =============================================================================
358
359
/*
360
 * Write data to the jitdump file with error handling
361
 *
362
 * This function ensures that all data is written to the file, handling
363
 * partial writes that can occur with large buffers or when the system
364
 * is under load.
365
 *
366
 * Args:
367
 *   buffer: Pointer to data to write
368
 *   size: Number of bytes to write
369
 */
370
0
static void perf_map_jit_write_fully(const void* buffer, size_t size) {
371
0
    FILE* out_file = perf_jit_map_state.perf_map;
372
0
    const char* ptr = (const char*)(buffer);
373
374
0
    while (size > 0) {
375
0
        const size_t written = fwrite(ptr, 1, size, out_file);
376
0
        if (written == 0) {
377
0
            Py_UNREACHABLE();  // Write failure - should be very rare
378
0
            break;
379
0
        }
380
0
        size -= written;
381
0
        ptr += written;
382
0
    }
383
0
}
384
385
/*
386
 * Write the jitdump file header
387
 *
388
 * The header must be written exactly once at the beginning of each jitdump
389
 * file. It provides metadata that perf uses to parse the rest of the file.
390
 *
391
 * Args:
392
 *   pid: Process ID to include in the header
393
 *   out_file: File handle to write to (currently unused, uses global state)
394
 */
395
0
static void perf_map_jit_write_header(int pid, FILE* out_file) {
396
0
    Header header;
397
398
    /* Initialize header with required values */
399
0
    header.magic = 0x4A695444;                    // "JiTD" magic number
400
0
    header.version = 1;                           // Current jitdump version
401
0
    header.size = sizeof(Header);                 // Header size for validation
402
0
    header.elf_mach_target = GetElfMachineArchitecture();  // Target architecture
403
0
    header.reserved = 0;                          // padding reserved for future use
404
0
    header.process_id = pid;                      // Process identifier
405
0
    header.time_stamp = get_current_time_microseconds();   // Creation time
406
0
    header.flags = 0;                             // No special flags currently used
407
408
0
    perf_map_jit_write_fully(&header, sizeof(header));
409
0
}
410
411
// =============================================================================
412
//                              JITDUMP INITIALIZATION
413
// =============================================================================
414
415
/*
416
 * Initialize the perf jitdump interface
417
 *
418
 * This function sets up everything needed to generate jitdump files:
419
 * 1. Creates the jitdump file with a unique name
420
 * 2. Maps the first page to signal perf that we're using the interface
421
 * 3. Writes the jitdump header
422
 * 4. Initializes synchronization primitives
423
 *
424
 * The memory mapping is crucial - perf detects jitdump files by scanning
425
 * for processes that have mapped files matching the pattern /tmp/jit-*.dump
426
 *
427
 * Returns: Pointer to initialized state, or NULL on failure
428
 */
429
0
static void* perf_map_jit_init(void) {
430
0
    PyMutex_Lock(&perf_jit_map_state.map_lock);
431
0
    if (perf_jit_map_state.perf_map != NULL) {
432
0
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
433
0
        return &perf_jit_map_state;
434
0
    }
435
436
0
    char filename[100];
437
0
    int pid = getpid();
438
439
    /* Create unique filename based on process ID */
440
0
    snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid);
441
442
    /* Create/open the jitdump file with appropriate permissions */
443
0
    const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666);
444
0
    if (fd == -1) {
445
0
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
446
0
        return NULL;  // Failed to create file
447
0
    }
448
449
    /* Get system page size for memory mapping */
450
0
    const long page_size = sysconf(_SC_PAGESIZE);
451
0
    if (page_size == -1) {
452
0
        close(fd);
453
0
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
454
0
        return NULL;  // Failed to get page size
455
0
    }
456
457
#if defined(__APPLE__)
458
    // On macOS, samply uses a preload to find jitdumps and this mmap can be slow.
459
    perf_jit_map_state.mapped_buffer = NULL;
460
#else
461
    /*
462
     * Map the first page of the jitdump file
463
     *
464
     * This memory mapping serves as a signal to perf that this process
465
     * is generating JIT code. Perf scans /proc/.../maps looking for mapped
466
     * files that match the jitdump naming pattern.
467
     *
468
     * The mapping must be PROT_READ | PROT_EXEC to be detected by perf.
469
     */
470
0
    perf_jit_map_state.mapped_buffer = mmap(
471
0
        NULL,                    // Let kernel choose address
472
0
        page_size,               // Map one page
473
0
        PROT_READ | PROT_EXEC,   // Read and execute permissions (required by perf)
474
0
        MAP_PRIVATE,             // Private mapping
475
0
        fd,                      // File descriptor
476
0
        0                        // Offset 0 (first page)
477
0
    );
478
479
0
    if (perf_jit_map_state.mapped_buffer == MAP_FAILED) {
480
0
        perf_jit_map_state.mapped_buffer = NULL;
481
0
        close(fd);
482
0
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
483
0
        return NULL;  // Memory mapping failed
484
0
    }
485
0
    (void)_PyAnnotateMemoryMap(perf_jit_map_state.mapped_buffer, page_size,
486
0
                               "cpython:perf_jit_trampoline");
487
0
#endif
488
489
0
    perf_jit_map_state.mapped_size = page_size;
490
491
    /* Convert file descriptor to FILE* for easier I/O operations */
492
0
    perf_jit_map_state.perf_map = fdopen(fd, "w+");
493
0
    if (perf_jit_map_state.perf_map == NULL) {
494
0
        close(fd);
495
0
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
496
0
        return NULL;  // Failed to create FILE*
497
0
    }
498
499
    /*
500
     * Set up file buffering for better performance
501
     *
502
     * We use a large buffer (2MB) because jitdump files can be written
503
     * frequently during program execution. Buffering reduces system call
504
     * overhead and improves overall performance.
505
     */
506
0
    setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB);
507
508
    /* Write the jitdump file header */
509
0
    perf_map_jit_write_header(pid, perf_jit_map_state.perf_map);
510
511
    /* Initialize code ID counter */
512
0
    perf_jit_map_state.code_id = 0;
513
0
    perf_jit_map_state.build_id_salt =
514
0
        ((uint64_t)pid << 32) ^ (uint64_t)get_current_monotonic_ticks();
515
516
    /* Calculate padding size based on actual unwind info requirements */
517
0
    size_t eh_frame_size = _PyJitUnwind_EhFrameSize(0);
518
0
    size_t unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
519
0
    trampoline_api.code_padding = _Py_SIZE_ROUND_UP(unwind_data_size, 16);
520
0
    trampoline_api.code_alignment = 32;
521
522
0
    PyMutex_Unlock(&perf_jit_map_state.map_lock);
523
0
    return &perf_jit_map_state;
524
0
}
525
526
// =============================================================================
527
//                              MAIN JITDUMP ENTRY WRITING
528
// =============================================================================
529
530
/*
531
 * Write a complete jitdump entry for a code region with a provided name.
532
 *
533
 * This shares the same implementation as the trampoline callback, but
534
 * allows callers that don't have a PyCodeObject to reuse the jitdump
535
 * infrastructure.
536
 */
537
static void perf_map_jit_write_entry_with_name(
538
    void *state,
539
    const void *code_addr,
540
    size_t code_size,
541
    const char *entry,
542
    const char *filename
543
)
544
0
{
545
    /* Initialize jitdump system on first use */
546
0
    void* ret = perf_map_jit_init();
547
0
    if (ret == NULL) {
548
0
        return;  // Initialization failed, silently abort
549
0
    }
550
551
0
    if (entry == NULL) {
552
0
        entry = "";
553
0
    }
554
0
    if (filename == NULL) {
555
0
        filename = "";
556
0
    }
557
558
    /*
559
     * Create formatted function name for perf display
560
     *
561
     * Format: "py::<function_name>:<filename>"
562
     * The "py::" prefix helps identify Python functions in mixed-language
563
     * profiles (e.g., when profiling C extensions alongside Python code).
564
     */
565
0
    size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1;
566
0
    char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size);
567
0
    if (perf_map_entry == NULL) {
568
0
        return;  // Memory allocation failed
569
0
    }
570
0
    snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename);
571
572
0
    const size_t name_length = strlen(perf_map_entry);
573
0
    uword base = (uword)code_addr;
574
0
    uword size = code_size;
575
576
    /*
577
     * Generate DWARF unwinding information
578
     *
579
     * DWARF data is essential for proper stack unwinding during profiling.
580
     * Without it, perf cannot generate accurate call graphs, especially
581
     * in optimized code where frame pointers may be omitted.
582
     */
583
0
    uint8_t buffer[1024];  // Buffer for DWARF data (1KB should be sufficient)
584
0
    size_t eh_frame_size = _PyJitUnwind_BuildEhFrame(
585
0
        buffer, sizeof(buffer), code_addr, code_size, 0);
586
0
    if (eh_frame_size == 0) {
587
0
        PyMem_RawFree(perf_map_entry);
588
0
        return;
589
0
    }
590
591
    /*
592
     * A logical jitdump entry is written as multiple records and also consumes
593
     * a process-global code_id. Serialize the whole sequence so concurrent JIT
594
     * compilation cannot interleave records or reuse an ID.
595
     */
596
0
    PyMutex_Lock(&perf_jit_map_state.map_lock);
597
598
    /*
599
     * Write Code Unwinding Information Event
600
     *
601
     * This event must be written before the code load event to ensure
602
     * perf has the unwinding information available when it processes
603
     * the code region.
604
     */
605
0
    CodeUnwindingInfoEvent ev2;
606
0
    ev2.base.event = PerfUnwindingInfo;
607
0
    ev2.base.time_stamp = get_current_monotonic_ticks();
608
0
    ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
609
610
    /* Verify we don't exceed our padding budget */
611
0
    assert(ev2.unwind_data_size <= (uint64_t)trampoline_api.code_padding);
612
613
0
    ev2.eh_frame_hdr_size = sizeof(EhFrameHeader);
614
0
    ev2.mapped_size = _Py_SIZE_ROUND_UP(ev2.unwind_data_size, 16);  // 16-byte alignment
615
616
    /* Calculate total event size with padding */
617
0
    int content_size = (int)(sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size);
618
0
    int padding_size = (int)_Py_SIZE_ROUND_UP((size_t)content_size, 8) - content_size;  // 8-byte align
619
0
    ev2.base.size = (uint32_t)(content_size + padding_size);
620
621
    /* Write the unwinding info event header */
622
0
    perf_map_jit_write_fully(&ev2, sizeof(ev2));
623
624
    /*
625
     * Write EH Frame Header
626
     *
627
     * The EH frame header provides metadata about the DWARF unwinding
628
     * information that follows. It includes pointers and counts that
629
     * help perf navigate the unwinding data efficiently.
630
     */
631
0
    EhFrameHeader f;
632
0
    f.version = 1;
633
0
    f.eh_frame_ptr_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_pcrel;
634
0
    f.fde_count_enc = DWRF_EH_PE_udata4;
635
0
    f.table_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_datarel;
636
637
    /* Calculate relative offsets for EH frame navigation */
638
0
    f.eh_frame_ptr = -(int32_t)(eh_frame_size + 4 * sizeof(unsigned char));
639
0
    f.eh_fde_count = 1;  // We generate exactly one FDE per function
640
0
    f.from = -(int32_t)(_Py_SIZE_ROUND_UP(code_size, 8) + eh_frame_size);
641
0
    uint32_t cie_payload_size;
642
0
    memcpy(&cie_payload_size, buffer, sizeof(cie_payload_size));
643
0
    int cie_size = (int)(sizeof(cie_payload_size) + cie_payload_size);
644
0
    f.to = -(int32_t)(eh_frame_size - cie_size);
645
646
    /* Write EH frame data and header */
647
0
    perf_map_jit_write_fully(buffer, eh_frame_size);
648
0
    perf_map_jit_write_fully(&f, sizeof(f));
649
650
    /* Write padding to maintain alignment */
651
0
    char padding_bytes[] = "\0\0\0\0\0\0\0\0";
652
0
    perf_map_jit_write_fully(&padding_bytes, padding_size);
653
654
    /*
655
     * Write Code Load Event
656
     *
657
     * This event tells perf about the new code region. It includes:
658
     * - Memory addresses and sizes
659
     * - Process and thread identification
660
     * - Function name for symbol resolution
661
     * - The actual machine code bytes
662
     */
663
0
    CodeLoadEvent ev;
664
0
    ev.base.event = PerfLoad;
665
0
    ev.base.size = sizeof(ev) + (name_length+1) + size;
666
0
    ev.base.time_stamp = get_current_monotonic_ticks();
667
0
    ev.process_id = getpid();
668
#if defined(__APPLE__)
669
    // The jitdump format defines the thread id field as a 32-bit value, but
670
    // pthread_threadid_np() returns a 64-bit id. Truncate it to 32 bits to
671
    // keep the record layout identical to other platforms.
672
    uint64_t thread_id = 0;
673
    pthread_threadid_np(NULL, &thread_id);
674
    ev.thread_id = (uint32_t)thread_id;
675
#else
676
0
    ev.thread_id = syscall(SYS_gettid);  // Get thread ID via system call
677
0
#endif
678
0
    ev.vma = base;                       // Virtual memory address
679
0
    ev.code_address = base;              // Same as VMA for our use case
680
0
    ev.code_size = size;
681
682
    /* Assign unique code ID and increment counter */
683
0
    perf_jit_map_state.code_id += 1;
684
0
    ev.code_id = perf_jit_map_state.code_id;
685
686
    /* Write code load event and associated data */
687
0
    perf_map_jit_write_fully(&ev, sizeof(ev));
688
0
    perf_map_jit_write_fully(perf_map_entry, name_length+1);  // Include null terminator
689
    /*
690
     * Ensure each synthetic DSO has unique .text bytes.
691
     *
692
     * perf merges DSOs that share a build-id. Since trampolines can share
693
     * identical code and unwind bytes, perf may resolve all JIT frames to
694
     * the first symbol it saw (including entries from previous runs when
695
     * build-id caching is enabled). Patch a small marker in the emitted
696
     * bytes to make the build-id depend on a per-process salt and code id
697
     * without modifying the live code.
698
     */
699
0
    uint64_t marker = perf_jit_map_state.build_id_salt ^
700
0
        ((uint64_t)perf_jit_map_state.code_id << 32) ^
701
0
        (uint64_t)code_size;
702
0
    if (size >= sizeof(marker)) {
703
0
        size_t prefix = size - sizeof(marker);
704
0
        perf_map_jit_write_fully((void *)(base), prefix);
705
0
        perf_map_jit_write_fully(&marker, sizeof(marker));
706
0
    }
707
0
    else if (size > 0) {
708
0
        uint8_t tmp[sizeof(marker)];
709
0
        memcpy(tmp, (void *)(base), size);
710
0
        for (size_t i = 0; i < size; i++) {
711
0
            tmp[i] ^= (uint8_t)(marker >> (i * 8));
712
0
        }
713
0
        perf_map_jit_write_fully(tmp, size);
714
0
    }
715
716
    /* Clean up allocated memory */
717
0
    PyMutex_Unlock(&perf_jit_map_state.map_lock);
718
0
    PyMem_RawFree(perf_map_entry);
719
0
}
720
721
/*
722
 * Write a complete jitdump entry for a Python function
723
 *
724
 * This is the main function called by Python's trampoline system whenever
725
 * a new piece of JIT-compiled code needs to be recorded. It writes both
726
 * the unwinding information and the code load event to the jitdump file.
727
 *
728
 * The function performs these steps:
729
 * 1. Initialize jitdump system if not already done
730
 * 2. Extract function name and filename from Python code object
731
 * 3. Generate DWARF unwinding information
732
 * 4. Write unwinding info event to jitdump file
733
 * 5. Write code load event to jitdump file
734
 *
735
 * Args:
736
 *   state: Jitdump state (currently unused, uses global state)
737
 *   code_addr: Address where the compiled code resides
738
 *   code_size: Size of the compiled code in bytes
739
 *   co: Python code object containing metadata
740
 *
741
 * IMPORTANT: This function signature is part of Python's internal API
742
 * and must not be changed without coordinating with core Python development.
743
 */
744
static void perf_map_jit_write_entry(void *state, const void *code_addr,
745
                                     size_t code_size, PyCodeObject *co)
746
0
{
747
0
    const char *entry = "";
748
0
    const char *filename = "";
749
0
    if (co != NULL) {
750
0
        if (co->co_qualname != NULL) {
751
0
            entry = PyUnicode_AsUTF8(co->co_qualname);
752
0
        }
753
0
        if (co->co_filename != NULL) {
754
0
            filename = PyUnicode_AsUTF8(co->co_filename);
755
0
        }
756
0
    }
757
0
    perf_map_jit_write_entry_with_name(state, code_addr, code_size,
758
0
                                       entry, filename);
759
0
}
760
761
/*
 * Record a code region under an explicitly supplied name
 *
 * Entry point for callers that already have the name and filename as C
 * strings instead of a Python code object. The request is forwarded to
 * perf_map_jit_write_entry_with_name() with a NULL state.
 *
 * Args:
 *   code_addr: Address where the code resides
 *   code_size: Size of the code in bytes
 *   entry: Name to record for the region; NULL is treated as ""
 *   filename: Filename to record for the region; NULL is treated as ""
 */
void
_PyPerfJit_WriteNamedCode(const void *code_addr, size_t code_size,
                          const char *entry, const char *filename)
{
    /*
     * perf_map_jit_write_entry() always passes non-NULL strings (it
     * defaults to ""); normalize here so both entry points give the
     * callee the same guarantee.
     */
    perf_map_jit_write_entry_with_name(
        NULL, code_addr, code_size,
        entry != NULL ? entry : "",
        filename != NULL ? filename : "");
}
768
769
// =============================================================================
770
//                              CLEANUP AND FINALIZATION
771
// =============================================================================
772
773
/*
774
 * Finalize and cleanup the perf jitdump system
775
 *
776
 * This function is called when Python is shutting down or when the
777
 * perf trampoline system is being disabled. It ensures all resources
778
 * are properly released and all buffered data is flushed to disk.
779
 *
780
 * Args:
781
 *   state: Jitdump state (currently unused, uses global state)
782
 *
783
 * Returns: 0 on success
784
 *
785
 * IMPORTANT: This function signature is part of Python's internal API
786
 * and must not be changed without coordinating with core Python development.
787
 */
788
0
static int perf_map_jit_fini(void* state) {
789
    /*
790
     * Close jitdump file with proper synchronization
791
     *
792
     * We need to acquire the lock to ensure no other threads are
793
     * writing to the file when we close it. This prevents corruption
794
     * and ensures all data is properly flushed.
795
     */
796
0
    PyMutex_Lock(&perf_jit_map_state.map_lock);
797
0
    if (perf_jit_map_state.perf_map != NULL) {
798
0
        fclose(perf_jit_map_state.perf_map);  // This also flushes buffers
799
0
        perf_jit_map_state.perf_map = NULL;
800
0
    }
801
0
    PyMutex_Unlock(&perf_jit_map_state.map_lock);
802
803
    /*
804
     * Unmap the memory region
805
     *
806
     * This removes the signal to perf that we were generating JIT code.
807
     * After this point, perf will no longer detect this process as
808
     * having JIT capabilities.
809
     */
810
0
    if (perf_jit_map_state.mapped_buffer != NULL) {
811
0
        munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size);
812
0
        perf_jit_map_state.mapped_buffer = NULL;
813
0
    }
814
815
    /* Clear global state reference */
816
0
    trampoline_api.state = NULL;
817
818
0
    return 0;  // Success
819
0
}
820
821
// =============================================================================
822
//                              PUBLIC API EXPORT
823
// =============================================================================
824
825
/*
826
 * Python Perf Callbacks Structure
827
 *
828
 * This structure defines the callback interface that Python's trampoline
829
 * system uses to integrate with perf profiling. It contains function
830
 * pointers for initialization, event writing, and cleanup.
831
 *
832
 * CRITICAL: This structure and its contents are part of Python's internal
833
 * API. The function signatures and behavior must remain stable to maintain
834
 * compatibility with the Python interpreter's perf integration system.
835
 *
836
 * Used by: Python's _PyPerf_Callbacks system in pycore_ceval.h
837
 */
838
_PyPerf_Callbacks _Py_perfmap_jit_callbacks = {
839
    &perf_map_jit_init,        // Initialization function
840
    &perf_map_jit_write_entry, // Event writing function
841
    &perf_map_jit_fini,        // Cleanup function
842
};
843
844
#endif /* PY_HAVE_PERF_TRAMPOLINE */