Coverage Report

Created: 2026-02-26 06:53

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Python/perf_jit_trampoline.c
Line
Count
Source
1
/*
2
 * Python Perf Trampoline Support - JIT Dump Implementation
3
 *
4
 * This file implements the perf jitdump API for Python's performance profiling
5
 * integration. It allows perf (Linux performance analysis tool) to understand
6
 * and profile dynamically generated Python bytecode by creating JIT dump files
7
 * that perf can inject into its analysis.
8
 *
9
 *
10
 * IMPORTANT: This file exports specific callback functions that are part of
11
 * Python's internal API. Do not modify the function signatures or behavior
12
 * of exported functions without coordinating with the Python core team.
13
 *
14
 * Usually the binary and libraries are mapped in separate region like below:
15
 *
16
 *   address ->
17
 *    --+---------------------+--//--+---------------------+--
18
 *      | .text | .data | ... |      | .text | .data | ... |
19
 *    --+---------------------+--//--+---------------------+--
20
 *          myprog                      libc.so
21
 *
22
 * So it'd be easy and straight-forward to find a mapped binary or library from an
23
 * address.
24
 *
25
 * But for JIT code, the code arena only cares about the code section. But the
26
 * resulting DSOs (which are generated by perf inject -j) contain ELF headers and
27
 * unwind info too. Then it'd generate following address space with synthesized
28
 * MMAP events. Let's say it has a sample between address B and C.
29
 *
30
 *                                                sample
31
 *                                                  |
32
 *   address ->                         A       B   v   C
33
 *   ---------------------------------------------------------------------------------------------------
34
 *   /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
35
 *   /tmp/jitted-PID-1.so           | (headers) | .text | unwind info |
36
 *   /tmp/jitted-PID-2.so                   | (headers) | .text | unwind info |
37
 *     ...
38
 *   ---------------------------------------------------------------------------------------------------
39
 *
40
 * If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see
41
 * the unwind info. If it maps both .text section and unwind sections, the sample
42
 * could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing
43
 * which one is right. So to make perf happy we have non-overlapping ranges for each
44
 * DSO:
45
 *
46
 *   address ->
47
 *   -------------------------------------------------------------------------------------------------------
48
 *   /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
49
 *   /tmp/jitted-PID-1.so                         | (headers) | .text | unwind info |
50
 *   /tmp/jitted-PID-2.so                                               | (headers) | .text | unwind info |
51
 *     ...
52
 *   -------------------------------------------------------------------------------------------------------
53
 *
54
 * As the trampolines are constant, we add a constant padding but in general the padding needs to have the
55
 * size of the unwind info rounded up to a multiple of 16 bytes. For our trampolines this is 0x50.
56
 */
57
58
59
60
#include "Python.h"
61
#include "pycore_ceval.h"         // _PyPerf_Callbacks
62
#include "pycore_frame.h"
63
#include "pycore_interp.h"
64
#include "pycore_mmap.h"          // _PyAnnotateMemoryMap()
65
#include "pycore_runtime.h"       // _PyRuntime
66
67
#ifdef PY_HAVE_PERF_TRAMPOLINE
68
69
/* Standard library includes for perf jitdump implementation */
70
#if defined(__linux__)
71
#  include <elf.h>                // ELF architecture constants
72
#endif
73
#include <fcntl.h>                // File control operations
74
#include <stdio.h>                // Standard I/O operations
75
#include <stdlib.h>               // Standard library functions
76
#include <sys/mman.h>             // Memory mapping functions (mmap)
77
#include <sys/types.h>            // System data types
78
#include <unistd.h>               // System calls (sysconf, getpid)
79
#include <sys/time.h>             // Time functions (gettimeofday)
80
#if defined(__linux__)
81
#  include <sys/syscall.h>        // System call interface
82
#endif
83
84
// =============================================================================
85
//                           CONSTANTS AND CONFIGURATION
86
// =============================================================================
87
88
/*
89
 * Memory layout considerations for perf jitdump:
90
 *
91
 * Perf expects non-overlapping memory regions for each JIT-compiled function.
92
 * When perf processes the jitdump file, it creates synthetic DSO (Dynamic
93
 * Shared Object) files that contain:
94
 * - ELF headers
95
 * - .text section (actual machine code)
96
 * - Unwind information (for stack traces)
97
 *
98
 * To ensure proper address space layout, we add padding between code regions.
99
 * This prevents address conflicts when perf maps the synthesized DSOs.
100
 *
101
 * Memory layout example:
102
 * /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding]
103
 * /tmp/jitted-PID-1.so:                                       [headers][.text][unwind_info][padding]
104
 *
105
 * The padding size is now calculated automatically during initialization
106
 * based on the actual unwind information requirements.
107
 */
108
109
110
/* These constants are defined inside <elf.h>, which we can't use outside of linux. */
111
#if !defined(__linux__)
112
#  if defined(__i386__) || defined(_M_IX86)
113
#    define EM_386      3
114
#  elif defined(__arm__) || defined(_M_ARM)
115
#    define EM_ARM      40
116
#  elif defined(__x86_64__) || defined(_M_X64)
117
#    define EM_X86_64   62
118
#  elif defined(__aarch64__)
119
#    define EM_AARCH64  183
120
#  elif defined(__riscv)
121
#    define EM_RISCV    243
122
#  endif
123
#endif
124
125
/* Convenient access to the global trampoline API state */
126
0
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api
127
128
/* Type aliases for clarity and portability */
129
typedef uint64_t uword;                    // Word-sized unsigned integer
130
typedef const char* CodeComments;          // Code comment strings
131
132
/* Memory size constants */
133
0
#define MB (1024 * 1024)                   // 1 Megabyte for buffer sizing
134
135
// =============================================================================
136
//                        ARCHITECTURE-SPECIFIC DEFINITIONS
137
// =============================================================================
138
139
/*
 * Report the ELF machine (EM_*) constant for the architecture this file is
 * being compiled for. The jitdump header stores this value in its
 * elf_mach_target field so that perf knows which architecture the dumped
 * code targets.
 *
 * The selection is made entirely at compile time from predefined compiler
 * macros; the checks are ordered x86_64, i386, aarch64, arm, riscv.
 *
 * Returns: the matching EM_* value. When no architecture macro matches,
 *          Py_UNREACHABLE() is invoked (the trailing 0 is only a fallback).
 */
static uint64_t GetElfMachineArchitecture(void) {
#if defined(__x86_64__) || defined(_M_X64)
    const uint64_t elf_machine = EM_X86_64;
#elif defined(__i386__) || defined(_M_IX86)
    const uint64_t elf_machine = EM_386;
#elif defined(__aarch64__)
    const uint64_t elf_machine = EM_AARCH64;
#elif defined(__arm__) || defined(_M_ARM)
    const uint64_t elf_machine = EM_ARM;
#elif defined(__riscv)
    const uint64_t elf_machine = EM_RISCV;
#else
    /* Unsupported architecture - should never reach here */
    Py_UNREACHABLE();
    const uint64_t elf_machine = 0;
#endif
    return elf_machine;
}
161
162
// =============================================================================
163
//                           PERF JITDUMP DATA STRUCTURES
164
// =============================================================================
165
166
/*
167
 * Perf jitdump file format structures
168
 *
169
 * These structures define the binary format that perf expects for JIT dump files.
170
 * The format is documented in the Linux perf tools source code and must match
171
 * exactly for proper perf integration.
172
 */
173
174
/*
175
 * Jitdump file header - written once at the beginning of each jitdump file
176
 * Contains metadata about the process and jitdump format version
177
 */
178
typedef struct {
179
    uint32_t magic;              // Magic number (0x4A695444 = "JiTD")
180
    uint32_t version;            // Jitdump format version (currently 1)
181
    uint32_t size;               // Size of this header structure
182
    uint32_t elf_mach_target;    // Target architecture (from GetElfMachineArchitecture)
183
    uint32_t reserved;           // Reserved field (must be 0)
184
    uint32_t process_id;         // Process ID of the JIT compiler
185
    uint64_t time_stamp;         // Timestamp when jitdump was created
186
    uint64_t flags;              // Feature flags (currently unused)
187
} Header;
188
189
/*
190
 * Perf event types supported by the jitdump format
191
 * Each event type has a corresponding structure format
192
 */
193
enum PerfEvent {
194
    PerfLoad = 0,           // Code load event (new JIT function)
195
    PerfMove = 1,           // Code move event (function relocated)
196
    PerfDebugInfo = 2,      // Debug information event
197
    PerfClose = 3,          // JIT session close event
198
    PerfUnwindingInfo = 4   // Stack unwinding information event
199
};
200
201
/*
202
 * Base event structure - common header for all perf events
203
 * Every event in the jitdump file starts with this structure
204
 */
205
struct BaseEvent {
206
    uint32_t event;         // Event type (from PerfEvent enum)
207
    uint32_t size;          // Total size of this event including payload
208
    uint64_t time_stamp;    // Timestamp when event occurred
209
};
210
211
/*
212
 * Code load event - indicates a new JIT-compiled function is available
213
 * This is the most important event type for Python profiling
214
 */
215
typedef struct {
216
    struct BaseEvent base;   // Common event header
217
    uint32_t process_id;     // Process ID where code was generated
218
#if defined(__APPLE__)
219
    uint64_t thread_id;      // Thread ID where code was generated
220
#else
221
    uint32_t thread_id;      // Thread ID where code was generated
222
#endif
223
    uint64_t vma;            // Virtual memory address where code is loaded
224
    uint64_t code_address;   // Address of the actual machine code
225
    uint64_t code_size;      // Size of the machine code in bytes
226
    uint64_t code_id;        // Unique identifier for this code region
227
    /* Followed by:
228
     * - null-terminated function name string
229
     * - raw machine code bytes
230
     */
231
} CodeLoadEvent;
232
233
/*
234
 * Code unwinding information event - provides DWARF data for stack traces
235
 * Essential for proper stack unwinding during profiling
236
 */
237
typedef struct {
238
    struct BaseEvent base;      // Common event header
239
    uint64_t unwind_data_size;  // Size of the unwinding data
240
    uint64_t eh_frame_hdr_size; // Size of the EH frame header
241
    uint64_t mapped_size;       // Total mapped size (with padding)
242
    /* Followed by:
243
     * - EH frame header
244
     * - DWARF unwinding information
245
     * - Padding to alignment boundary
246
     */
247
} CodeUnwindingInfoEvent;
248
249
// =============================================================================
250
//                              GLOBAL STATE MANAGEMENT
251
// =============================================================================
252
253
/*
254
 * Global state for the perf jitdump implementation
255
 *
256
 * This structure maintains all the state needed for generating jitdump files.
257
 * It's designed as a singleton since there's typically only one jitdump file
258
 * per Python process.
259
 */
260
typedef struct {
261
    FILE* perf_map;          // File handle for the jitdump file
262
    PyThread_type_lock map_lock;  // Thread synchronization lock
263
    void* mapped_buffer;     // Memory-mapped region (signals perf we're active)
264
    size_t mapped_size;      // Size of the mapped region
265
    int code_id;             // Counter for unique code region identifiers
266
} PerfMapJitState;
267
268
/* Global singleton instance */
269
static PerfMapJitState perf_jit_map_state;
270
271
// =============================================================================
272
//                              TIME UTILITIES
273
// =============================================================================
274
275
/* Time conversion constant */
276
static const intptr_t nanoseconds_per_second = 1000000000;
277
278
/*
279
 * Get current monotonic time in nanoseconds
280
 *
281
 * Monotonic time is preferred for event timestamps because it's not affected
282
 * by system clock adjustments. This ensures consistent timing relationships
283
 * between events even if the system clock is changed.
284
 *
285
 * Returns: Current monotonic time in nanoseconds since an arbitrary epoch
286
 */
287
0
static int64_t get_current_monotonic_ticks(void) {
288
0
    struct timespec ts;
289
0
    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
290
0
        Py_UNREACHABLE();  // Should never fail on supported systems
291
0
        return 0;
292
0
    }
293
294
    /* Convert to nanoseconds for maximum precision */
295
0
    int64_t result = ts.tv_sec;
296
0
    result *= nanoseconds_per_second;
297
0
    result += ts.tv_nsec;
298
0
    return result;
299
0
}
300
301
/*
 * Read the wall clock via gettimeofday() and express it in microseconds.
 *
 * This value is used for the jitdump file header timestamp; being wall
 * clock time, it can be correlated with other system events.
 *
 * Returns: tv_sec * 1,000,000 + tv_usec, i.e. microseconds since the Unix
 *          epoch. A failing gettimeofday() is treated as unreachable.
 */
static int64_t get_current_time_microseconds(void) {
    struct timeval now;

    if (gettimeofday(&now, NULL) >= 0) {
        /* Widen the seconds to 64 bits before scaling to microseconds */
        const int64_t seconds_as_usec = (int64_t)(now.tv_sec) * 1000000;
        return seconds_as_usec + now.tv_usec;
    }

    Py_UNREACHABLE();  // Should never fail on supported systems
    return 0;
}
318
319
// =============================================================================
320
//                              UTILITY FUNCTIONS
321
// =============================================================================
322
323
/*
 * Round a value up to the next multiple of a given number.
 *
 * Used to satisfy the alignment requirements of the jitdump format, where
 * sizes are padded to fixed boundaries (typically 8 or 16 bytes).
 *
 * Args:
 *   value: The value to round up
 *   multiple: The multiple to round up to; 0 means "no rounding"
 *
 * Returns: 'value' unchanged when 'multiple' is 0 or 'value' is already a
 *          multiple of it; otherwise 'value' plus the distance to the next
 *          multiple. The result is converted to size_t on return.
 */
static size_t round_up(int64_t value, int64_t multiple) {
    if (multiple != 0) {  // A zero multiple would divide by zero below
        const int64_t excess = value % multiple;
        if (excess != 0) {
            /* Add the gap between 'value' and the next multiple */
            value += multiple - excess;
        }
    }
    return value;
}
352
353
// =============================================================================
354
//                              FILE I/O UTILITIES
355
// =============================================================================
356
357
/*
358
 * Write data to the jitdump file with error handling
359
 *
360
 * This function ensures that all data is written to the file, handling
361
 * partial writes that can occur with large buffers or when the system
362
 * is under load.
363
 *
364
 * Args:
365
 *   buffer: Pointer to data to write
366
 *   size: Number of bytes to write
367
 */
368
0
static void perf_map_jit_write_fully(const void* buffer, size_t size) {
369
0
    FILE* out_file = perf_jit_map_state.perf_map;
370
0
    const char* ptr = (const char*)(buffer);
371
372
0
    while (size > 0) {
373
0
        const size_t written = fwrite(ptr, 1, size, out_file);
374
0
        if (written == 0) {
375
0
            Py_UNREACHABLE();  // Write failure - should be very rare
376
0
            break;
377
0
        }
378
0
        size -= written;
379
0
        ptr += written;
380
0
    }
381
0
}
382
383
/*
384
 * Write the jitdump file header
385
 *
386
 * The header must be written exactly once at the beginning of each jitdump
387
 * file. It provides metadata that perf uses to parse the rest of the file.
388
 *
389
 * Args:
390
 *   pid: Process ID to include in the header
391
 *   out_file: File handle to write to (currently unused, uses global state)
392
 */
393
0
static void perf_map_jit_write_header(int pid, FILE* out_file) {
394
0
    Header header;
395
396
    /* Initialize header with required values */
397
0
    header.magic = 0x4A695444;                    // "JiTD" magic number
398
0
    header.version = 1;                           // Current jitdump version
399
0
    header.size = sizeof(Header);                 // Header size for validation
400
0
    header.elf_mach_target = GetElfMachineArchitecture();  // Target architecture
401
0
    header.reserved = 0;                          // padding reserved for future use
402
0
    header.process_id = pid;                      // Process identifier
403
0
    header.time_stamp = get_current_time_microseconds();   // Creation time
404
0
    header.flags = 0;                             // No special flags currently used
405
406
0
    perf_map_jit_write_fully(&header, sizeof(header));
407
0
}
408
409
// =============================================================================
410
//                              DWARF CONSTANTS AND UTILITIES
411
// =============================================================================
412
413
/*
414
 * DWARF (Debug With Arbitrary Record Formats) constants
415
 *
416
 * DWARF is a debugging data format used to provide stack unwinding information.
417
 * These constants define the various encoding types and opcodes used in
418
 * DWARF Call Frame Information (CFI) records.
419
 */
420
421
/* DWARF Call Frame Information version */
422
#define DWRF_CIE_VERSION 1
423
424
/* DWARF CFA (Call Frame Address) opcodes */
425
enum {
426
    DWRF_CFA_nop = 0x0,                    // No operation
427
    DWRF_CFA_offset_extended = 0x5,        // Extended offset instruction
428
    DWRF_CFA_def_cfa = 0xc,               // Define CFA rule
429
    DWRF_CFA_def_cfa_register = 0xd,      // Define CFA register
430
    DWRF_CFA_def_cfa_offset = 0xe,        // Define CFA offset
431
    DWRF_CFA_offset_extended_sf = 0x11,   // Extended signed offset
432
    DWRF_CFA_advance_loc = 0x40,          // Advance location counter
433
    DWRF_CFA_offset = 0x80,               // Simple offset instruction
434
    DWRF_CFA_restore = 0xc0               // Restore register
435
};
436
437
/* DWARF Exception Handling pointer encodings */
438
enum {
439
    DWRF_EH_PE_absptr = 0x00,             // Absolute pointer
440
    DWRF_EH_PE_omit = 0xff,               // Omitted value
441
442
    /* Data type encodings */
443
    DWRF_EH_PE_uleb128 = 0x01,            // Unsigned LEB128
444
    DWRF_EH_PE_udata2 = 0x02,             // Unsigned 2-byte
445
    DWRF_EH_PE_udata4 = 0x03,             // Unsigned 4-byte
446
    DWRF_EH_PE_udata8 = 0x04,             // Unsigned 8-byte
447
    DWRF_EH_PE_sleb128 = 0x09,            // Signed LEB128
448
    DWRF_EH_PE_sdata2 = 0x0a,             // Signed 2-byte
449
    DWRF_EH_PE_sdata4 = 0x0b,             // Signed 4-byte
450
    DWRF_EH_PE_sdata8 = 0x0c,             // Signed 8-byte
451
    DWRF_EH_PE_signed = 0x08,             // Signed flag
452
453
    /* Reference type encodings */
454
    DWRF_EH_PE_pcrel = 0x10,              // PC-relative
455
    DWRF_EH_PE_textrel = 0x20,            // Text-relative
456
    DWRF_EH_PE_datarel = 0x30,            // Data-relative
457
    DWRF_EH_PE_funcrel = 0x40,            // Function-relative
458
    DWRF_EH_PE_aligned = 0x50,            // Aligned
459
    DWRF_EH_PE_indirect = 0x80            // Indirect
460
};
461
462
/* Additional DWARF constants for debug information */
463
enum { DWRF_TAG_compile_unit = 0x11 };
464
enum { DWRF_children_no = 0, DWRF_children_yes = 1 };
465
enum {
466
    DWRF_AT_name = 0x03,         // Name attribute
467
    DWRF_AT_stmt_list = 0x10,    // Statement list
468
    DWRF_AT_low_pc = 0x11,       // Low PC address
469
    DWRF_AT_high_pc = 0x12       // High PC address
470
};
471
enum {
472
    DWRF_FORM_addr = 0x01,       // Address form
473
    DWRF_FORM_data4 = 0x06,      // 4-byte data
474
    DWRF_FORM_string = 0x08      // String form
475
};
476
477
/* Line number program opcodes */
478
enum {
479
    DWRF_LNS_extended_op = 0,    // Extended opcode
480
    DWRF_LNS_copy = 1,           // Copy operation
481
    DWRF_LNS_advance_pc = 2,     // Advance program counter
482
    DWRF_LNS_advance_line = 3    // Advance line number
483
};
484
485
/* Line number extended opcodes */
486
enum {
487
    DWRF_LNE_end_sequence = 1,   // End of sequence
488
    DWRF_LNE_set_address = 2     // Set address
489
};
490
491
/*
492
 * Architecture-specific DWARF register numbers
493
 *
494
 * These constants define the register numbering scheme used by DWARF
495
 * for each supported architecture. The numbers must match the ABI
496
 * specification for proper stack unwinding.
497
 */
498
enum {
499
#ifdef __x86_64__
500
    /* x86_64 register numbering (note: order is defined by x86_64 ABI) */
501
    DWRF_REG_AX,    // RAX
502
    DWRF_REG_DX,    // RDX
503
    DWRF_REG_CX,    // RCX
504
    DWRF_REG_BX,    // RBX
505
    DWRF_REG_SI,    // RSI
506
    DWRF_REG_DI,    // RDI
507
    DWRF_REG_BP,    // RBP
508
    DWRF_REG_SP,    // RSP
509
    DWRF_REG_8,     // R8
510
    DWRF_REG_9,     // R9
511
    DWRF_REG_10,    // R10
512
    DWRF_REG_11,    // R11
513
    DWRF_REG_12,    // R12
514
    DWRF_REG_13,    // R13
515
    DWRF_REG_14,    // R14
516
    DWRF_REG_15,    // R15
517
    DWRF_REG_RA,    // Return address (RIP)
518
#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
519
    /* AArch64 register numbering */
520
    DWRF_REG_FP = 29,  // Frame Pointer
521
    DWRF_REG_RA = 30,  // Link register (return address)
522
    DWRF_REG_SP = 31,  // Stack pointer
523
#else
524
#    error "Unsupported target architecture"
525
#endif
526
};
527
528
/* DWARF encoding constants used in EH frame headers */
529
static const uint8_t DwarfUData4 = 0x03;     // Unsigned 4-byte data
530
static const uint8_t DwarfSData4 = 0x0b;     // Signed 4-byte data
531
static const uint8_t DwarfPcRel = 0x10;      // PC-relative encoding
532
static const uint8_t DwarfDataRel = 0x30;    // Data-relative encoding
533
534
// =============================================================================
535
//                              ELF OBJECT CONTEXT
536
// =============================================================================
537
538
/*
539
 * Context for building ELF/DWARF structures
540
 *
541
 * This structure maintains state while constructing DWARF unwind information.
542
 * It acts as a simple buffer manager with pointers to track current position
543
 * and important landmarks within the buffer.
544
 */
545
typedef struct ELFObjectContext {
546
    uint8_t* p;            // Current write position in buffer
547
    uint8_t* startp;       // Start of buffer (for offset calculations)
548
    uint8_t* eh_frame_p;   // Start of EH frame data (for relative offsets)
549
    uint8_t* fde_p;        // Start of FDE data (for PC-relative calculations)
550
    uint32_t code_size;    // Size of the code being described
551
} ELFObjectContext;
552
553
/*
554
 * EH Frame Header structure for DWARF unwinding
555
 *
556
 * This structure provides metadata about the DWARF unwinding information
557
 * that follows. It's required by the perf jitdump format to enable proper
558
 * stack unwinding during profiling.
559
 */
560
typedef struct {
561
    unsigned char version;           // EH frame version (always 1)
562
    unsigned char eh_frame_ptr_enc;  // Encoding of EH frame pointer
563
    unsigned char fde_count_enc;     // Encoding of FDE count
564
    unsigned char table_enc;         // Encoding of table entries
565
    int32_t eh_frame_ptr;           // Pointer to EH frame data
566
    int32_t eh_fde_count;           // Number of FDEs (Frame Description Entries)
567
    int32_t from;                   // Start address of code range
568
    int32_t to;                     // End address of code range
569
} EhFrameHeader;
570
571
// =============================================================================
572
//                              DWARF GENERATION UTILITIES
573
// =============================================================================
574
575
/*
576
 * Append a null-terminated string to the ELF context buffer
577
 *
578
 * Args:
579
 *   ctx: ELF object context
580
 *   str: String to append (must be null-terminated)
581
 *
582
 * Returns: Offset from start of buffer where string was written
583
 */
584
0
static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) {
585
0
    uint8_t* p = ctx->p;
586
0
    uint32_t ofs = (uint32_t)(p - ctx->startp);
587
588
    /* Copy string including null terminator */
589
0
    do {
590
0
        *p++ = (uint8_t)*str;
591
0
    } while (*str++);
592
593
0
    ctx->p = p;
594
0
    return ofs;
595
0
}
596
597
/*
598
 * Append a SLEB128 (Signed Little Endian Base 128) value
599
 *
600
 * SLEB128 is a variable-length encoding used extensively in DWARF.
601
 * It efficiently encodes small numbers in fewer bytes.
602
 *
603
 * Args:
604
 *   ctx: ELF object context
605
 *   v: Signed value to encode
606
 */
607
0
static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) {
608
0
    uint8_t* p = ctx->p;
609
610
    /* Encode 7 bits at a time, with continuation bit in MSB */
611
0
    for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) {
612
0
        *p++ = (uint8_t)((v & 0x7f) | 0x80);  // Set continuation bit
613
0
    }
614
0
    *p++ = (uint8_t)(v & 0x7f);  // Final byte without continuation bit
615
616
0
    ctx->p = p;
617
0
}
618
619
/*
620
 * Append a ULEB128 (Unsigned Little Endian Base 128) value
621
 *
622
 * Similar to SLEB128 but for unsigned values.
623
 *
624
 * Args:
625
 *   ctx: ELF object context
626
 *   v: Unsigned value to encode
627
 */
628
0
static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) {
629
0
    uint8_t* p = ctx->p;
630
631
    /* Encode 7 bits at a time, with continuation bit in MSB */
632
0
    for (; v >= 0x80; v >>= 7) {
633
0
        *p++ = (char)((v & 0x7f) | 0x80);  // Set continuation bit
634
0
    }
635
0
    *p++ = (char)v;  // Final byte without continuation bit
636
637
0
    ctx->p = p;
638
0
}
639
640
/*
641
 * Macros for generating DWARF structures
642
 *
643
 * These macros provide a convenient way to write various data types
644
 * to the DWARF buffer while automatically advancing the pointer.
645
 */
646
#define DWRF_U8(x) (*p++ = (x))                                    // Write unsigned 8-bit
647
#define DWRF_I8(x) (*(int8_t*)p = (x), p++)                       // Write signed 8-bit
648
#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2)                 // Write unsigned 16-bit
649
#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4)                 // Write unsigned 32-bit
650
#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) // Write address
651
#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) // Write ULEB128
652
#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) // Write SLEB128
653
#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) // Write string
654
655
/* Align to specified boundary with NOP instructions */
656
#define DWRF_ALIGNNOP(s)                                          \
657
    while ((uintptr_t)p & ((s)-1)) {                              \
658
        *p++ = DWRF_CFA_nop;                                       \
659
    }
660
661
/* Write a DWARF section with automatic size calculation */
662
#define DWRF_SECTION(name, stmt)                                  \
663
0
    {                                                             \
664
0
        uint32_t* szp_##name = (uint32_t*)p;                      \
665
0
        p += 4;                                                   \
666
0
        stmt;                                                     \
667
0
        *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \
668
0
    }
669
670
// =============================================================================
671
//                              DWARF EH FRAME GENERATION
672
// =============================================================================
673
674
static void elf_init_ehframe(ELFObjectContext* ctx);
675
676
/*
677
 * elf_init_ehframe(): initialize the DWARF .eh_frame section for a code region
678
 *
679
 * The .eh_frame section contains Call Frame Information (CFI) that describes
680
 * how to unwind the stack at any point in the code. This is essential for
681
 * proper profiling as it allows perf to generate accurate call graphs.
682
 *
683
 * The function generates two main components:
684
 * 1. CIE (Common Information Entry) - describes calling conventions
685
 * 2. FDE (Frame Description Entry) - describes specific function unwinding
686
 *
687
 * Args:
688
 *   ctx: ELF object context containing code size and buffer pointers
689
 */
690
0
static size_t calculate_eh_frame_size(void) {
691
    /* Calculate the EH frame size for the trampoline function */
692
0
    extern void *_Py_trampoline_func_start;
693
0
    extern void *_Py_trampoline_func_end;
694
695
0
    size_t code_size = (char*)&_Py_trampoline_func_end - (char*)&_Py_trampoline_func_start;
696
697
0
    ELFObjectContext ctx;
698
0
    char buffer[1024];  // Buffer for DWARF data (1KB should be sufficient)
699
0
    ctx.code_size = code_size;
700
0
    ctx.startp = ctx.p = (uint8_t*)buffer;
701
0
    ctx.fde_p = NULL;
702
703
0
    elf_init_ehframe(&ctx);
704
0
    return ctx.p - ctx.startp;
705
0
}
706
707
0
static void elf_init_ehframe(ELFObjectContext* ctx) {
    /*
     * Writes one CIE followed by one FDE starting at ctx->p.
     *
     * On return:
     *   - ctx->p points just past the emitted data,
     *   - ctx->eh_frame_p points at the start of the FDE,
     *   - ctx->fde_p points at the FDE's PC-offset field, which has been
     *     patched with the final PC-relative offset.
     *
     * ctx->startp and ctx->code_size must be set by the caller.
     */
    uint8_t* p = ctx->p;
    uint8_t* framep = p;  // Remember start of frame data

    /*
    * DWARF Unwind Table for Trampoline Function
    *
    * This section defines DWARF Call Frame Information (CFI) using encoded macros
    * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function
    * preserves and restores registers. This is used by profiling tools (e.g., `perf`)
    * and debuggers for stack unwinding in JIT-compiled code.
    *
    * -------------------------------------------------
    * TO REGENERATE THIS TABLE FROM GCC OBJECTS:
    * -------------------------------------------------
    *
    * 1. Create a trampoline source file (e.g., `trampoline.c`):
    *
    *      #include <Python.h>
    *      typedef PyObject* (*py_evaluator)(void*, void*, int);
    *      PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) {
    *          return evaluator(ts, f, throwflag);
    *      }
    *
    * 2. Compile to an object file with frame pointer preservation:
    *
    *      gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c
    *
    * 3. Extract DWARF unwind info from the object file:
    *
    *      readelf -w trampoline.o
    *
    *    Example output from `.eh_frame`:
    *
    *      00000000 CIE
    *        Version:               1
    *        Augmentation:          "zR"
    *        Code alignment factor: 4
    *        Data alignment factor: -8
    *        Return address column: 30
    *        DW_CFA_def_cfa: r31 (sp) ofs 0
    *
    *      00000014 FDE cie=00000000 pc=0..14
    *        DW_CFA_advance_loc: 4
    *        DW_CFA_def_cfa_offset: 16
    *        DW_CFA_offset: r29 at cfa-16
    *        DW_CFA_offset: r30 at cfa-8
    *        DW_CFA_advance_loc: 12
    *        DW_CFA_restore: r30
    *        DW_CFA_restore: r29
    *        DW_CFA_def_cfa_offset: 0
    *
    * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`.
    *
    * ----------------------------------
    * HOW TO TRANSLATE TO DWRF_* MACROS:
    * ----------------------------------
    *
    * After compiling your trampoline with:
    *
    *     gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c
    *
    * run:
    *
    *     readelf -w trampoline.o
    *
    * to inspect the generated `.eh_frame` data. You will see two main components:
    *
    *     1. A CIE (Common Information Entry): shared configuration used by all FDEs.
    *     2. An FDE (Frame Description Entry): function-specific unwind instructions.
    *
    * ---------------------
    * Translating the CIE:
    * ---------------------
    * From `readelf -w`, you might see:
    *
    *   00000000 0000000000000010 00000000 CIE
    *     Version:               1
    *     Augmentation:          "zR"
    *     Code alignment factor: 4
    *     Data alignment factor: -8
    *     Return address column: 30
    *     Augmentation data:     1b
    *     DW_CFA_def_cfa: r31 (sp) ofs 0
    *
    * Map this to:
    *
    *     DWRF_SECTION(CIE,
    *         DWRF_U32(0);                             // CIE ID (always 0 for CIEs)
    *         DWRF_U8(DWRF_CIE_VERSION);              // Version: 1
    *         DWRF_STR("zR");                         // Augmentation string "zR"
    *         DWRF_UV(4);                             // Code alignment factor = 4
    *         DWRF_SV(-8);                            // Data alignment factor = -8
    *         DWRF_U8(DWRF_REG_RA);                   // Return address register (e.g., x30 = 30)
    *         DWRF_UV(1);                             // Augmentation data length = 1
    *         DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers
    *
    *         DWRF_U8(DWRF_CFA_def_cfa);              // DW_CFA_def_cfa
    *         DWRF_UV(DWRF_REG_SP);                   // Register: SP (r31)
    *         DWRF_UV(0);                             // Offset = 0
    *
    *         DWRF_ALIGNNOP(sizeof(uintptr_t));       // Align to pointer size boundary
    *     )
    *
    * Notes:
    *   - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128.
    *   - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants.
    *
    * ---------------------
    * Translating the FDE:
    * ---------------------
    * From `readelf -w`:
    *
    *   00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014
    *     DW_CFA_advance_loc: 4
    *     DW_CFA_def_cfa_offset: 16
    *     DW_CFA_offset: r29 at cfa-16
    *     DW_CFA_offset: r30 at cfa-8
    *     DW_CFA_advance_loc: 12
    *     DW_CFA_restore: r30
    *     DW_CFA_restore: r29
    *     DW_CFA_def_cfa_offset: 0
    *
    * Map the FDE header and instructions to:
    *
    *     DWRF_SECTION(FDE,
    *         DWRF_U32((uint32_t)(p - framep));       // Offset to CIE (relative from here)
    *         DWRF_U32(pc_relative_offset);           // PC-relative location of the code (calculated dynamically)
    *         DWRF_U32(ctx->code_size);               // Code range covered by this FDE
    *         DWRF_U8(0);                             // Augmentation data length (none)
    *
    *         DWRF_U8(DWRF_CFA_advance_loc | 1);      // Advance location by 1 unit (1 * 4 = 4 bytes)
    *         DWRF_U8(DWRF_CFA_def_cfa_offset);       // CFA = SP + 16
    *         DWRF_UV(16);
    *
    *         DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer)
    *         DWRF_UV(2);                             // At offset 2 * 8 = 16 bytes
    *
    *         DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address)
    *         DWRF_UV(1);                             // At offset 1 * 8 = 8 bytes
    *
    *         DWRF_U8(DWRF_CFA_advance_loc | 3);      // Advance location by 3 units (3 * 4 = 12 bytes)
    *
    *         DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA); // Restore x30 (no operand follows)
    *         DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP); // Restore x29 (no operand follows)
    *
    *         DWRF_U8(DWRF_CFA_def_cfa_offset);       // CFA = SP
    *         DWRF_UV(0);
    *     )
    *
    * To regenerate:
    *   1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE.
    *   2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as
    *      the code is in a different address space every time.
    *   3. For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro:
    *        - `DW_CFA_def_cfa_offset`     → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value)
    *        - `DW_CFA_offset: rX`         → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset)
    *        - `DW_CFA_restore: rX`        → DWRF_U8(DWRF_CFA_restore | reg) // single byte, no operand
    *        - `DW_CFA_advance_loc: N`     → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor))
    *   4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers.
    *   5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment.
    */

    /*
     * Emit DWARF EH CIE (Common Information Entry)
     *
     * The CIE describes the calling conventions and basic unwinding rules
     * that apply to all functions in this compilation unit.
     */
    DWRF_SECTION(CIE,
        DWRF_U32(0);                           // CIE ID (0 indicates this is a CIE)
        DWRF_U8(DWRF_CIE_VERSION);            // CIE version (1)
        DWRF_STR("zR");                       // Augmentation string ('z' = augmentation data present,
                                              // 'R' = it holds the FDE pointer encoding)
#ifdef __x86_64__
        DWRF_UV(1);                           // Code alignment factor (x86_64: 1 byte)
#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
        DWRF_UV(4);                           // Code alignment factor (AArch64: 4 bytes per instruction)
#endif
        DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative)
        DWRF_U8(DWRF_REG_RA);                 // Return address register number
        DWRF_UV(1);                           // Augmentation data length
        DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // FDE pointer encoding

        /* Initial CFI instructions - describe default calling convention */
#ifdef __x86_64__
        /* x86_64 initial CFI state */
        DWRF_U8(DWRF_CFA_def_cfa);            // Define CFA (Call Frame Address)
        DWRF_UV(DWRF_REG_SP);                 // CFA = SP register
        DWRF_UV(sizeof(uintptr_t));           // CFA = SP + pointer_size
        DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved
        DWRF_UV(1);                           // At offset 1 from CFA
#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
        /* AArch64 initial CFI state */
        DWRF_U8(DWRF_CFA_def_cfa);            // Define CFA (Call Frame Address)
        DWRF_UV(DWRF_REG_SP);                 // CFA = SP register
        DWRF_UV(0);                           // CFA = SP + 0 (AArch64 starts with offset 0)
        // No initial register saves in AArch64 CIE
#endif
        DWRF_ALIGNNOP(sizeof(uintptr_t));     // Align to pointer boundary
    )

    ctx->eh_frame_p = p;  // Remember start of FDE data

    /*
     * Emit DWARF EH FDE (Frame Description Entry)
     *
     * The FDE describes unwinding information specific to this function.
     * It references the CIE and provides function-specific CFI instructions.
     *
     * The PC-relative offset is calculated after the entire EH frame is built
     * to ensure accurate positioning relative to the synthesized DSO layout.
     */
    DWRF_SECTION(FDE,
        DWRF_U32((uint32_t)(p - framep));     // Offset to CIE (backwards reference)
        ctx->fde_p = p;                        // Remember where PC offset field is located for later calculation
        DWRF_U32(0);                           // Placeholder for PC-relative offset (calculated at end of elf_init_ehframe)
        DWRF_U32(ctx->code_size);             // Address range covered by this FDE (code length)
        DWRF_U8(0);                           // Augmentation data length (none)

        /*
         * Architecture-specific CFI instructions
         *
         * These instructions describe how registers are saved and restored
         * during function calls. Each architecture has different calling
         * conventions and register usage patterns.
         */
#ifdef __x86_64__
        /* x86_64 calling convention unwinding rules with frame pointer */
#  if defined(__CET__) && (__CET__ & 1)
        DWRF_U8(DWRF_CFA_advance_loc | 4);    // Advance past endbr64 (4 bytes)
#  endif
        DWRF_U8(DWRF_CFA_advance_loc | 1);    // Advance past push %rbp (1 byte)
        DWRF_U8(DWRF_CFA_def_cfa_offset);     // def_cfa_offset 16
        DWRF_UV(16);                          // New offset: SP + 16
        DWRF_U8(DWRF_CFA_offset | DWRF_REG_BP); // offset r6 at cfa-16
        DWRF_UV(2);                           // Offset factor: 2 * 8 = 16 bytes
        DWRF_U8(DWRF_CFA_advance_loc | 3);    // Advance past mov %rsp,%rbp (3 bytes)
        DWRF_U8(DWRF_CFA_def_cfa_register);   // def_cfa_register r6
        DWRF_UV(DWRF_REG_BP);                 // Use base pointer register
        DWRF_U8(DWRF_CFA_advance_loc | 3);    // Advance past call *%rcx (2 bytes) + pop %rbp (1 byte) = 3
        DWRF_U8(DWRF_CFA_def_cfa);            // def_cfa r7 ofs 8
        DWRF_UV(DWRF_REG_SP);                 // Use stack pointer register
        DWRF_UV(8);                           // New offset: SP + 8
#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
        /* AArch64 calling convention unwinding rules */
        DWRF_U8(DWRF_CFA_advance_loc | 1);        // Advance by 1 instruction (4 bytes)
        DWRF_U8(DWRF_CFA_def_cfa_offset);         // CFA = SP + 16
        DWRF_UV(16);                              // Stack pointer moved by 16 bytes
        DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP);   // x29 (frame pointer) saved
        DWRF_UV(2);                               // At CFA-16 (2 * 8 = 16 bytes from CFA)
        DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);   // x30 (link register) saved
        DWRF_UV(1);                               // At CFA-8 (1 * 8 = 8 bytes from CFA)
        DWRF_U8(DWRF_CFA_advance_loc | 3);        // Advance by 3 instructions (12 bytes)
        DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA);  // Restore x30 - NO DWRF_UV() after this!
        DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP);  // Restore x29 - NO DWRF_UV() after this!
        DWRF_U8(DWRF_CFA_def_cfa_offset);         // CFA = SP + 0 (stack restored)
        DWRF_UV(0);                               // Back to original stack position
#else
#    error "Unsupported target architecture"
#endif

        DWRF_ALIGNNOP(sizeof(uintptr_t));     // Align to pointer boundary
    )

    ctx->p = p;  // Update context pointer to end of generated data

    /* Calculate and update the PC-relative offset in the FDE
     *
     * When perf processes the jitdump, it creates a synthesized DSO with this layout:
     *
     *     Synthesized DSO Memory Layout:
     *     ┌─────────────────────────────────────────────────────────────┐ < code_start
     *     │                        Code Section                         │
     *     │                    (round_up(code_size, 8) bytes)           │
     *     ├─────────────────────────────────────────────────────────────┤ < start of EH frame data
     *     │                      EH Frame Data                          │
     *     │  ┌─────────────────────────────────────────────────────┐    │
     *     │  │                 CIE data                            │    │
     *     │  └─────────────────────────────────────────────────────┘    │
     *     │  ┌─────────────────────────────────────────────────────┐    │
     *     │  │ FDE Header:                                         │    │
     *     │  │   - CIE offset (4 bytes)                            │    │
     *     │  │   - PC offset (4 bytes) <─ fde_offset_in_frame ─────┼────┼─> points to code_start
     *     │  │   - address range (4 bytes)                         │    │   (this specific field)
     *     │  │ CFI Instructions...                                 │    │
     *     │  └─────────────────────────────────────────────────────┘    │
     *     ├─────────────────────────────────────────────────────────────┤ < reference_point
     *     │                    EhFrameHeader                            │
     *     │                 (navigation metadata)                       │
     *     └─────────────────────────────────────────────────────────────┘
     *
     * The PC offset field in the FDE must contain the distance from itself to code_start:
     *
     *   distance = code_start - fde_pc_field
     *
     * Where:
     *   fde_pc_field_location = reference_point - eh_frame_size + fde_offset_in_frame
     *   code_start_location = reference_point - eh_frame_size - round_up(code_size, 8)
     *
     * Therefore:
     *   distance = code_start_location - fde_pc_field_location
     *            = (ref - eh_frame_size - rounded_code_size) - (ref - eh_frame_size + fde_offset_in_frame)
     *            = -rounded_code_size - fde_offset_in_frame
     *            = -(round_up(code_size, 8) + fde_offset_in_frame)
     *
     * Note: fde_offset_in_frame is the offset from EH frame start (ctx->startp)
     * to the PC offset field (ctx->fde_p).
     */
    if (ctx->fde_p != NULL) {
        int32_t fde_offset_in_frame = (ctx->fde_p - ctx->startp);
        int32_t rounded_code_size = round_up(ctx->code_size, 8);
        int32_t pc_relative_offset = -(rounded_code_size + fde_offset_in_frame);

        /*
         * Patch the placeholder with the final PC-relative offset.
         *
         * memcpy is used instead of a store through a casted int32_t pointer:
         * callers hand in plain char buffers, so the typed store would rely on
         * alignment and aliasing guarantees the buffer does not provide. The
         * bytes written are the same native-endian representation.
         */
        memcpy(ctx->fde_p, &pc_relative_offset, sizeof(pc_relative_offset));
    }
}
1025
1026
// =============================================================================
1027
//                              JITDUMP INITIALIZATION
1028
// =============================================================================
1029
1030
/*
1031
 * Initialize the perf jitdump interface
1032
 *
1033
 * This function sets up everything needed to generate jitdump files:
1034
 * 1. Creates the jitdump file with a unique name
1035
 * 2. Maps the first page to signal perf that we're using the interface
1036
 * 3. Writes the jitdump header
1037
 * 4. Initializes synchronization primitives
1038
 *
1039
 * The memory mapping is crucial - perf detects jitdump files by scanning
1040
 * for processes that have mapped files matching the pattern /tmp/jit-*.dump
1041
 *
1042
 * Returns: Pointer to initialized state, or NULL on failure
1043
 */
1044
0
static void* perf_map_jit_init(void) {
1045
0
    char filename[100];
1046
0
    int pid = getpid();
1047
1048
    /* Create unique filename based on process ID */
1049
0
    snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid);
1050
1051
    /* Create/open the jitdump file with appropriate permissions */
1052
0
    const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666);
1053
0
    if (fd == -1) {
1054
0
        return NULL;  // Failed to create file
1055
0
    }
1056
1057
    /* Get system page size for memory mapping */
1058
0
    const long page_size = sysconf(_SC_PAGESIZE);
1059
0
    if (page_size == -1) {
1060
0
        close(fd);
1061
0
        return NULL;  // Failed to get page size
1062
0
    }
1063
1064
#if defined(__APPLE__)
1065
    // On macOS, samply uses a preload to find jitdumps and this mmap can be slow.
1066
    perf_jit_map_state.mapped_buffer = NULL;
1067
#else
1068
    /*
1069
     * Map the first page of the jitdump file
1070
     *
1071
     * This memory mapping serves as a signal to perf that this process
1072
     * is generating JIT code. Perf scans /proc/.../maps looking for mapped
1073
     * files that match the jitdump naming pattern.
1074
     *
1075
     * The mapping must be PROT_READ | PROT_EXEC to be detected by perf.
1076
     */
1077
0
    perf_jit_map_state.mapped_buffer = mmap(
1078
0
        NULL,                    // Let kernel choose address
1079
0
        page_size,               // Map one page
1080
0
        PROT_READ | PROT_EXEC,   // Read and execute permissions (required by perf)
1081
0
        MAP_PRIVATE,             // Private mapping
1082
0
        fd,                      // File descriptor
1083
0
        0                        // Offset 0 (first page)
1084
0
    );
1085
1086
0
    if (perf_jit_map_state.mapped_buffer == MAP_FAILED) {
1087
0
        perf_jit_map_state.mapped_buffer = NULL;
1088
0
        close(fd);
1089
0
        return NULL;  // Memory mapping failed
1090
0
    }
1091
0
    (void)_PyAnnotateMemoryMap(perf_jit_map_state.mapped_buffer, page_size,
1092
0
                               "cpython:perf_jit_trampoline");
1093
0
#endif
1094
1095
0
    perf_jit_map_state.mapped_size = page_size;
1096
1097
    /* Convert file descriptor to FILE* for easier I/O operations */
1098
0
    perf_jit_map_state.perf_map = fdopen(fd, "w+");
1099
0
    if (perf_jit_map_state.perf_map == NULL) {
1100
0
        close(fd);
1101
0
        return NULL;  // Failed to create FILE*
1102
0
    }
1103
1104
    /*
1105
     * Set up file buffering for better performance
1106
     *
1107
     * We use a large buffer (2MB) because jitdump files can be written
1108
     * frequently during program execution. Buffering reduces system call
1109
     * overhead and improves overall performance.
1110
     */
1111
0
    setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB);
1112
1113
    /* Write the jitdump file header */
1114
0
    perf_map_jit_write_header(pid, perf_jit_map_state.perf_map);
1115
1116
    /*
1117
     * Initialize thread synchronization lock
1118
     *
1119
     * Multiple threads may attempt to write to the jitdump file
1120
     * simultaneously. This lock ensures thread-safe access to the
1121
     * global jitdump state.
1122
     */
1123
0
    perf_jit_map_state.map_lock = PyThread_allocate_lock();
1124
0
    if (perf_jit_map_state.map_lock == NULL) {
1125
0
        fclose(perf_jit_map_state.perf_map);
1126
0
        return NULL;  // Failed to create lock
1127
0
    }
1128
1129
    /* Initialize code ID counter */
1130
0
    perf_jit_map_state.code_id = 0;
1131
1132
    /* Calculate padding size based on actual unwind info requirements */
1133
0
    size_t eh_frame_size = calculate_eh_frame_size();
1134
0
    size_t unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
1135
0
    trampoline_api.code_padding = round_up(unwind_data_size, 16);
1136
0
    trampoline_api.code_alignment = 32;
1137
1138
0
    return &perf_jit_map_state;
1139
0
}
1140
1141
// =============================================================================
1142
//                              MAIN JITDUMP ENTRY WRITING
1143
// =============================================================================
1144
1145
/*
1146
 * Write a complete jitdump entry for a Python function
1147
 *
1148
 * This is the main function called by Python's trampoline system whenever
1149
 * a new piece of JIT-compiled code needs to be recorded. It writes both
1150
 * the unwinding information and the code load event to the jitdump file.
1151
 *
1152
 * The function performs these steps:
1153
 * 1. Initialize jitdump system if not already done
1154
 * 2. Extract function name and filename from Python code object
1155
 * 3. Generate DWARF unwinding information
1156
 * 4. Write unwinding info event to jitdump file
1157
 * 5. Write code load event to jitdump file
1158
 *
1159
 * Args:
1160
 *   state: Jitdump state (currently unused, uses global state)
1161
 *   code_addr: Address where the compiled code resides
1162
 *   code_size: Size of the compiled code in bytes
1163
 *   co: Python code object containing metadata
1164
 *
1165
 * IMPORTANT: This function signature is part of Python's internal API
1166
 * and must not be changed without coordinating with core Python development.
1167
 */
1168
static void perf_map_jit_write_entry(void *state, const void *code_addr,
                                    unsigned int code_size, PyCodeObject *co)
{
    /*
     * Emits two jitdump records for the code region [code_addr, code_addr +
     * code_size): first an unwinding-info record (EH frame data followed by
     * an EhFrameHeader and alignment padding), then a code-load record
     * (header, symbol name, raw machine code). Returns silently if the
     * jitdump state cannot be initialized or the symbol name cannot be built.
     */

    /* Initialize jitdump system on first use */
    if (perf_jit_map_state.perf_map == NULL) {
        void* ret = perf_map_jit_init();
        if(ret == NULL){
            return;  // Initialization failed, silently abort
        }
    }

    /*
     * Extract function information from Python code object
     *
     * We create a human-readable function name by combining the qualified
     * name (includes class/module context) with the filename. This helps
     * developers identify functions in perf reports.
     *
     * PyUnicode_AsUTF8() may return NULL; fall back to the empty string in
     * that case so that NULL is never handed to a "%s" conversion below.
     * NOTE(review): a failed PyUnicode_AsUTF8() presumably leaves an
     * exception set; confirm whether the caller expects it to be cleared.
     */
    const char *entry = "";
    if (co->co_qualname != NULL) {
        const char *qualname_utf8 = PyUnicode_AsUTF8(co->co_qualname);
        if (qualname_utf8 != NULL) {
            entry = qualname_utf8;
        }
    }

    const char *filename = "";
    if (co->co_filename != NULL) {
        const char *filename_utf8 = PyUnicode_AsUTF8(co->co_filename);
        if (filename_utf8 != NULL) {
            filename = filename_utf8;
        }
    }

    /*
     * Create formatted function name for perf display
     *
     * Format: "py::<function_name>:<filename>"
     * The "py::" prefix helps identify Python functions in mixed-language
     * profiles (e.g., when profiling C extensions alongside Python code).
     */
    int formatted_length = snprintf(NULL, 0, "py::%s:%s", entry, filename);
    if (formatted_length < 0) {
        return;  // Formatting failed; nothing has been written yet
    }
    size_t perf_map_entry_size = (size_t)formatted_length + 1;
    char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size);
    if (perf_map_entry == NULL) {
        return;  // Memory allocation failed
    }
    snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename);

    const size_t name_length = strlen(perf_map_entry);
    uword base = (uword)code_addr;
    uword size = code_size;

    /*
     * Generate DWARF unwinding information
     *
     * DWARF data is essential for proper stack unwinding during profiling.
     * Without it, perf cannot generate accurate call graphs, especially
     * in optimized code where frame pointers may be omitted.
     */
    ELFObjectContext ctx;
    char buffer[1024];  // Buffer for DWARF data (1KB should be sufficient)
    ctx.code_size = code_size;
    ctx.startp = ctx.p = (uint8_t*)buffer;
    ctx.fde_p = NULL;  // Initialize to NULL, will be set when FDE is written

    /* Generate EH frame (Exception Handling frame) data */
    elf_init_ehframe(&ctx);
    int eh_frame_size = ctx.p - ctx.startp;

    /*
     * Write Code Unwinding Information Event
     *
     * This event must be written before the code load event to ensure
     * perf has the unwinding information available when it processes
     * the code region.
     */
    CodeUnwindingInfoEvent ev2;
    ev2.base.event = PerfUnwindingInfo;
    ev2.base.time_stamp = get_current_monotonic_ticks();
    ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;

    /* Verify we don't exceed our padding budget */
    assert(ev2.unwind_data_size <= (uint64_t)trampoline_api.code_padding);

    ev2.eh_frame_hdr_size = sizeof(EhFrameHeader);
    ev2.mapped_size = round_up(ev2.unwind_data_size, 16);  // 16-byte alignment

    /* Calculate total event size with padding */
    int content_size = sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size;
    int padding_size = round_up(content_size, 8) - content_size;  // 8-byte align
    ev2.base.size = content_size + padding_size;

    /* Write the unwinding info event header */
    perf_map_jit_write_fully(&ev2, sizeof(ev2));

    /*
     * Write EH Frame Header
     *
     * The EH frame header provides metadata about the DWARF unwinding
     * information that follows. It includes pointers and counts that
     * help perf navigate the unwinding data efficiently.
     */
    EhFrameHeader f;
    f.version = 1;
    f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel;  // PC-relative signed 4-byte
    f.fde_count_enc = DwarfUData4;                  // Unsigned 4-byte count
    f.table_enc = DwarfSData4 | DwarfDataRel;       // Data-relative signed 4-byte

    /* Calculate relative offsets for EH frame navigation */
    f.eh_frame_ptr = -(eh_frame_size + 4 * sizeof(unsigned char));
    f.eh_fde_count = 1;  // We generate exactly one FDE per function
    f.from = -(round_up(code_size, 8) + eh_frame_size);

    int cie_size = ctx.eh_frame_p - ctx.startp;
    f.to = -(eh_frame_size - cie_size);

    /* Write EH frame data and header */
    perf_map_jit_write_fully(ctx.startp, eh_frame_size);
    perf_map_jit_write_fully(&f, sizeof(f));

    /* Write padding to maintain alignment (padding_size is at most 7) */
    char padding_bytes[] = "\0\0\0\0\0\0\0\0";
    perf_map_jit_write_fully(&padding_bytes, padding_size);

    /*
     * Write Code Load Event
     *
     * This event tells perf about the new code region. It includes:
     * - Memory addresses and sizes
     * - Process and thread identification
     * - Function name for symbol resolution
     * - The actual machine code bytes
     */
    CodeLoadEvent ev;
    ev.base.event = PerfLoad;
    ev.base.size = sizeof(ev) + (name_length+1) + size;
    ev.base.time_stamp = get_current_monotonic_ticks();
    ev.process_id = getpid();
#if defined(__APPLE__)
    pthread_threadid_np(NULL, &ev.thread_id);
#else
    ev.thread_id = syscall(SYS_gettid);  // Get thread ID via system call
#endif
    ev.vma = base;                       // Virtual memory address
    ev.code_address = base;              // Same as VMA for our use case
    ev.code_size = size;

    /* Assign unique code ID and increment counter */
    perf_jit_map_state.code_id += 1;
    ev.code_id = perf_jit_map_state.code_id;

    /* Write code load event and associated data */
    perf_map_jit_write_fully(&ev, sizeof(ev));
    perf_map_jit_write_fully(perf_map_entry, name_length+1);  // Include null terminator
    perf_map_jit_write_fully((void*)(base), size);           // Copy actual machine code

    /* Clean up allocated memory */
    PyMem_RawFree(perf_map_entry);
}
1321
1322
// =============================================================================
1323
//                              CLEANUP AND FINALIZATION
1324
// =============================================================================
1325
1326
/*
1327
 * Finalize and cleanup the perf jitdump system
1328
 *
1329
 * This function is called when Python is shutting down or when the
1330
 * perf trampoline system is being disabled. It ensures all resources
1331
 * are properly released and all buffered data is flushed to disk.
1332
 *
1333
 * Args:
1334
 *   state: Jitdump state (currently unused, uses global state)
1335
 *
1336
 * Returns: 0 on success
1337
 *
1338
 * IMPORTANT: This function signature is part of Python's internal API
1339
 * and must not be changed without coordinating with core Python development.
1340
 */
1341
0
static int perf_map_jit_fini(void* state) {
1342
    /*
1343
     * Close jitdump file with proper synchronization
1344
     *
1345
     * We need to acquire the lock to ensure no other threads are
1346
     * writing to the file when we close it. This prevents corruption
1347
     * and ensures all data is properly flushed.
1348
     */
1349
0
    if (perf_jit_map_state.perf_map != NULL) {
1350
0
        PyThread_acquire_lock(perf_jit_map_state.map_lock, 1);
1351
0
        fclose(perf_jit_map_state.perf_map);  // This also flushes buffers
1352
0
        PyThread_release_lock(perf_jit_map_state.map_lock);
1353
1354
        /* Clean up synchronization primitive */
1355
0
        PyThread_free_lock(perf_jit_map_state.map_lock);
1356
0
        perf_jit_map_state.perf_map = NULL;
1357
0
    }
1358
1359
    /*
1360
     * Unmap the memory region
1361
     *
1362
     * This removes the signal to perf that we were generating JIT code.
1363
     * After this point, perf will no longer detect this process as
1364
     * having JIT capabilities.
1365
     */
1366
0
    if (perf_jit_map_state.mapped_buffer != NULL) {
1367
0
        munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size);
1368
0
        perf_jit_map_state.mapped_buffer = NULL;
1369
0
    }
1370
1371
    /* Clear global state reference */
1372
0
    trampoline_api.state = NULL;
1373
1374
0
    return 0;  // Success
1375
0
}
1376
1377
// =============================================================================
1378
//                              PUBLIC API EXPORT
1379
// =============================================================================
1380
1381
/*
1382
 * Python Perf Callbacks Structure
1383
 *
1384
 * This structure defines the callback interface that Python's trampoline
1385
 * system uses to integrate with perf profiling. It contains function
1386
 * pointers for initialization, event writing, and cleanup.
1387
 *
1388
 * CRITICAL: This structure and its contents are part of Python's internal
1389
 * API. The function signatures and behavior must remain stable to maintain
1390
 * compatibility with the Python interpreter's perf integration system.
1391
 *
1392
 * Used by: Python's _PyPerf_Callbacks system in pycore_ceval.h
1393
 */
1394
/* Callback table for the jitdump backend: init, write-entry, finalize (in that order). */
_PyPerf_Callbacks _Py_perfmap_jit_callbacks = {
    perf_map_jit_init,         // Sets up the jitdump file and global state
    perf_map_jit_write_entry,  // Records one code region in the jitdump file
    perf_map_jit_fini,         // Closes the file and releases resources
};
1399
1400
#endif /* PY_HAVE_PERF_TRAMPOLINE */