/src/cpython/Python/perf_jit_trampoline.c
Line | Count | Source |
1 | | /* |
2 | | * Python Perf Trampoline Support - JIT Dump Implementation |
3 | | * |
4 | | * This file implements the perf jitdump API for Python's performance profiling |
5 | | * integration. It allows perf (Linux performance analysis tool) to understand |
6 | | * and profile dynamically generated Python bytecode by creating JIT dump files |
7 | | * that perf can inject into its analysis. |
8 | | * |
9 | | * |
10 | | * IMPORTANT: This file exports specific callback functions that are part of |
11 | | * Python's internal API. Do not modify the function signatures or behavior |
12 | | * of exported functions without coordinating with the Python core team. |
13 | | * |
14 | | * Usually the binary and libraries are mapped in separate region like below: |
15 | | * |
16 | | * address -> |
17 | | * --+---------------------+--//--+---------------------+-- |
18 | | * | .text | .data | ... | | .text | .data | ... | |
19 | | * --+---------------------+--//--+---------------------+-- |
20 | | * myprog libc.so |
21 | | * |
22 | | * So it'd be easy and straight-forward to find a mapped binary or library from an |
23 | | * address. |
24 | | * |
25 | | * For JIT code, however, the code arena only cares about the code section, while the |
26 | | * resulting DSOs (which are generated by perf inject -j) contain ELF headers and |
27 | | * unwind info too. perf would then generate the following address space with synthesized |
28 | | * MMAP events. Let's say it has a sample between address B and C. |
29 | | * |
30 | | * sample |
31 | | * | |
32 | | * address -> A B v C |
33 | | * --------------------------------------------------------------------------------------------------- |
34 | | * /tmp/jitted-PID-0.so | (headers) | .text | unwind info | |
35 | | * /tmp/jitted-PID-1.so | (headers) | .text | unwind info | |
36 | | * /tmp/jitted-PID-2.so | (headers) | .text | unwind info | |
37 | | * ... |
38 | | * --------------------------------------------------------------------------------------------------- |
39 | | * |
40 | | * If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see |
41 | | * the unwind info. If it maps both .text section and unwind sections, the sample |
42 | | * could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing |
43 | | * which one is right. So to make perf happy we have non-overlapping ranges for each |
44 | | * DSO: |
45 | | * |
46 | | * address -> |
47 | | * ------------------------------------------------------------------------------------------------------- |
48 | | * /tmp/jitted-PID-0.so | (headers) | .text | unwind info | |
49 | | * /tmp/jitted-PID-1.so | (headers) | .text | unwind info | |
50 | | * /tmp/jitted-PID-2.so | (headers) | .text | unwind info | |
51 | | * ... |
52 | | * ------------------------------------------------------------------------------------------------------- |
53 | | * |
54 | | * As the trampolines are constant, we add a constant padding. In general, the padding needs to have the |
55 | | * size of the unwind info rounded up to 16 bytes; for our trampolines this is 0x50. |
56 | | */ |
57 | | |
58 | | |
59 | | |
60 | | #include "Python.h" |
61 | | #include "pycore_ceval.h" // _PyPerf_Callbacks |
62 | | #include "pycore_frame.h" |
63 | | #include "pycore_interp.h" |
64 | | #include "pycore_mmap.h" // _PyAnnotateMemoryMap() |
65 | | #include "pycore_jit_unwind.h" |
66 | | #include "pycore_runtime.h" // _PyRuntime |
67 | | |
68 | | #ifdef PY_HAVE_PERF_TRAMPOLINE |
69 | | |
70 | | /* Standard library includes for perf jitdump implementation */ |
71 | | #if defined(__linux__) |
72 | | # include <elf.h> // ELF architecture constants |
73 | | #endif |
74 | | #include <fcntl.h> // File control operations |
75 | | #include <stdio.h> // Standard I/O operations |
76 | | #include <stdlib.h> // Standard library functions |
77 | | #include <string.h> // memcpy, strlen |
78 | | #include <sys/mman.h> // Memory mapping functions (mmap) |
79 | | #include <sys/types.h> // System data types |
80 | | #include <unistd.h> // System calls (sysconf, getpid) |
81 | | #include <sys/time.h> // Time functions (gettimeofday) |
82 | | #if defined(__linux__) |
83 | | # include <sys/syscall.h> // System call interface |
84 | | #endif |
85 | | #if defined(__APPLE__) |
86 | | # include <mach/mach_time.h> // mach_absolute_time, mach_timebase_info |
87 | | #endif |
88 | | |
89 | | // ============================================================================= |
90 | | // CONSTANTS AND CONFIGURATION |
91 | | // ============================================================================= |
92 | | |
93 | | /* |
94 | | * Memory layout considerations for perf jitdump: |
95 | | * |
96 | | * Perf expects non-overlapping memory regions for each JIT-compiled function. |
97 | | * When perf processes the jitdump file, it creates synthetic DSO (Dynamic |
98 | | * Shared Object) files that contain: |
99 | | * - ELF headers |
100 | | * - .text section (actual machine code) |
101 | | * - Unwind information (for stack traces) |
102 | | * |
103 | | * To ensure proper address space layout, we add padding between code regions. |
104 | | * This prevents address conflicts when perf maps the synthesized DSOs. |
105 | | * |
106 | | * Memory layout example: |
107 | | * /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding] |
108 | | * /tmp/jitted-PID-1.so: [headers][.text][unwind_info][padding] |
109 | | * |
110 | | * The padding size is now calculated automatically during initialization |
111 | | * based on the actual unwind information requirements. |
112 | | */ |
113 | | |
114 | | |
/*
 * ELF machine identifiers for the supported targets.
 *
 * <elf.h> provides them on Linux, but that header cannot be used elsewhere,
 * so the identifier for the architecture being compiled is defined by hand.
 */
#if !defined(__linux__)
#  if defined(__x86_64__) || defined(_M_X64)
#    define EM_X86_64 62
#  elif defined(__i386__) || defined(_M_IX86)
#    define EM_386 3
#  elif defined(__aarch64__)
#    define EM_AARCH64 183
#  elif defined(__arm__) || defined(_M_ARM)
#    define EM_ARM 40
#  elif defined(__riscv)
#    define EM_RISCV 243
#  endif
#endif
129 | | |
/* Shorthand for the runtime-wide perf trampoline API state */
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api

/* Local type aliases */
typedef uint64_t uword;            /* 64-bit unsigned word (code addresses and sizes) */
typedef const char* CodeComments;  /* pointer to read-only comment text */

/* Number of bytes in one megabyte; used when sizing buffers */
#define MB (1024 * 1024)
139 | | |
140 | | // ============================================================================= |
141 | | // ARCHITECTURE-SPECIFIC DEFINITIONS |
142 | | // ============================================================================= |
143 | | |
/*
 * Report the ELF machine identifier of the architecture this file was
 * compiled for.
 *
 * The value is stored in the jitdump header (elf_mach_target). The choice is
 * made entirely at compile time; a target that matches none of the known
 * architectures is treated as unreachable.
 */
static uint64_t GetElfMachineArchitecture(void) {
    uint64_t machine;
#if defined(__x86_64__) || defined(_M_X64)
    machine = EM_X86_64;
#elif defined(__i386__) || defined(_M_IX86)
    machine = EM_386;
#elif defined(__aarch64__)
    machine = EM_AARCH64;
#elif defined(__arm__) || defined(_M_ARM)
    machine = EM_ARM;
#elif defined(__riscv)
    machine = EM_RISCV;
#else
    Py_UNREACHABLE();  // No identifier is known for this architecture
    machine = 0;
#endif
    return machine;
}
166 | | |
167 | | // ============================================================================= |
168 | | // PERF JITDUMP DATA STRUCTURES |
169 | | // ============================================================================= |
170 | | |
171 | | /* |
172 | | * Perf jitdump file format structures |
173 | | * |
174 | | * These structures define the binary format that perf expects for JIT dump files. |
175 | | * The format is documented in the Linux perf tools source code and must match |
176 | | * exactly for proper perf integration. |
177 | | */ |
178 | | |
/*
 * Jitdump file header.
 *
 * Emitted once, as the very first record of the jitdump file, by
 * perf_map_jit_write_header(). It carries the process metadata and the
 * format version of the dump.
 */
typedef struct {
    /* Magic number, 0x4A695444 ("JiTD") */
    uint32_t magic;
    /* Jitdump format version (1 is written) */
    uint32_t version;
    /* sizeof(Header) */
    uint32_t size;
    /* ELF machine identifier, see GetElfMachineArchitecture() */
    uint32_t elf_mach_target;
    /* Reserved, written as 0 */
    uint32_t reserved;
    /* PID of the process producing the dump */
    uint32_t process_id;
    /* Creation time of the dump */
    uint64_t time_stamp;
    /* Feature flags, written as 0 */
    uint64_t flags;
} Header;
193 | | |
/*
 * Record types of the jitdump format.
 *
 * The numeric value is stored in BaseEvent.event and selects the layout of
 * the record that follows.
 */
enum PerfEvent {
    /* A new piece of JIT code was loaded */
    PerfLoad = 0,
    /* Existing JIT code was relocated */
    PerfMove = 1,
    /* Debug information record */
    PerfDebugInfo = 2,
    /* The JIT session was closed */
    PerfClose = 3,
    /* Stack unwinding information record */
    PerfUnwindingInfo = 4
};
205 | | |
/*
 * Prefix shared by every jitdump record.
 *
 * Each event structure in this file embeds it as its first member.
 */
struct BaseEvent {
    /* Record type, one of enum PerfEvent */
    uint32_t event;
    /* Size of the whole record in bytes, payload included */
    uint32_t size;
    /* Time at which the event happened */
    uint64_t time_stamp;
};
215 | | |
216 | | /* |
217 | | * Code load event - indicates a new JIT-compiled function is available |
218 | | * This is the most important event type for Python profiling |
219 | | */ |
220 | | typedef struct { |
221 | | struct BaseEvent base; // Common event header |
222 | | uint32_t process_id; // Process ID where code was generated |
223 | | uint32_t thread_id; // Thread ID where code was generated |
224 | | uint64_t vma; // Virtual memory address where code is loaded |
225 | | uint64_t code_address; // Address of the actual machine code |
226 | | uint64_t code_size; // Size of the machine code in bytes |
227 | | uint64_t code_id; // Unique identifier for this code region |
228 | | /* Followed by: |
229 | | * - null-terminated function name string |
230 | | * - raw machine code bytes |
231 | | */ |
232 | | } CodeLoadEvent; |
233 | | |
234 | | /* |
235 | | * Code unwinding information event - provides DWARF data for stack traces |
236 | | * Essential for proper stack unwinding during profiling |
237 | | */ |
238 | | typedef struct { |
239 | | struct BaseEvent base; // Common event header |
240 | | uint64_t unwind_data_size; // Size of the unwinding data |
241 | | uint64_t eh_frame_hdr_size; // Size of the EH frame header |
242 | | uint64_t mapped_size; // Total mapped size (with padding) |
243 | | /* Followed by: |
244 | | * - EH frame header |
245 | | * - DWARF unwinding information |
246 | | * - Padding to alignment boundary |
247 | | */ |
248 | | } CodeUnwindingInfoEvent; |
249 | | |
/*
 * Header describing the .eh_frame data of one code region.
 *
 * The offsets are stored with PC-relative and data-relative encodings so
 * that the DSO synthesized by perf stays self-contained. The structure is
 * packed: exactly 20 bytes are written to the jitdump file.
 */
typedef struct __attribute__((packed)) {
    /* Header format version (1 is written) */
    uint8_t version;
    /* Encoding used for eh_frame_ptr */
    uint8_t eh_frame_ptr_enc;
    /* Encoding used for eh_fde_count */
    uint8_t fde_count_enc;
    /* Encoding used for the from/to table entry */
    uint8_t table_enc;
    /* Encoded offset of the .eh_frame data */
    int32_t eh_frame_ptr;
    /* Number of FDE table entries (one per code region) */
    uint32_t eh_fde_count;
    /* Table entry, first half: encoded start of the code region */
    int32_t from;
    /* Table entry, second half: encoded location of the FDE */
    int32_t to;
} EhFrameHeader;
_Static_assert(sizeof(EhFrameHeader) == 20, "EhFrameHeader layout mismatch");
268 | | |
269 | | // ============================================================================= |
270 | | // GLOBAL STATE MANAGEMENT |
271 | | // ============================================================================= |
272 | | |
273 | | /* |
274 | | * Global state for the perf jitdump implementation |
275 | | * |
276 | | * This structure maintains all the state needed for generating jitdump files. |
277 | | * It's designed as a singleton since there's typically only one jitdump file |
278 | | * per Python process. |
279 | | */ |
280 | | typedef struct { |
281 | | FILE* perf_map; // File handle for the jitdump file |
282 | | PyMutex map_lock; // Thread synchronization lock |
283 | | void* mapped_buffer; // Memory-mapped region (signals perf we're active) |
284 | | size_t mapped_size; // Size of the mapped region |
285 | | uint32_t code_id; // Counter for unique code region identifiers |
286 | | uint64_t build_id_salt; // Per-process salt for unique synthetic DSOs |
287 | | } PerfMapJitState; |
288 | | |
289 | | /* Global singleton instance */ |
290 | | static PerfMapJitState perf_jit_map_state; |
291 | | |
292 | | // ============================================================================= |
293 | | // TIME UTILITIES |
294 | | // ============================================================================= |
295 | | |
296 | | /* Time conversion constant */ |
297 | | #if !defined(__APPLE__) |
298 | | static const intptr_t nanoseconds_per_second = 1000000000; |
299 | | #endif |
300 | | |
301 | | /* |
302 | | * Get current monotonic time in nanoseconds |
303 | | * |
304 | | * Monotonic time is preferred for event timestamps because it's not affected |
305 | | * by system clock adjustments. This ensures consistent timing relationships |
306 | | * between events even if the system clock is changed. |
307 | | * |
308 | | * Returns: Current monotonic time in nanoseconds since an arbitrary epoch |
309 | | */ |
310 | 0 | static int64_t get_current_monotonic_ticks(void) { |
311 | | #if defined(__APPLE__) |
312 | | // On macOS the jitdump file is consumed by profilers (such as samply) that |
313 | | // timestamp their samples using mach_absolute_time(). The jitdump event |
314 | | // timestamps must use the same clock domain, otherwise the JIT code |
315 | | // mappings cannot be lined up with the samples. |
316 | | static mach_timebase_info_data_t timebase = {0, 0}; |
317 | | if (timebase.denom == 0) { |
318 | | (void)mach_timebase_info(&timebase); |
319 | | } |
320 | | uint64_t ticks = mach_absolute_time(); |
321 | | return (int64_t)(ticks * timebase.numer / timebase.denom); |
322 | | #else |
323 | 0 | struct timespec ts; |
324 | 0 | if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) { |
325 | 0 | Py_UNREACHABLE(); // Should never fail on supported systems |
326 | 0 | return 0; |
327 | 0 | } |
328 | | |
329 | | /* Convert to nanoseconds for maximum precision */ |
330 | 0 | int64_t result = ts.tv_sec; |
331 | 0 | result *= nanoseconds_per_second; |
332 | 0 | result += ts.tv_nsec; |
333 | 0 | return result; |
334 | 0 | #endif |
335 | 0 | } |
336 | | |
/*
 * Read the wall clock, expressed in microseconds.
 *
 * The result is stored in the jitdump file header. Being wall-clock time,
 * it can be related to other events on the system, which a monotonic
 * reading cannot.
 *
 * Returns: microseconds elapsed since the Unix epoch
 */
static int64_t get_current_time_microseconds(void) {
    struct timeval now;
    if (gettimeofday(&now, NULL) < 0) {
        Py_UNREACHABLE();  // Not expected to fail on supported systems
        return 0;
    }
    const int64_t whole_seconds_us = (int64_t)(now.tv_sec) * 1000000;
    return whole_seconds_us + now.tv_usec;
}
354 | | |
355 | | // ============================================================================= |
356 | | // FILE I/O UTILITIES |
357 | | // ============================================================================= |
358 | | |
359 | | /* |
360 | | * Write data to the jitdump file with error handling |
361 | | * |
362 | | * This function ensures that all data is written to the file, handling |
363 | | * partial writes that can occur with large buffers or when the system |
364 | | * is under load. |
365 | | * |
366 | | * Args: |
367 | | * buffer: Pointer to data to write |
368 | | * size: Number of bytes to write |
369 | | */ |
370 | 0 | static void perf_map_jit_write_fully(const void* buffer, size_t size) { |
371 | 0 | FILE* out_file = perf_jit_map_state.perf_map; |
372 | 0 | const char* ptr = (const char*)(buffer); |
373 | |
|
374 | 0 | while (size > 0) { |
375 | 0 | const size_t written = fwrite(ptr, 1, size, out_file); |
376 | 0 | if (written == 0) { |
377 | 0 | Py_UNREACHABLE(); // Write failure - should be very rare |
378 | 0 | break; |
379 | 0 | } |
380 | 0 | size -= written; |
381 | 0 | ptr += written; |
382 | 0 | } |
383 | 0 | } |
384 | | |
385 | | /* |
386 | | * Write the jitdump file header |
387 | | * |
388 | | * The header must be written exactly once at the beginning of each jitdump |
389 | | * file. It provides metadata that perf uses to parse the rest of the file. |
390 | | * |
391 | | * Args: |
392 | | * pid: Process ID to include in the header |
393 | | * out_file: File handle to write to (currently unused, uses global state) |
394 | | */ |
395 | 0 | static void perf_map_jit_write_header(int pid, FILE* out_file) { |
396 | 0 | Header header; |
397 | | |
398 | | /* Initialize header with required values */ |
399 | 0 | header.magic = 0x4A695444; // "JiTD" magic number |
400 | 0 | header.version = 1; // Current jitdump version |
401 | 0 | header.size = sizeof(Header); // Header size for validation |
402 | 0 | header.elf_mach_target = GetElfMachineArchitecture(); // Target architecture |
403 | 0 | header.reserved = 0; // padding reserved for future use |
404 | 0 | header.process_id = pid; // Process identifier |
405 | 0 | header.time_stamp = get_current_time_microseconds(); // Creation time |
406 | 0 | header.flags = 0; // No special flags currently used |
407 | |
|
408 | 0 | perf_map_jit_write_fully(&header, sizeof(header)); |
409 | 0 | } |
410 | | |
411 | | // ============================================================================= |
412 | | // JITDUMP INITIALIZATION |
413 | | // ============================================================================= |
414 | | |
415 | | /* |
416 | | * Initialize the perf jitdump interface |
417 | | * |
418 | | * This function sets up everything needed to generate jitdump files: |
419 | | * 1. Creates the jitdump file with a unique name |
420 | | * 2. Maps the first page to signal perf that we're using the interface |
421 | | * 3. Writes the jitdump header |
422 | | * 4. Initializes synchronization primitives |
423 | | * |
424 | | * The memory mapping is crucial - perf detects jitdump files by scanning |
425 | | * for processes that have mapped files matching the pattern /tmp/jit-*.dump |
426 | | * |
427 | | * Returns: Pointer to initialized state, or NULL on failure |
428 | | */ |
429 | 0 | static void* perf_map_jit_init(void) { |
430 | 0 | PyMutex_Lock(&perf_jit_map_state.map_lock); |
431 | 0 | if (perf_jit_map_state.perf_map != NULL) { |
432 | 0 | PyMutex_Unlock(&perf_jit_map_state.map_lock); |
433 | 0 | return &perf_jit_map_state; |
434 | 0 | } |
435 | | |
436 | 0 | char filename[100]; |
437 | 0 | int pid = getpid(); |
438 | | |
439 | | /* Create unique filename based on process ID */ |
440 | 0 | snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid); |
441 | | |
442 | | /* Create/open the jitdump file with appropriate permissions */ |
443 | 0 | const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666); |
444 | 0 | if (fd == -1) { |
445 | 0 | PyMutex_Unlock(&perf_jit_map_state.map_lock); |
446 | 0 | return NULL; // Failed to create file |
447 | 0 | } |
448 | | |
449 | | /* Get system page size for memory mapping */ |
450 | 0 | const long page_size = sysconf(_SC_PAGESIZE); |
451 | 0 | if (page_size == -1) { |
452 | 0 | close(fd); |
453 | 0 | PyMutex_Unlock(&perf_jit_map_state.map_lock); |
454 | 0 | return NULL; // Failed to get page size |
455 | 0 | } |
456 | | |
457 | | #if defined(__APPLE__) |
458 | | // On macOS, samply uses a preload to find jitdumps and this mmap can be slow. |
459 | | perf_jit_map_state.mapped_buffer = NULL; |
460 | | #else |
461 | | /* |
462 | | * Map the first page of the jitdump file |
463 | | * |
464 | | * This memory mapping serves as a signal to perf that this process |
465 | | * is generating JIT code. Perf scans /proc/.../maps looking for mapped |
466 | | * files that match the jitdump naming pattern. |
467 | | * |
468 | | * The mapping must be PROT_READ | PROT_EXEC to be detected by perf. |
469 | | */ |
470 | 0 | perf_jit_map_state.mapped_buffer = mmap( |
471 | 0 | NULL, // Let kernel choose address |
472 | 0 | page_size, // Map one page |
473 | 0 | PROT_READ | PROT_EXEC, // Read and execute permissions (required by perf) |
474 | 0 | MAP_PRIVATE, // Private mapping |
475 | 0 | fd, // File descriptor |
476 | 0 | 0 // Offset 0 (first page) |
477 | 0 | ); |
478 | |
|
479 | 0 | if (perf_jit_map_state.mapped_buffer == MAP_FAILED) { |
480 | 0 | perf_jit_map_state.mapped_buffer = NULL; |
481 | 0 | close(fd); |
482 | 0 | PyMutex_Unlock(&perf_jit_map_state.map_lock); |
483 | 0 | return NULL; // Memory mapping failed |
484 | 0 | } |
485 | 0 | (void)_PyAnnotateMemoryMap(perf_jit_map_state.mapped_buffer, page_size, |
486 | 0 | "cpython:perf_jit_trampoline"); |
487 | 0 | #endif |
488 | |
|
489 | 0 | perf_jit_map_state.mapped_size = page_size; |
490 | | |
491 | | /* Convert file descriptor to FILE* for easier I/O operations */ |
492 | 0 | perf_jit_map_state.perf_map = fdopen(fd, "w+"); |
493 | 0 | if (perf_jit_map_state.perf_map == NULL) { |
494 | 0 | close(fd); |
495 | 0 | PyMutex_Unlock(&perf_jit_map_state.map_lock); |
496 | 0 | return NULL; // Failed to create FILE* |
497 | 0 | } |
498 | | |
499 | | /* |
500 | | * Set up file buffering for better performance |
501 | | * |
502 | | * We use a large buffer (2MB) because jitdump files can be written |
503 | | * frequently during program execution. Buffering reduces system call |
504 | | * overhead and improves overall performance. |
505 | | */ |
506 | 0 | setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB); |
507 | | |
508 | | /* Write the jitdump file header */ |
509 | 0 | perf_map_jit_write_header(pid, perf_jit_map_state.perf_map); |
510 | | |
511 | | /* Initialize code ID counter */ |
512 | 0 | perf_jit_map_state.code_id = 0; |
513 | 0 | perf_jit_map_state.build_id_salt = |
514 | 0 | ((uint64_t)pid << 32) ^ (uint64_t)get_current_monotonic_ticks(); |
515 | | |
516 | | /* Calculate padding size based on actual unwind info requirements */ |
517 | 0 | size_t eh_frame_size = _PyJitUnwind_EhFrameSize(0); |
518 | 0 | size_t unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; |
519 | 0 | trampoline_api.code_padding = _Py_SIZE_ROUND_UP(unwind_data_size, 16); |
520 | 0 | trampoline_api.code_alignment = 32; |
521 | |
|
522 | 0 | PyMutex_Unlock(&perf_jit_map_state.map_lock); |
523 | 0 | return &perf_jit_map_state; |
524 | 0 | } |
525 | | |
526 | | // ============================================================================= |
527 | | // MAIN JITDUMP ENTRY WRITING |
528 | | // ============================================================================= |
529 | | |
530 | | /* |
531 | | * Write a complete jitdump entry for a code region with a provided name. |
532 | | * |
533 | | * This shares the same implementation as the trampoline callback, but |
534 | | * allows callers that don't have a PyCodeObject to reuse the jitdump |
535 | | * infrastructure. |
536 | | */ |
537 | | static void perf_map_jit_write_entry_with_name( |
538 | | void *state, |
539 | | const void *code_addr, |
540 | | size_t code_size, |
541 | | const char *entry, |
542 | | const char *filename |
543 | | ) |
544 | 0 | { |
545 | | /* Initialize jitdump system on first use */ |
546 | 0 | void* ret = perf_map_jit_init(); |
547 | 0 | if (ret == NULL) { |
548 | 0 | return; // Initialization failed, silently abort |
549 | 0 | } |
550 | | |
551 | 0 | if (entry == NULL) { |
552 | 0 | entry = ""; |
553 | 0 | } |
554 | 0 | if (filename == NULL) { |
555 | 0 | filename = ""; |
556 | 0 | } |
557 | | |
558 | | /* |
559 | | * Create formatted function name for perf display |
560 | | * |
561 | | * Format: "py::<function_name>:<filename>" |
562 | | * The "py::" prefix helps identify Python functions in mixed-language |
563 | | * profiles (e.g., when profiling C extensions alongside Python code). |
564 | | */ |
565 | 0 | size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1; |
566 | 0 | char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size); |
567 | 0 | if (perf_map_entry == NULL) { |
568 | 0 | return; // Memory allocation failed |
569 | 0 | } |
570 | 0 | snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename); |
571 | |
|
572 | 0 | const size_t name_length = strlen(perf_map_entry); |
573 | 0 | uword base = (uword)code_addr; |
574 | 0 | uword size = code_size; |
575 | | |
576 | | /* |
577 | | * Generate DWARF unwinding information |
578 | | * |
579 | | * DWARF data is essential for proper stack unwinding during profiling. |
580 | | * Without it, perf cannot generate accurate call graphs, especially |
581 | | * in optimized code where frame pointers may be omitted. |
582 | | */ |
583 | 0 | uint8_t buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) |
584 | 0 | size_t eh_frame_size = _PyJitUnwind_BuildEhFrame( |
585 | 0 | buffer, sizeof(buffer), code_addr, code_size, 0); |
586 | 0 | if (eh_frame_size == 0) { |
587 | 0 | PyMem_RawFree(perf_map_entry); |
588 | 0 | return; |
589 | 0 | } |
590 | | |
591 | | /* |
592 | | * A logical jitdump entry is written as multiple records and also consumes |
593 | | * a process-global code_id. Serialize the whole sequence so concurrent JIT |
594 | | * compilation cannot interleave records or reuse an ID. |
595 | | */ |
596 | 0 | PyMutex_Lock(&perf_jit_map_state.map_lock); |
597 | | |
598 | | /* |
599 | | * Write Code Unwinding Information Event |
600 | | * |
601 | | * This event must be written before the code load event to ensure |
602 | | * perf has the unwinding information available when it processes |
603 | | * the code region. |
604 | | */ |
605 | 0 | CodeUnwindingInfoEvent ev2; |
606 | 0 | ev2.base.event = PerfUnwindingInfo; |
607 | 0 | ev2.base.time_stamp = get_current_monotonic_ticks(); |
608 | 0 | ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; |
609 | | |
610 | | /* Verify we don't exceed our padding budget */ |
611 | 0 | assert(ev2.unwind_data_size <= (uint64_t)trampoline_api.code_padding); |
612 | |
|
613 | 0 | ev2.eh_frame_hdr_size = sizeof(EhFrameHeader); |
614 | 0 | ev2.mapped_size = _Py_SIZE_ROUND_UP(ev2.unwind_data_size, 16); // 16-byte alignment |
615 | | |
616 | | /* Calculate total event size with padding */ |
617 | 0 | int content_size = (int)(sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size); |
618 | 0 | int padding_size = (int)_Py_SIZE_ROUND_UP((size_t)content_size, 8) - content_size; // 8-byte align |
619 | 0 | ev2.base.size = (uint32_t)(content_size + padding_size); |
620 | | |
621 | | /* Write the unwinding info event header */ |
622 | 0 | perf_map_jit_write_fully(&ev2, sizeof(ev2)); |
623 | | |
624 | | /* |
625 | | * Write EH Frame Header |
626 | | * |
627 | | * The EH frame header provides metadata about the DWARF unwinding |
628 | | * information that follows. It includes pointers and counts that |
629 | | * help perf navigate the unwinding data efficiently. |
630 | | */ |
631 | 0 | EhFrameHeader f; |
632 | 0 | f.version = 1; |
633 | 0 | f.eh_frame_ptr_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_pcrel; |
634 | 0 | f.fde_count_enc = DWRF_EH_PE_udata4; |
635 | 0 | f.table_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_datarel; |
636 | | |
637 | | /* Calculate relative offsets for EH frame navigation */ |
638 | 0 | f.eh_frame_ptr = -(int32_t)(eh_frame_size + 4 * sizeof(unsigned char)); |
639 | 0 | f.eh_fde_count = 1; // We generate exactly one FDE per function |
640 | 0 | f.from = -(int32_t)(_Py_SIZE_ROUND_UP(code_size, 8) + eh_frame_size); |
641 | 0 | uint32_t cie_payload_size; |
642 | 0 | memcpy(&cie_payload_size, buffer, sizeof(cie_payload_size)); |
643 | 0 | int cie_size = (int)(sizeof(cie_payload_size) + cie_payload_size); |
644 | 0 | f.to = -(int32_t)(eh_frame_size - cie_size); |
645 | | |
646 | | /* Write EH frame data and header */ |
647 | 0 | perf_map_jit_write_fully(buffer, eh_frame_size); |
648 | 0 | perf_map_jit_write_fully(&f, sizeof(f)); |
649 | | |
650 | | /* Write padding to maintain alignment */ |
651 | 0 | char padding_bytes[] = "\0\0\0\0\0\0\0\0"; |
652 | 0 | perf_map_jit_write_fully(&padding_bytes, padding_size); |
653 | | |
654 | | /* |
655 | | * Write Code Load Event |
656 | | * |
657 | | * This event tells perf about the new code region. It includes: |
658 | | * - Memory addresses and sizes |
659 | | * - Process and thread identification |
660 | | * - Function name for symbol resolution |
661 | | * - The actual machine code bytes |
662 | | */ |
663 | 0 | CodeLoadEvent ev; |
664 | 0 | ev.base.event = PerfLoad; |
665 | 0 | ev.base.size = sizeof(ev) + (name_length+1) + size; |
666 | 0 | ev.base.time_stamp = get_current_monotonic_ticks(); |
667 | 0 | ev.process_id = getpid(); |
668 | | #if defined(__APPLE__) |
669 | | // The jitdump format defines the thread id field as a 32-bit value, but |
670 | | // pthread_threadid_np() returns a 64-bit id. Truncate it to 32 bits to |
671 | | // keep the record layout identical to other platforms. |
672 | | uint64_t thread_id = 0; |
673 | | pthread_threadid_np(NULL, &thread_id); |
674 | | ev.thread_id = (uint32_t)thread_id; |
675 | | #else |
676 | 0 | ev.thread_id = syscall(SYS_gettid); // Get thread ID via system call |
677 | 0 | #endif |
678 | 0 | ev.vma = base; // Virtual memory address |
679 | 0 | ev.code_address = base; // Same as VMA for our use case |
680 | 0 | ev.code_size = size; |
681 | | |
682 | | /* Assign unique code ID and increment counter */ |
683 | 0 | perf_jit_map_state.code_id += 1; |
684 | 0 | ev.code_id = perf_jit_map_state.code_id; |
685 | | |
686 | | /* Write code load event and associated data */ |
687 | 0 | perf_map_jit_write_fully(&ev, sizeof(ev)); |
688 | 0 | perf_map_jit_write_fully(perf_map_entry, name_length+1); // Include null terminator |
689 | | /* |
690 | | * Ensure each synthetic DSO has unique .text bytes. |
691 | | * |
692 | | * perf merges DSOs that share a build-id. Since trampolines can share |
693 | | * identical code and unwind bytes, perf may resolve all JIT frames to |
694 | | * the first symbol it saw (including entries from previous runs when |
695 | | * build-id caching is enabled). Patch a small marker in the emitted |
696 | | * bytes to make the build-id depend on a per-process salt and code id |
697 | | * without modifying the live code. |
698 | | */ |
699 | 0 | uint64_t marker = perf_jit_map_state.build_id_salt ^ |
700 | 0 | ((uint64_t)perf_jit_map_state.code_id << 32) ^ |
701 | 0 | (uint64_t)code_size; |
702 | 0 | if (size >= sizeof(marker)) { |
703 | 0 | size_t prefix = size - sizeof(marker); |
704 | 0 | perf_map_jit_write_fully((void *)(base), prefix); |
705 | 0 | perf_map_jit_write_fully(&marker, sizeof(marker)); |
706 | 0 | } |
707 | 0 | else if (size > 0) { |
708 | 0 | uint8_t tmp[sizeof(marker)]; |
709 | 0 | memcpy(tmp, (void *)(base), size); |
710 | 0 | for (size_t i = 0; i < size; i++) { |
711 | 0 | tmp[i] ^= (uint8_t)(marker >> (i * 8)); |
712 | 0 | } |
713 | 0 | perf_map_jit_write_fully(tmp, size); |
714 | 0 | } |
715 | | |
716 | | /* Clean up allocated memory */ |
717 | 0 | PyMutex_Unlock(&perf_jit_map_state.map_lock); |
718 | 0 | PyMem_RawFree(perf_map_entry); |
719 | 0 | } |
720 | | |
721 | | /* |
722 | | * Write a complete jitdump entry for a Python function |
723 | | * |
724 | | * This is the main function called by Python's trampoline system whenever |
725 | | * a new piece of JIT-compiled code needs to be recorded. It writes both |
726 | | * the unwinding information and the code load event to the jitdump file. |
727 | | * |
728 | | * The function performs these steps: |
729 | | * 1. Initialize jitdump system if not already done |
730 | | * 2. Extract function name and filename from Python code object |
731 | | * 3. Generate DWARF unwinding information |
732 | | * 4. Write unwinding info event to jitdump file |
733 | | * 5. Write code load event to jitdump file |
734 | | * |
735 | | * Args: |
736 | | * state: Jitdump state (currently unused, uses global state) |
737 | | * code_addr: Address where the compiled code resides |
738 | | * code_size: Size of the compiled code in bytes |
739 | | * co: Python code object containing metadata |
740 | | * |
741 | | * IMPORTANT: This function signature is part of Python's internal API |
742 | | * and must not be changed without coordinating with core Python development. |
743 | | */ |
744 | | static void perf_map_jit_write_entry(void *state, const void *code_addr, |
745 | | size_t code_size, PyCodeObject *co) |
746 | 0 | { |
747 | 0 | const char *entry = ""; |
748 | 0 | const char *filename = ""; |
749 | 0 | if (co != NULL) { |
750 | 0 | if (co->co_qualname != NULL) { |
751 | 0 | entry = PyUnicode_AsUTF8(co->co_qualname); |
752 | 0 | } |
753 | 0 | if (co->co_filename != NULL) { |
754 | 0 | filename = PyUnicode_AsUTF8(co->co_filename); |
755 | 0 | } |
756 | 0 | } |
757 | 0 | perf_map_jit_write_entry_with_name(state, code_addr, code_size, |
758 | 0 | entry, filename); |
759 | 0 | } |
760 | | |
761 | | void |
762 | | _PyPerfJit_WriteNamedCode(const void *code_addr, size_t code_size, |
763 | | const char *entry, const char *filename) |
764 | 0 | { |
765 | 0 | perf_map_jit_write_entry_with_name( |
766 | 0 | NULL, code_addr, code_size, entry, filename); |
767 | 0 | } |
768 | | |
769 | | // ============================================================================= |
770 | | // CLEANUP AND FINALIZATION |
771 | | // ============================================================================= |
772 | | |
773 | | /* |
774 | | * Finalize and cleanup the perf jitdump system |
775 | | * |
776 | | * This function is called when Python is shutting down or when the |
777 | | * perf trampoline system is being disabled. It ensures all resources |
778 | | * are properly released and all buffered data is flushed to disk. |
779 | | * |
780 | | * Args: |
781 | | * state: Jitdump state (currently unused, uses global state) |
782 | | * |
783 | | * Returns: 0 on success |
784 | | * |
785 | | * IMPORTANT: This function signature is part of Python's internal API |
786 | | * and must not be changed without coordinating with core Python development. |
787 | | */ |
788 | 0 | static int perf_map_jit_fini(void* state) { |
789 | | /* |
790 | | * Close jitdump file with proper synchronization |
791 | | * |
792 | | * We need to acquire the lock to ensure no other threads are |
793 | | * writing to the file when we close it. This prevents corruption |
794 | | * and ensures all data is properly flushed. |
795 | | */ |
796 | 0 | PyMutex_Lock(&perf_jit_map_state.map_lock); |
797 | 0 | if (perf_jit_map_state.perf_map != NULL) { |
798 | 0 | fclose(perf_jit_map_state.perf_map); // This also flushes buffers |
799 | 0 | perf_jit_map_state.perf_map = NULL; |
800 | 0 | } |
801 | 0 | PyMutex_Unlock(&perf_jit_map_state.map_lock); |
802 | | |
803 | | /* |
804 | | * Unmap the memory region |
805 | | * |
806 | | * This removes the signal to perf that we were generating JIT code. |
807 | | * After this point, perf will no longer detect this process as |
808 | | * having JIT capabilities. |
809 | | */ |
810 | 0 | if (perf_jit_map_state.mapped_buffer != NULL) { |
811 | 0 | munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size); |
812 | 0 | perf_jit_map_state.mapped_buffer = NULL; |
813 | 0 | } |
814 | | |
815 | | /* Clear global state reference */ |
816 | 0 | trampoline_api.state = NULL; |
817 | |
|
818 | 0 | return 0; // Success |
819 | 0 | } |
820 | | |
821 | | // ============================================================================= |
822 | | // PUBLIC API EXPORT |
823 | | // ============================================================================= |
824 | | |
825 | | /* |
826 | | * Python Perf Callbacks Structure |
827 | | * |
828 | | * This structure defines the callback interface that Python's trampoline |
829 | | * system uses to integrate with perf profiling. It contains function |
830 | | * pointers for initialization, event writing, and cleanup. |
831 | | * |
832 | | * CRITICAL: This structure and its contents are part of Python's internal |
833 | | * API. The function signatures and behavior must remain stable to maintain |
834 | | * compatibility with the Python interpreter's perf integration system. |
835 | | * |
836 | | * Used by: Python's _PyPerf_Callbacks system in pycore_ceval.h |
837 | | */ |
838 | | _PyPerf_Callbacks _Py_perfmap_jit_callbacks = { |
839 | | &perf_map_jit_init, // Initialization function |
840 | | &perf_map_jit_write_entry, // Event writing function |
841 | | &perf_map_jit_fini, // Cleanup function |
842 | | }; |
843 | | |
844 | | #endif /* PY_HAVE_PERF_TRAMPOLINE */ |