/src/cpython/Python/perf_jit_trampoline.c
Line | Count | Source |
1 | | /* |
2 | | * Python Perf Trampoline Support - JIT Dump Implementation |
3 | | * |
4 | | * This file implements the perf jitdump API for Python's performance profiling |
5 | | * integration. It allows perf (Linux performance analysis tool) to understand |
6 | | * and profile dynamically generated Python bytecode by creating JIT dump files |
7 | | * that perf can inject into its analysis. |
8 | | * |
9 | | * |
10 | | * IMPORTANT: This file exports specific callback functions that are part of |
11 | | * Python's internal API. Do not modify the function signatures or behavior |
12 | | * of exported functions without coordinating with the Python core team. |
13 | | * |
14 | | * Usually the binary and libraries are mapped in separate region like below: |
15 | | * |
16 | | * address -> |
17 | | * --+---------------------+--//--+---------------------+-- |
18 | | * | .text | .data | ... | | .text | .data | ... | |
19 | | * --+---------------------+--//--+---------------------+-- |
20 | | * myprog libc.so |
21 | | * |
22 | | * So it'd be easy and straight-forward to find a mapped binary or library from an |
23 | | * address. |
24 | | * |
 * For JIT code, however, the code arena only cares about the code section, while
 * the resulting DSOs (which are generated by perf inject -j) also contain ELF
 * headers and unwind info. perf would then generate the following address space
 * with synthesized MMAP events. Let's say it has a sample between address B and C.
29 | | * |
30 | | * sample |
31 | | * | |
32 | | * address -> A B v C |
33 | | * --------------------------------------------------------------------------------------------------- |
34 | | * /tmp/jitted-PID-0.so | (headers) | .text | unwind info | |
35 | | * /tmp/jitted-PID-1.so | (headers) | .text | unwind info | |
36 | | * /tmp/jitted-PID-2.so | (headers) | .text | unwind info | |
37 | | * ... |
38 | | * --------------------------------------------------------------------------------------------------- |
39 | | * |
40 | | * If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see |
41 | | * the unwind info. If it maps both .text section and unwind sections, the sample |
42 | | * could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing |
43 | | * which one is right. So to make perf happy we have non-overlapping ranges for each |
44 | | * DSO: |
45 | | * |
46 | | * address -> |
47 | | * ------------------------------------------------------------------------------------------------------- |
48 | | * /tmp/jitted-PID-0.so | (headers) | .text | unwind info | |
49 | | * /tmp/jitted-PID-1.so | (headers) | .text | unwind info | |
50 | | * /tmp/jitted-PID-2.so | (headers) | .text | unwind info | |
51 | | * ... |
52 | | * ------------------------------------------------------------------------------------------------------- |
53 | | * |
 * As the trampolines are constant, we add a constant padding, but in general the
 * padding needs to have the size of the unwind info rounded up to 16 bytes. For
 * our trampolines this is 0x50.
56 | | */ |
57 | | |
58 | | |
59 | | |
60 | | #include "Python.h" |
61 | | #include "pycore_ceval.h" // _PyPerf_Callbacks |
62 | | #include "pycore_frame.h" |
63 | | #include "pycore_interp.h" |
64 | | #include "pycore_mmap.h" // _PyAnnotateMemoryMap() |
65 | | #include "pycore_jit_unwind.h" |
66 | | #include "pycore_runtime.h" // _PyRuntime |
67 | | |
68 | | #ifdef PY_HAVE_PERF_TRAMPOLINE |
69 | | |
70 | | /* Standard library includes for perf jitdump implementation */ |
71 | | #if defined(__linux__) |
72 | | # include <elf.h> // ELF architecture constants |
73 | | #endif |
74 | | #include <fcntl.h> // File control operations |
75 | | #include <stdio.h> // Standard I/O operations |
76 | | #include <stdlib.h> // Standard library functions |
77 | | #include <string.h> // memcpy, strlen |
78 | | #include <sys/mman.h> // Memory mapping functions (mmap) |
79 | | #include <sys/types.h> // System data types |
80 | | #include <unistd.h> // System calls (sysconf, getpid) |
81 | | #include <sys/time.h> // Time functions (gettimeofday) |
82 | | #if defined(__linux__) |
83 | | # include <sys/syscall.h> // System call interface |
84 | | #endif |
85 | | |
86 | | // ============================================================================= |
87 | | // CONSTANTS AND CONFIGURATION |
88 | | // ============================================================================= |
89 | | |
90 | | /* |
91 | | * Memory layout considerations for perf jitdump: |
92 | | * |
93 | | * Perf expects non-overlapping memory regions for each JIT-compiled function. |
94 | | * When perf processes the jitdump file, it creates synthetic DSO (Dynamic |
95 | | * Shared Object) files that contain: |
96 | | * - ELF headers |
97 | | * - .text section (actual machine code) |
98 | | * - Unwind information (for stack traces) |
99 | | * |
100 | | * To ensure proper address space layout, we add padding between code regions. |
101 | | * This prevents address conflicts when perf maps the synthesized DSOs. |
102 | | * |
103 | | * Memory layout example: |
104 | | * /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding] |
105 | | * /tmp/jitted-PID-1.so: [headers][.text][unwind_info][padding] |
106 | | * |
107 | | * The padding size is now calculated automatically during initialization |
108 | | * based on the actual unwind information requirements. |
109 | | */ |
110 | | |
111 | | |
/*
 * Fallback ELF machine identifiers.
 *
 * <elf.h> is only included when __linux__ is defined, so on every other
 * platform define the one EM_* constant that matches the architecture being
 * compiled for. GetElfMachineArchitecture() picks its return value with the
 * same architecture tests, so a single constant is enough.
 */
#if !defined(__linux__)
# if defined(__i386__) || defined(_M_IX86)
# define EM_386 3
# elif defined(__arm__) || defined(_M_ARM)
# define EM_ARM 40
# elif defined(__x86_64__) || defined(_M_X64)
# define EM_X86_64 62
# elif defined(__aarch64__)
# define EM_AARCH64 183
# elif defined(__riscv)
# define EM_RISCV 243
# endif
#endif
126 | | |
/* Shorthand for the perf trampoline API state stored in the global runtime */
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api

/* Type aliases used by the jitdump writer */
typedef uint64_t uword; // Always 64 bits wide; holds the code address and code size of an entry
typedef const char* CodeComments; // Code comment strings

/* Memory size constants */
#define MB (1024 * 1024) // Bytes in one mebibyte; used to size the stdio buffer of the jitdump file
136 | | |
137 | | // ============================================================================= |
138 | | // ARCHITECTURE-SPECIFIC DEFINITIONS |
139 | | // ============================================================================= |
140 | | |
/*
 * Report the ELF machine identifier (EM_* constant) of the architecture this
 * file is being compiled for.
 *
 * The value ends up in the elf_mach_target field of the jitdump header.
 *
 * Returns: the matching EM_* constant. On an architecture that is not listed
 *          below the function runs into Py_UNREACHABLE().
 */
static uint64_t GetElfMachineArchitecture(void) {
    uint64_t elf_machine;

#if defined(__x86_64__) || defined(_M_X64)
    elf_machine = EM_X86_64;
#elif defined(__i386__) || defined(_M_IX86)
    elf_machine = EM_386;
#elif defined(__aarch64__)
    elf_machine = EM_AARCH64;
#elif defined(__arm__) || defined(_M_ARM)
    elf_machine = EM_ARM;
#elif defined(__riscv)
    elf_machine = EM_RISCV;
#else
    Py_UNREACHABLE();  // Unsupported architecture - should never reach here
    elf_machine = 0;
#endif

    return elf_machine;
}
163 | | |
164 | | // ============================================================================= |
165 | | // PERF JITDUMP DATA STRUCTURES |
166 | | // ============================================================================= |
167 | | |
168 | | /* |
169 | | * Perf jitdump file format structures |
170 | | * |
171 | | * These structures define the binary format that perf expects for JIT dump files. |
172 | | * The format is documented in the Linux perf tools source code and must match |
173 | | * exactly for proper perf integration. |
174 | | */ |
175 | | |
/*
 * Jitdump file header.
 *
 * perf_map_jit_write_header() fills in one of these and writes the struct
 * as-is, so the in-memory layout is the on-disk layout. Do not reorder or
 * resize the fields.
 */
typedef struct {
    /* Magic number; the writer stores 0x4A695444 ("JiTD") */
    uint32_t magic;
    /* Jitdump format version; the writer stores 1 */
    uint32_t version;
    /* Size of this header structure in bytes */
    uint32_t size;
    /* Target architecture, as returned by GetElfMachineArchitecture() */
    uint32_t elf_mach_target;
    /* Reserved; the writer stores 0 */
    uint32_t reserved;
    /* Process ID of the process producing the jitdump */
    uint32_t process_id;
    /* Creation time of the jitdump (wall clock, microseconds) */
    uint64_t time_stamp;
    /* Feature flags; the writer stores 0 */
    uint64_t flags;
} Header;
190 | | |
/*
 * Record types of the jitdump format.
 *
 * The numeric value is stored in BaseEvent.event. The writer in this file
 * emits PerfUnwindingInfo and PerfLoad records.
 */
enum PerfEvent {
    /* Code load event (new JIT function) */
    PerfLoad = 0,
    /* Code move event (function relocated) */
    PerfMove = 1,
    /* Debug information event */
    PerfDebugInfo = 2,
    /* JIT session close event */
    PerfClose = 3,
    /* Stack unwinding information event */
    PerfUnwindingInfo = 4
};
202 | | |
/*
 * Common record header.
 *
 * Both CodeLoadEvent and CodeUnwindingInfoEvent embed this struct as their
 * first member.
 */
struct BaseEvent {
    /* Record type (a PerfEvent value) */
    uint32_t event;
    /* Total size of the record in bytes, including its payload */
    uint32_t size;
    /* Time at which the record was produced */
    uint64_t time_stamp;
};
212 | | |
213 | | /* |
214 | | * Code load event - indicates a new JIT-compiled function is available |
215 | | * This is the most important event type for Python profiling |
216 | | */ |
217 | | typedef struct { |
218 | | struct BaseEvent base; // Common event header |
219 | | uint32_t process_id; // Process ID where code was generated |
220 | | #if defined(__APPLE__) |
221 | | uint64_t thread_id; // Thread ID where code was generated |
222 | | #else |
223 | | uint32_t thread_id; // Thread ID where code was generated |
224 | | #endif |
225 | | uint64_t vma; // Virtual memory address where code is loaded |
226 | | uint64_t code_address; // Address of the actual machine code |
227 | | uint64_t code_size; // Size of the machine code in bytes |
228 | | uint64_t code_id; // Unique identifier for this code region |
229 | | /* Followed by: |
230 | | * - null-terminated function name string |
231 | | * - raw machine code bytes |
232 | | */ |
233 | | } CodeLoadEvent; |
234 | | |
235 | | /* |
236 | | * Code unwinding information event - provides DWARF data for stack traces |
237 | | * Essential for proper stack unwinding during profiling |
238 | | */ |
239 | | typedef struct { |
240 | | struct BaseEvent base; // Common event header |
241 | | uint64_t unwind_data_size; // Size of the unwinding data |
242 | | uint64_t eh_frame_hdr_size; // Size of the EH frame header |
243 | | uint64_t mapped_size; // Total mapped size (with padding) |
244 | | /* Followed by: |
245 | | * - EH frame header |
246 | | * - DWARF unwinding information |
247 | | * - Padding to alignment boundary |
248 | | */ |
249 | | } CodeUnwindingInfoEvent; |
250 | | |
/*
 * EH frame header written alongside the .eh_frame data of every entry.
 *
 * The struct is packed and must stay exactly 20 bytes; it is written to the
 * jitdump file as-is. The encodings chosen by the writer are PC-relative and
 * data-relative, which keeps the synthesized DSO self-contained when perf
 * injects it.
 */
typedef struct __attribute__((packed)) {
    /* Header format version; the writer stores 1 */
    uint8_t version;
    /* Encoding of eh_frame_ptr (writer: sdata4 | pcrel) */
    uint8_t eh_frame_ptr_enc;
    /* Encoding of eh_fde_count (writer: udata4) */
    uint8_t fde_count_enc;
    /* Encoding of the from/to pair (writer: sdata4 | datarel) */
    uint8_t table_enc;
    /* Negative offset back to the start of the .eh_frame data */
    int32_t eh_frame_ptr;
    /* Number of FDEs; the writer always stores 1 */
    uint32_t eh_fde_count;
    /* NOTE(review): from/to look like the single lookup-table entry
     * (code start, FDE location) - confirm against the .eh_frame_hdr format */
    int32_t from;
    int32_t to;
} EhFrameHeader;
_Static_assert(sizeof(EhFrameHeader) == 20, "EhFrameHeader layout mismatch");
269 | | |
270 | | // ============================================================================= |
271 | | // GLOBAL STATE MANAGEMENT |
272 | | // ============================================================================= |
273 | | |
274 | | /* |
275 | | * Global state for the perf jitdump implementation |
276 | | * |
277 | | * This structure maintains all the state needed for generating jitdump files. |
278 | | * It's designed as a singleton since there's typically only one jitdump file |
279 | | * per Python process. |
280 | | */ |
281 | | typedef struct { |
282 | | FILE* perf_map; // File handle for the jitdump file |
283 | | PyMutex map_lock; // Thread synchronization lock |
284 | | void* mapped_buffer; // Memory-mapped region (signals perf we're active) |
285 | | size_t mapped_size; // Size of the mapped region |
286 | | uint32_t code_id; // Counter for unique code region identifiers |
287 | | uint64_t build_id_salt; // Per-process salt for unique synthetic DSOs |
288 | | } PerfMapJitState; |
289 | | |
290 | | /* Global singleton instance */ |
291 | | static PerfMapJitState perf_jit_map_state; |
292 | | |
293 | | // ============================================================================= |
294 | | // TIME UTILITIES |
295 | | // ============================================================================= |
296 | | |
297 | | /* Time conversion constant */ |
298 | | static const intptr_t nanoseconds_per_second = 1000000000; |
299 | | |
300 | | /* |
301 | | * Get current monotonic time in nanoseconds |
302 | | * |
303 | | * Monotonic time is preferred for event timestamps because it's not affected |
304 | | * by system clock adjustments. This ensures consistent timing relationships |
305 | | * between events even if the system clock is changed. |
306 | | * |
307 | | * Returns: Current monotonic time in nanoseconds since an arbitrary epoch |
308 | | */ |
309 | 0 | static int64_t get_current_monotonic_ticks(void) { |
310 | 0 | struct timespec ts; |
311 | 0 | if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) { |
312 | 0 | Py_UNREACHABLE(); // Should never fail on supported systems |
313 | 0 | return 0; |
314 | 0 | } |
315 | | |
316 | | /* Convert to nanoseconds for maximum precision */ |
317 | 0 | int64_t result = ts.tv_sec; |
318 | 0 | result *= nanoseconds_per_second; |
319 | 0 | result += ts.tv_nsec; |
320 | 0 | return result; |
321 | 0 | } |
322 | | |
/*
 * Read the wall clock via gettimeofday() and return it in microseconds.
 *
 * This value goes into the jitdump file header, where a real-world time
 * (rather than a monotonic tick count) is wanted.
 *
 * Returns: microseconds since the Unix epoch
 */
static int64_t get_current_time_microseconds(void) {
    struct timeval now;
    if (gettimeofday(&now, NULL) < 0) {
        Py_UNREACHABLE();  // Should never fail on supported systems
        return 0;
    }

    const int64_t whole_seconds_us = (int64_t)(now.tv_sec) * 1000000;
    return whole_seconds_us + now.tv_usec;
}
340 | | |
341 | | // ============================================================================= |
342 | | // FILE I/O UTILITIES |
343 | | // ============================================================================= |
344 | | |
345 | | /* |
346 | | * Write data to the jitdump file with error handling |
347 | | * |
348 | | * This function ensures that all data is written to the file, handling |
349 | | * partial writes that can occur with large buffers or when the system |
350 | | * is under load. |
351 | | * |
352 | | * Args: |
353 | | * buffer: Pointer to data to write |
354 | | * size: Number of bytes to write |
355 | | */ |
356 | 0 | static void perf_map_jit_write_fully(const void* buffer, size_t size) { |
357 | 0 | FILE* out_file = perf_jit_map_state.perf_map; |
358 | 0 | const char* ptr = (const char*)(buffer); |
359 | |
|
360 | 0 | while (size > 0) { |
361 | 0 | const size_t written = fwrite(ptr, 1, size, out_file); |
362 | 0 | if (written == 0) { |
363 | 0 | Py_UNREACHABLE(); // Write failure - should be very rare |
364 | 0 | break; |
365 | 0 | } |
366 | 0 | size -= written; |
367 | 0 | ptr += written; |
368 | 0 | } |
369 | 0 | } |
370 | | |
371 | | /* |
372 | | * Write the jitdump file header |
373 | | * |
374 | | * The header must be written exactly once at the beginning of each jitdump |
375 | | * file. It provides metadata that perf uses to parse the rest of the file. |
376 | | * |
377 | | * Args: |
378 | | * pid: Process ID to include in the header |
379 | | * out_file: File handle to write to (currently unused, uses global state) |
380 | | */ |
381 | 0 | static void perf_map_jit_write_header(int pid, FILE* out_file) { |
382 | 0 | Header header; |
383 | | |
384 | | /* Initialize header with required values */ |
385 | 0 | header.magic = 0x4A695444; // "JiTD" magic number |
386 | 0 | header.version = 1; // Current jitdump version |
387 | 0 | header.size = sizeof(Header); // Header size for validation |
388 | 0 | header.elf_mach_target = GetElfMachineArchitecture(); // Target architecture |
389 | 0 | header.reserved = 0; // padding reserved for future use |
390 | 0 | header.process_id = pid; // Process identifier |
391 | 0 | header.time_stamp = get_current_time_microseconds(); // Creation time |
392 | 0 | header.flags = 0; // No special flags currently used |
393 | |
|
394 | 0 | perf_map_jit_write_fully(&header, sizeof(header)); |
395 | 0 | } |
396 | | |
397 | | // ============================================================================= |
398 | | // JITDUMP INITIALIZATION |
399 | | // ============================================================================= |
400 | | |
401 | | /* |
402 | | * Initialize the perf jitdump interface |
403 | | * |
404 | | * This function sets up everything needed to generate jitdump files: |
405 | | * 1. Creates the jitdump file with a unique name |
406 | | * 2. Maps the first page to signal perf that we're using the interface |
407 | | * 3. Writes the jitdump header |
408 | | * 4. Initializes synchronization primitives |
409 | | * |
410 | | * The memory mapping is crucial - perf detects jitdump files by scanning |
411 | | * for processes that have mapped files matching the pattern /tmp/jit-*.dump |
412 | | * |
413 | | * Returns: Pointer to initialized state, or NULL on failure |
414 | | */ |
415 | 0 | static void* perf_map_jit_init(void) { |
416 | 0 | PyMutex_Lock(&perf_jit_map_state.map_lock); |
417 | 0 | if (perf_jit_map_state.perf_map != NULL) { |
418 | 0 | PyMutex_Unlock(&perf_jit_map_state.map_lock); |
419 | 0 | return &perf_jit_map_state; |
420 | 0 | } |
421 | | |
422 | 0 | char filename[100]; |
423 | 0 | int pid = getpid(); |
424 | | |
425 | | /* Create unique filename based on process ID */ |
426 | 0 | snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid); |
427 | | |
428 | | /* Create/open the jitdump file with appropriate permissions */ |
429 | 0 | const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666); |
430 | 0 | if (fd == -1) { |
431 | 0 | PyMutex_Unlock(&perf_jit_map_state.map_lock); |
432 | 0 | return NULL; // Failed to create file |
433 | 0 | } |
434 | | |
435 | | /* Get system page size for memory mapping */ |
436 | 0 | const long page_size = sysconf(_SC_PAGESIZE); |
437 | 0 | if (page_size == -1) { |
438 | 0 | close(fd); |
439 | 0 | PyMutex_Unlock(&perf_jit_map_state.map_lock); |
440 | 0 | return NULL; // Failed to get page size |
441 | 0 | } |
442 | | |
443 | | #if defined(__APPLE__) |
444 | | // On macOS, samply uses a preload to find jitdumps and this mmap can be slow. |
445 | | perf_jit_map_state.mapped_buffer = NULL; |
446 | | #else |
447 | | /* |
448 | | * Map the first page of the jitdump file |
449 | | * |
450 | | * This memory mapping serves as a signal to perf that this process |
451 | | * is generating JIT code. Perf scans /proc/.../maps looking for mapped |
452 | | * files that match the jitdump naming pattern. |
453 | | * |
454 | | * The mapping must be PROT_READ | PROT_EXEC to be detected by perf. |
455 | | */ |
456 | 0 | perf_jit_map_state.mapped_buffer = mmap( |
457 | 0 | NULL, // Let kernel choose address |
458 | 0 | page_size, // Map one page |
459 | 0 | PROT_READ | PROT_EXEC, // Read and execute permissions (required by perf) |
460 | 0 | MAP_PRIVATE, // Private mapping |
461 | 0 | fd, // File descriptor |
462 | 0 | 0 // Offset 0 (first page) |
463 | 0 | ); |
464 | |
|
465 | 0 | if (perf_jit_map_state.mapped_buffer == MAP_FAILED) { |
466 | 0 | perf_jit_map_state.mapped_buffer = NULL; |
467 | 0 | close(fd); |
468 | 0 | PyMutex_Unlock(&perf_jit_map_state.map_lock); |
469 | 0 | return NULL; // Memory mapping failed |
470 | 0 | } |
471 | 0 | (void)_PyAnnotateMemoryMap(perf_jit_map_state.mapped_buffer, page_size, |
472 | 0 | "cpython:perf_jit_trampoline"); |
473 | 0 | #endif |
474 | |
|
475 | 0 | perf_jit_map_state.mapped_size = page_size; |
476 | | |
477 | | /* Convert file descriptor to FILE* for easier I/O operations */ |
478 | 0 | perf_jit_map_state.perf_map = fdopen(fd, "w+"); |
479 | 0 | if (perf_jit_map_state.perf_map == NULL) { |
480 | 0 | close(fd); |
481 | 0 | PyMutex_Unlock(&perf_jit_map_state.map_lock); |
482 | 0 | return NULL; // Failed to create FILE* |
483 | 0 | } |
484 | | |
485 | | /* |
486 | | * Set up file buffering for better performance |
487 | | * |
488 | | * We use a large buffer (2MB) because jitdump files can be written |
489 | | * frequently during program execution. Buffering reduces system call |
490 | | * overhead and improves overall performance. |
491 | | */ |
492 | 0 | setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB); |
493 | | |
494 | | /* Write the jitdump file header */ |
495 | 0 | perf_map_jit_write_header(pid, perf_jit_map_state.perf_map); |
496 | | |
497 | | /* Initialize code ID counter */ |
498 | 0 | perf_jit_map_state.code_id = 0; |
499 | 0 | perf_jit_map_state.build_id_salt = |
500 | 0 | ((uint64_t)pid << 32) ^ (uint64_t)get_current_monotonic_ticks(); |
501 | | |
502 | | /* Calculate padding size based on actual unwind info requirements */ |
503 | 0 | size_t eh_frame_size = _PyJitUnwind_EhFrameSize(0); |
504 | 0 | size_t unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; |
505 | 0 | trampoline_api.code_padding = _Py_SIZE_ROUND_UP(unwind_data_size, 16); |
506 | 0 | trampoline_api.code_alignment = 32; |
507 | |
|
508 | 0 | PyMutex_Unlock(&perf_jit_map_state.map_lock); |
509 | 0 | return &perf_jit_map_state; |
510 | 0 | } |
511 | | |
512 | | // ============================================================================= |
513 | | // MAIN JITDUMP ENTRY WRITING |
514 | | // ============================================================================= |
515 | | |
516 | | /* |
517 | | * Write a complete jitdump entry for a code region with a provided name. |
518 | | * |
519 | | * This shares the same implementation as the trampoline callback, but |
520 | | * allows callers that don't have a PyCodeObject to reuse the jitdump |
521 | | * infrastructure. |
522 | | */ |
523 | | static void perf_map_jit_write_entry_with_name( |
524 | | void *state, |
525 | | const void *code_addr, |
526 | | size_t code_size, |
527 | | const char *entry, |
528 | | const char *filename |
529 | | ) |
530 | 0 | { |
531 | | /* Initialize jitdump system on first use */ |
532 | 0 | void* ret = perf_map_jit_init(); |
533 | 0 | if (ret == NULL) { |
534 | 0 | return; // Initialization failed, silently abort |
535 | 0 | } |
536 | | |
537 | 0 | if (entry == NULL) { |
538 | 0 | entry = ""; |
539 | 0 | } |
540 | 0 | if (filename == NULL) { |
541 | 0 | filename = ""; |
542 | 0 | } |
543 | | |
544 | | /* |
545 | | * Create formatted function name for perf display |
546 | | * |
547 | | * Format: "py::<function_name>:<filename>" |
548 | | * The "py::" prefix helps identify Python functions in mixed-language |
549 | | * profiles (e.g., when profiling C extensions alongside Python code). |
550 | | */ |
551 | 0 | size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1; |
552 | 0 | char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size); |
553 | 0 | if (perf_map_entry == NULL) { |
554 | 0 | return; // Memory allocation failed |
555 | 0 | } |
556 | 0 | snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename); |
557 | |
|
558 | 0 | const size_t name_length = strlen(perf_map_entry); |
559 | 0 | uword base = (uword)code_addr; |
560 | 0 | uword size = code_size; |
561 | | |
562 | | /* |
563 | | * Generate DWARF unwinding information |
564 | | * |
565 | | * DWARF data is essential for proper stack unwinding during profiling. |
566 | | * Without it, perf cannot generate accurate call graphs, especially |
567 | | * in optimized code where frame pointers may be omitted. |
568 | | */ |
569 | 0 | uint8_t buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) |
570 | 0 | size_t eh_frame_size = _PyJitUnwind_BuildEhFrame( |
571 | 0 | buffer, sizeof(buffer), code_addr, code_size, 0); |
572 | 0 | if (eh_frame_size == 0) { |
573 | 0 | PyMem_RawFree(perf_map_entry); |
574 | 0 | return; |
575 | 0 | } |
576 | | |
577 | | /* |
578 | | * A logical jitdump entry is written as multiple records and also consumes |
579 | | * a process-global code_id. Serialize the whole sequence so concurrent JIT |
580 | | * compilation cannot interleave records or reuse an ID. |
581 | | */ |
582 | 0 | PyMutex_Lock(&perf_jit_map_state.map_lock); |
583 | | |
584 | | /* |
585 | | * Write Code Unwinding Information Event |
586 | | * |
587 | | * This event must be written before the code load event to ensure |
588 | | * perf has the unwinding information available when it processes |
589 | | * the code region. |
590 | | */ |
591 | 0 | CodeUnwindingInfoEvent ev2; |
592 | 0 | ev2.base.event = PerfUnwindingInfo; |
593 | 0 | ev2.base.time_stamp = get_current_monotonic_ticks(); |
594 | 0 | ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; |
595 | | |
596 | | /* Verify we don't exceed our padding budget */ |
597 | 0 | assert(ev2.unwind_data_size <= (uint64_t)trampoline_api.code_padding); |
598 | |
|
599 | 0 | ev2.eh_frame_hdr_size = sizeof(EhFrameHeader); |
600 | 0 | ev2.mapped_size = _Py_SIZE_ROUND_UP(ev2.unwind_data_size, 16); // 16-byte alignment |
601 | | |
602 | | /* Calculate total event size with padding */ |
603 | 0 | int content_size = (int)(sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size); |
604 | 0 | int padding_size = (int)_Py_SIZE_ROUND_UP((size_t)content_size, 8) - content_size; // 8-byte align |
605 | 0 | ev2.base.size = (uint32_t)(content_size + padding_size); |
606 | | |
607 | | /* Write the unwinding info event header */ |
608 | 0 | perf_map_jit_write_fully(&ev2, sizeof(ev2)); |
609 | | |
610 | | /* |
611 | | * Write EH Frame Header |
612 | | * |
613 | | * The EH frame header provides metadata about the DWARF unwinding |
614 | | * information that follows. It includes pointers and counts that |
615 | | * help perf navigate the unwinding data efficiently. |
616 | | */ |
617 | 0 | EhFrameHeader f; |
618 | 0 | f.version = 1; |
619 | 0 | f.eh_frame_ptr_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_pcrel; |
620 | 0 | f.fde_count_enc = DWRF_EH_PE_udata4; |
621 | 0 | f.table_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_datarel; |
622 | | |
623 | | /* Calculate relative offsets for EH frame navigation */ |
624 | 0 | f.eh_frame_ptr = -(int32_t)(eh_frame_size + 4 * sizeof(unsigned char)); |
625 | 0 | f.eh_fde_count = 1; // We generate exactly one FDE per function |
626 | 0 | f.from = -(int32_t)(_Py_SIZE_ROUND_UP(code_size, 8) + eh_frame_size); |
627 | 0 | uint32_t cie_payload_size; |
628 | 0 | memcpy(&cie_payload_size, buffer, sizeof(cie_payload_size)); |
629 | 0 | int cie_size = (int)(sizeof(cie_payload_size) + cie_payload_size); |
630 | 0 | f.to = -(int32_t)(eh_frame_size - cie_size); |
631 | | |
632 | | /* Write EH frame data and header */ |
633 | 0 | perf_map_jit_write_fully(buffer, eh_frame_size); |
634 | 0 | perf_map_jit_write_fully(&f, sizeof(f)); |
635 | | |
636 | | /* Write padding to maintain alignment */ |
637 | 0 | char padding_bytes[] = "\0\0\0\0\0\0\0\0"; |
638 | 0 | perf_map_jit_write_fully(&padding_bytes, padding_size); |
639 | | |
640 | | /* |
641 | | * Write Code Load Event |
642 | | * |
643 | | * This event tells perf about the new code region. It includes: |
644 | | * - Memory addresses and sizes |
645 | | * - Process and thread identification |
646 | | * - Function name for symbol resolution |
647 | | * - The actual machine code bytes |
648 | | */ |
649 | 0 | CodeLoadEvent ev; |
650 | 0 | ev.base.event = PerfLoad; |
651 | 0 | ev.base.size = sizeof(ev) + (name_length+1) + size; |
652 | 0 | ev.base.time_stamp = get_current_monotonic_ticks(); |
653 | 0 | ev.process_id = getpid(); |
654 | | #if defined(__APPLE__) |
655 | | pthread_threadid_np(NULL, &ev.thread_id); |
656 | | #else |
657 | 0 | ev.thread_id = syscall(SYS_gettid); // Get thread ID via system call |
658 | 0 | #endif |
659 | 0 | ev.vma = base; // Virtual memory address |
660 | 0 | ev.code_address = base; // Same as VMA for our use case |
661 | 0 | ev.code_size = size; |
662 | | |
663 | | /* Assign unique code ID and increment counter */ |
664 | 0 | perf_jit_map_state.code_id += 1; |
665 | 0 | ev.code_id = perf_jit_map_state.code_id; |
666 | | |
667 | | /* Write code load event and associated data */ |
668 | 0 | perf_map_jit_write_fully(&ev, sizeof(ev)); |
669 | 0 | perf_map_jit_write_fully(perf_map_entry, name_length+1); // Include null terminator |
670 | | /* |
671 | | * Ensure each synthetic DSO has unique .text bytes. |
672 | | * |
673 | | * perf merges DSOs that share a build-id. Since trampolines can share |
674 | | * identical code and unwind bytes, perf may resolve all JIT frames to |
675 | | * the first symbol it saw (including entries from previous runs when |
676 | | * build-id caching is enabled). Patch a small marker in the emitted |
677 | | * bytes to make the build-id depend on a per-process salt and code id |
678 | | * without modifying the live code. |
679 | | */ |
680 | 0 | uint64_t marker = perf_jit_map_state.build_id_salt ^ |
681 | 0 | ((uint64_t)perf_jit_map_state.code_id << 32) ^ |
682 | 0 | (uint64_t)code_size; |
683 | 0 | if (size >= sizeof(marker)) { |
684 | 0 | size_t prefix = size - sizeof(marker); |
685 | 0 | perf_map_jit_write_fully((void *)(base), prefix); |
686 | 0 | perf_map_jit_write_fully(&marker, sizeof(marker)); |
687 | 0 | } |
688 | 0 | else if (size > 0) { |
689 | 0 | uint8_t tmp[sizeof(marker)]; |
690 | 0 | memcpy(tmp, (void *)(base), size); |
691 | 0 | for (size_t i = 0; i < size; i++) { |
692 | 0 | tmp[i] ^= (uint8_t)(marker >> (i * 8)); |
693 | 0 | } |
694 | 0 | perf_map_jit_write_fully(tmp, size); |
695 | 0 | } |
696 | | |
697 | | /* Clean up allocated memory */ |
698 | 0 | PyMutex_Unlock(&perf_jit_map_state.map_lock); |
699 | 0 | PyMem_RawFree(perf_map_entry); |
700 | 0 | } |
701 | | |
702 | | /* |
703 | | * Write a complete jitdump entry for a Python function |
704 | | * |
705 | | * This is the main function called by Python's trampoline system whenever |
706 | | * a new piece of JIT-compiled code needs to be recorded. It writes both |
707 | | * the unwinding information and the code load event to the jitdump file. |
708 | | * |
709 | | * The function performs these steps: |
710 | | * 1. Initialize jitdump system if not already done |
711 | | * 2. Extract function name and filename from Python code object |
712 | | * 3. Generate DWARF unwinding information |
713 | | * 4. Write unwinding info event to jitdump file |
714 | | * 5. Write code load event to jitdump file |
715 | | * |
716 | | * Args: |
717 | | * state: Jitdump state (currently unused, uses global state) |
718 | | * code_addr: Address where the compiled code resides |
719 | | * code_size: Size of the compiled code in bytes |
720 | | * co: Python code object containing metadata |
721 | | * |
722 | | * IMPORTANT: This function signature is part of Python's internal API |
723 | | * and must not be changed without coordinating with core Python development. |
724 | | */ |
725 | | static void perf_map_jit_write_entry(void *state, const void *code_addr, |
726 | | size_t code_size, PyCodeObject *co) |
727 | 0 | { |
728 | 0 | const char *entry = ""; |
729 | 0 | const char *filename = ""; |
730 | 0 | if (co != NULL) { |
731 | 0 | if (co->co_qualname != NULL) { |
732 | 0 | entry = PyUnicode_AsUTF8(co->co_qualname); |
733 | 0 | } |
734 | 0 | if (co->co_filename != NULL) { |
735 | 0 | filename = PyUnicode_AsUTF8(co->co_filename); |
736 | 0 | } |
737 | 0 | } |
738 | 0 | perf_map_jit_write_entry_with_name(state, code_addr, code_size, |
739 | 0 | entry, filename); |
740 | 0 | } |
741 | | |
/*
 * Record a jitdump entry using caller-supplied names
 *
 * Forwards the code range and the given entry/filename strings to
 * perf_map_jit_write_entry_with_name(), passing NULL as its state argument.
 *
 * Args:
 *   code_addr: Address where the code resides
 *   code_size: Size of the code in bytes
 *   entry: Name to record for the code range
 *   filename: File name to record for the code range
 */
void
_PyPerfJit_WriteNamedCode(const void *code_addr, size_t code_size,
                          const char *entry, const char *filename)
{
    void *no_state = NULL;

    perf_map_jit_write_entry_with_name(no_state, code_addr, code_size,
                                       entry, filename);
}
749 | | |
750 | | // ============================================================================= |
751 | | // CLEANUP AND FINALIZATION |
752 | | // ============================================================================= |
753 | | |
754 | | /* |
755 | | * Finalize and cleanup the perf jitdump system |
756 | | * |
757 | | * This function is called when Python is shutting down or when the |
758 | | * perf trampoline system is being disabled. It ensures all resources |
759 | | * are properly released and all buffered data is flushed to disk. |
760 | | * |
761 | | * Args: |
762 | | * state: Jitdump state (currently unused, uses global state) |
763 | | * |
764 | | * Returns: 0 on success |
765 | | * |
766 | | * IMPORTANT: This function signature is part of Python's internal API |
767 | | * and must not be changed without coordinating with core Python development. |
768 | | */ |
769 | 0 | static int perf_map_jit_fini(void* state) { |
770 | | /* |
771 | | * Close jitdump file with proper synchronization |
772 | | * |
773 | | * We need to acquire the lock to ensure no other threads are |
774 | | * writing to the file when we close it. This prevents corruption |
775 | | * and ensures all data is properly flushed. |
776 | | */ |
777 | 0 | PyMutex_Lock(&perf_jit_map_state.map_lock); |
778 | 0 | if (perf_jit_map_state.perf_map != NULL) { |
779 | 0 | fclose(perf_jit_map_state.perf_map); // This also flushes buffers |
780 | 0 | perf_jit_map_state.perf_map = NULL; |
781 | 0 | } |
782 | 0 | PyMutex_Unlock(&perf_jit_map_state.map_lock); |
783 | | |
784 | | /* |
785 | | * Unmap the memory region |
786 | | * |
787 | | * This removes the signal to perf that we were generating JIT code. |
788 | | * After this point, perf will no longer detect this process as |
789 | | * having JIT capabilities. |
790 | | */ |
791 | 0 | if (perf_jit_map_state.mapped_buffer != NULL) { |
792 | 0 | munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size); |
793 | 0 | perf_jit_map_state.mapped_buffer = NULL; |
794 | 0 | } |
795 | | |
796 | | /* Clear global state reference */ |
797 | 0 | trampoline_api.state = NULL; |
798 | |
|
799 | 0 | return 0; // Success |
800 | 0 | } |
801 | | |
802 | | // ============================================================================= |
803 | | // PUBLIC API EXPORT |
804 | | // ============================================================================= |
805 | | |
806 | | /* |
807 | | * Python Perf Callbacks Structure |
808 | | * |
809 | | * This structure defines the callback interface that Python's trampoline |
810 | | * system uses to integrate with perf profiling. It contains function |
811 | | * pointers for initialization, event writing, and cleanup. |
812 | | * |
813 | | * CRITICAL: This structure and its contents are part of Python's internal |
814 | | * API. The function signatures and behavior must remain stable to maintain |
815 | | * compatibility with the Python interpreter's perf integration system. |
816 | | * |
817 | | * Used by: Python's _PyPerf_Callbacks system in pycore_ceval.h |
818 | | */ |
819 | | _PyPerf_Callbacks _Py_perfmap_jit_callbacks = { |
820 | | &perf_map_jit_init, // Initialization function |
821 | | &perf_map_jit_write_entry, // Event writing function |
822 | | &perf_map_jit_fini, // Cleanup function |
823 | | }; |
824 | | |
825 | | #endif /* PY_HAVE_PERF_TRAMPOLINE */ |