/src/cpython/Python/perf_jit_trampoline.c
Line | Count | Source |
1 | | /* |
2 | | * Python Perf Trampoline Support - JIT Dump Implementation |
3 | | * |
4 | | * This file implements the perf jitdump API for Python's performance profiling |
5 | | * integration. It allows perf (Linux performance analysis tool) to understand |
6 | | * and profile dynamically generated Python bytecode by creating JIT dump files |
7 | | * that perf can inject into its analysis. |
8 | | * |
9 | | * |
10 | | * IMPORTANT: This file exports specific callback functions that are part of |
11 | | * Python's internal API. Do not modify the function signatures or behavior |
12 | | * of exported functions without coordinating with the Python core team. |
13 | | * |
14 | | * Usually the binary and libraries are mapped in separate regions like below:
15 | | * |
16 | | * address -> |
17 | | * --+---------------------+--//--+---------------------+-- |
18 | | * | .text | .data | ... | | .text | .data | ... | |
19 | | * --+---------------------+--//--+---------------------+-- |
20 | | * myprog libc.so |
21 | | * |
22 | | * So it is easy and straightforward to find the mapped binary or library for a given
23 | | * address.
24 | | * |
25 | | * For JIT code, however, the code arena only covers the code section, while the
26 | | * resulting DSOs (which are generated by perf inject -j) contain ELF headers and
27 | | * unwind info too. This produces the following address space with synthesized
28 | | * MMAP events. Let's say there is a sample between addresses B and C.
29 | | * |
30 | | * sample |
31 | | * | |
32 | | * address -> A B v C |
33 | | * --------------------------------------------------------------------------------------------------- |
34 | | * /tmp/jitted-PID-0.so | (headers) | .text | unwind info | |
35 | | * /tmp/jitted-PID-1.so | (headers) | .text | unwind info | |
36 | | * /tmp/jitted-PID-2.so | (headers) | .text | unwind info | |
37 | | * ... |
38 | | * --------------------------------------------------------------------------------------------------- |
39 | | * |
40 | | * If perf only maps the .text section, it finds jitted-PID-1.so but cannot see
41 | | * the unwind info. If it maps both the .text and unwind sections, the sample
42 | | * could be attributed to either jitted-PID-0.so or jitted-PID-1.so, and it is
43 | | * ambiguous which one is right. So, to make perf happy, we use non-overlapping
44 | | * ranges for each DSO:
45 | | * |
46 | | * address ->
47 | | * -------------------------------------------------------------------------------------------------------
48 | | * /tmp/jitted-PID-0.so | (headers) | .text | unwind info |
49 | | * /tmp/jitted-PID-1.so                                    | (headers) | .text | unwind info |
50 | | * /tmp/jitted-PID-2.so                                                               | (headers) | .text | unwind info |
51 | | * ...
52 | | * -------------------------------------------------------------------------------------------------------
53 | | * |
54 | | * As the trampolines are constant, we add a constant padding, but in general the padding needs to be the
55 | | * size of the unwind info rounded up to 16 bytes. For our trampolines this is 0x50.
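      | | *
      | | * For reference, a typical workflow that consumes this file looks roughly like
      | | * the following (a sketch only; exact flags depend on the perf and CPython
      | | * versions in use):
      | | *
      | | *     # record with a monotonic clock (-k 1) so samples line up with the jitdump
      | | *     perf record -F 9999 -g -k 1 -o perf.data -- python my_script.py
      | | *     # merge the jitdump into the recording, synthesizing /tmp/jitted-PID-N.so DSOs
      | | *     perf inject -j -i perf.data -o perf.jit.data
      | | *     # report with the Python trampoline frames resolved
      | | *     perf report -i perf.jit.data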
56 | | */ |
57 | | |
58 | | |
59 | | |
60 | | #include "Python.h" |
61 | | #include "pycore_ceval.h" // _PyPerf_Callbacks |
62 | | #include "pycore_frame.h" |
63 | | #include "pycore_interp.h" |
64 | | #include "pycore_runtime.h" // _PyRuntime |
65 | | |
66 | | #ifdef PY_HAVE_PERF_TRAMPOLINE |
67 | | |
68 | | /* Standard library includes for perf jitdump implementation */ |
69 | | #include <elf.h> // ELF architecture constants |
70 | | #include <fcntl.h> // File control operations |
71 | | #include <stdio.h> // Standard I/O operations |
72 | | #include <stdlib.h> // Standard library functions |
73 | | #include <sys/mman.h> // Memory mapping functions (mmap) |
74 | | #include <sys/types.h> // System data types |
75 | | #include <unistd.h> // System calls (sysconf, getpid) |
76 | | #include <sys/time.h> // Time functions (gettimeofday) |
77 | | #include <sys/syscall.h> // System call interface |
78 | | |
79 | | // ============================================================================= |
80 | | // CONSTANTS AND CONFIGURATION |
81 | | // ============================================================================= |
82 | | |
83 | | /* |
84 | | * Memory layout considerations for perf jitdump: |
85 | | * |
86 | | * Perf expects non-overlapping memory regions for each JIT-compiled function. |
87 | | * When perf processes the jitdump file, it creates synthetic DSO (Dynamic |
88 | | * Shared Object) files that contain: |
89 | | * - ELF headers |
90 | | * - .text section (actual machine code) |
91 | | * - Unwind information (for stack traces) |
92 | | * |
93 | | * To ensure proper address space layout, we add padding between code regions. |
94 | | * This prevents address conflicts when perf maps the synthesized DSOs. |
95 | | * |
96 | | * Memory layout example: |
97 | | * /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding] |
98 | | * /tmp/jitted-PID-1.so: [headers][.text][unwind_info][padding] |
99 | | * |
100 | | * The padding size (0x100) is chosen to accommodate typical unwind info sizes |
101 | | * while maintaining 16-byte alignment requirements. |
102 | | */ |
103 | 0 | #define PERF_JIT_CODE_PADDING 0x100 |
104 | | |
105 | | /* Convenient access to the global trampoline API state */ |
106 | 0 | #define trampoline_api _PyRuntime.ceval.perf.trampoline_api |
107 | | |
108 | | /* Type aliases for clarity and portability */ |
109 | | typedef uint64_t uword; // Word-sized unsigned integer |
110 | | typedef const char* CodeComments; // Code comment strings |
111 | | |
112 | | /* Memory size constants */ |
113 | 0 | #define MB (1024 * 1024) // 1 Megabyte for buffer sizing |
114 | | |
115 | | // ============================================================================= |
116 | | // ARCHITECTURE-SPECIFIC DEFINITIONS |
117 | | // ============================================================================= |
118 | | |
119 | | /* |
120 | | * Returns the ELF machine architecture constant for the current platform. |
121 | | * This is required for the jitdump header to correctly identify the target |
122 | | * architecture for perf processing. |
123 | | * |
124 | | */ |
125 | 0 | static uint64_t GetElfMachineArchitecture(void) { |
126 | 0 | #if defined(__x86_64__) || defined(_M_X64) |
127 | 0 | return EM_X86_64; |
128 | | #elif defined(__i386__) || defined(_M_IX86) |
129 | | return EM_386; |
130 | | #elif defined(__aarch64__) |
131 | | return EM_AARCH64; |
132 | | #elif defined(__arm__) || defined(_M_ARM) |
133 | | return EM_ARM; |
134 | | #elif defined(__riscv) |
135 | | return EM_RISCV; |
136 | | #else |
137 | | Py_UNREACHABLE(); // Unsupported architecture - should never reach here |
138 | | return 0; |
139 | | #endif |
140 | 0 | } |
141 | | |
142 | | // ============================================================================= |
143 | | // PERF JITDUMP DATA STRUCTURES |
144 | | // ============================================================================= |
145 | | |
146 | | /* |
147 | | * Perf jitdump file format structures |
148 | | * |
149 | | * These structures define the binary format that perf expects for JIT dump files. |
150 | | * The format is documented in the Linux perf tools source code and must match |
151 | | * exactly for proper perf integration. |
152 | | */ |
153 | | |
154 | | /* |
155 | | * Jitdump file header - written once at the beginning of each jitdump file |
156 | | * Contains metadata about the process and jitdump format version |
157 | | */ |
158 | | typedef struct { |
159 | | uint32_t magic; // Magic number (0x4A695444 = "JiTD") |
160 | | uint32_t version; // Jitdump format version (currently 1) |
161 | | uint32_t size; // Size of this header structure |
162 | | uint32_t elf_mach_target; // Target architecture (from GetElfMachineArchitecture) |
163 | | uint32_t reserved; // Reserved field (must be 0) |
164 | | uint32_t process_id; // Process ID of the JIT compiler |
165 | | uint64_t time_stamp; // Timestamp when jitdump was created |
166 | | uint64_t flags; // Feature flags (currently unused) |
167 | | } Header; |
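      | |
      | | /*
      | |  * Layout note (informational): six uint32_t fields followed by two uint64_t
      | |  * fields occupy 6*4 + 2*8 = 40 bytes with no internal padding on LP64 targets,
      | |  * so sizeof(Header) == 40 is what perf_map_jit_write_header() stores in the
      | |  * `size` field.
      | |  */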
168 | | |
169 | | /* |
170 | | * Perf event types supported by the jitdump format |
171 | | * Each event type has a corresponding structure format |
172 | | */ |
173 | | enum PerfEvent { |
174 | | PerfLoad = 0, // Code load event (new JIT function) |
175 | | PerfMove = 1, // Code move event (function relocated) |
176 | | PerfDebugInfo = 2, // Debug information event |
177 | | PerfClose = 3, // JIT session close event |
178 | | PerfUnwindingInfo = 4 // Stack unwinding information event |
179 | | }; |
180 | | |
181 | | /* |
182 | | * Base event structure - common header for all perf events |
183 | | * Every event in the jitdump file starts with this structure |
184 | | */ |
185 | | struct BaseEvent { |
186 | | uint32_t event; // Event type (from PerfEvent enum) |
187 | | uint32_t size; // Total size of this event including payload |
188 | | uint64_t time_stamp; // Timestamp when event occurred |
189 | | }; |
190 | | |
191 | | /* |
192 | | * Code load event - indicates a new JIT-compiled function is available |
193 | | * This is the most important event type for Python profiling |
194 | | */ |
195 | | typedef struct { |
196 | | struct BaseEvent base; // Common event header |
197 | | uint32_t process_id; // Process ID where code was generated |
198 | | uint32_t thread_id; // Thread ID where code was generated |
199 | | uint64_t vma; // Virtual memory address where code is loaded |
200 | | uint64_t code_address; // Address of the actual machine code |
201 | | uint64_t code_size; // Size of the machine code in bytes |
202 | | uint64_t code_id; // Unique identifier for this code region |
203 | | /* Followed by: |
204 | | * - null-terminated function name string |
205 | | * - raw machine code bytes |
206 | | */ |
207 | | } CodeLoadEvent; |
208 | | |
209 | | /* |
210 | | * Code unwinding information event - provides DWARF data for stack traces |
211 | | * Essential for proper stack unwinding during profiling |
212 | | */ |
213 | | typedef struct { |
214 | | struct BaseEvent base; // Common event header |
215 | | uint64_t unwind_data_size; // Size of the unwinding data |
216 | | uint64_t eh_frame_hdr_size; // Size of the EH frame header |
217 | | uint64_t mapped_size; // Total mapped size (with padding) |
218 | | /* Followed by: |
219 | | * - EH frame header |
220 | | * - DWARF unwinding information |
221 | | * - Padding to alignment boundary |
222 | | */ |
223 | | } CodeUnwindingInfoEvent; |
224 | | |
225 | | // ============================================================================= |
226 | | // GLOBAL STATE MANAGEMENT |
227 | | // ============================================================================= |
228 | | |
229 | | /* |
230 | | * Global state for the perf jitdump implementation |
231 | | * |
232 | | * This structure maintains all the state needed for generating jitdump files. |
233 | | * It's designed as a singleton since there's typically only one jitdump file |
234 | | * per Python process. |
235 | | */ |
236 | | typedef struct { |
237 | | FILE* perf_map; // File handle for the jitdump file |
238 | | PyThread_type_lock map_lock; // Thread synchronization lock |
239 | | void* mapped_buffer; // Memory-mapped region (signals perf we're active) |
240 | | size_t mapped_size; // Size of the mapped region |
241 | | int code_id; // Counter for unique code region identifiers |
242 | | } PerfMapJitState; |
243 | | |
244 | | /* Global singleton instance */ |
245 | | static PerfMapJitState perf_jit_map_state; |
246 | | |
247 | | // ============================================================================= |
248 | | // TIME UTILITIES |
249 | | // ============================================================================= |
250 | | |
251 | | /* Time conversion constant */ |
252 | | static const intptr_t nanoseconds_per_second = 1000000000; |
253 | | |
254 | | /* |
255 | | * Get current monotonic time in nanoseconds |
256 | | * |
257 | | * Monotonic time is preferred for event timestamps because it's not affected |
258 | | * by system clock adjustments. This ensures consistent timing relationships |
259 | | * between events even if the system clock is changed. |
260 | | * |
261 | | * Returns: Current monotonic time in nanoseconds since an arbitrary epoch |
262 | | */ |
263 | 0 | static int64_t get_current_monotonic_ticks(void) { |
264 | 0 | struct timespec ts; |
265 | 0 | if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) { |
266 | 0 | Py_UNREACHABLE(); // Should never fail on supported systems |
267 | 0 | return 0; |
268 | 0 | } |
269 | | |
270 | | /* Convert to nanoseconds for maximum precision */ |
271 | 0 | int64_t result = ts.tv_sec; |
272 | 0 | result *= nanoseconds_per_second; |
273 | 0 | result += ts.tv_nsec; |
274 | 0 | return result; |
275 | 0 | } |
276 | | |
277 | | /* |
278 | | * Get current wall clock time in microseconds |
279 | | * |
280 | | * Used for the jitdump file header timestamp. Unlike monotonic time, |
281 | | * this represents actual wall clock time that can be correlated with |
282 | | * other system events. |
283 | | * |
284 | | * Returns: Current time in microseconds since Unix epoch |
285 | | */ |
286 | 0 | static int64_t get_current_time_microseconds(void) { |
287 | 0 | struct timeval tv; |
288 | 0 | if (gettimeofday(&tv, NULL) < 0) { |
289 | 0 | Py_UNREACHABLE(); // Should never fail on supported systems |
290 | 0 | return 0; |
291 | 0 | } |
292 | 0 | return ((int64_t)(tv.tv_sec) * 1000000) + tv.tv_usec; |
293 | 0 | } |
294 | | |
295 | | // ============================================================================= |
296 | | // UTILITY FUNCTIONS |
297 | | // ============================================================================= |
298 | | |
299 | | /* |
300 | | * Round up a value to the next multiple of a given number |
301 | | * |
302 | | * This is essential for maintaining proper alignment requirements in the |
303 | | * jitdump format. Many structures need to be aligned to specific boundaries |
304 | | * (typically 8 or 16 bytes) for efficient processing by perf. |
305 | | * |
306 | | * Args: |
307 | | * value: The value to round up |
308 | | * multiple: The multiple to round up to |
309 | | * |
310 | | * Returns: The smallest value >= input that is a multiple of 'multiple' |
311 | | */ |
312 | 0 | static size_t round_up(int64_t value, int64_t multiple) { |
313 | 0 | if (multiple == 0) { |
314 | 0 | return value; // Avoid division by zero |
315 | 0 | } |
316 | | |
317 | 0 | int64_t remainder = value % multiple; |
318 | 0 | if (remainder == 0) { |
319 | 0 | return value; // Already aligned |
320 | 0 | } |
321 | | |
322 | | /* Calculate how much to add to reach the next multiple */ |
323 | 0 | int64_t difference = multiple - remainder; |
324 | 0 | int64_t rounded_up_value = value + difference; |
325 | |
326 | 0 | return rounded_up_value; |
327 | 0 | } |
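      | |
      | | /*
      | |  * Worked example (illustrative): with a 16-byte alignment requirement,
      | |  *     round_up(0x43, 16) == 0x50    (0x43 % 16 == 3, so 16 - 3 = 13 is added)
      | |  *     round_up(0x40, 16) == 0x40    (already a multiple of 16)
      | |  * which matches the 0x50 figure quoted for the trampoline unwind info in the
      | |  * header comment of this file.
      | |  */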
328 | | |
329 | | // ============================================================================= |
330 | | // FILE I/O UTILITIES |
331 | | // ============================================================================= |
332 | | |
333 | | /* |
334 | | * Write data to the jitdump file with error handling |
335 | | * |
336 | | * This function ensures that all data is written to the file, handling |
337 | | * partial writes that can occur with large buffers or when the system |
338 | | * is under load. |
339 | | * |
340 | | * Args: |
341 | | * buffer: Pointer to data to write |
342 | | * size: Number of bytes to write |
343 | | */ |
344 | 0 | static void perf_map_jit_write_fully(const void* buffer, size_t size) { |
345 | 0 | FILE* out_file = perf_jit_map_state.perf_map; |
346 | 0 | const char* ptr = (const char*)(buffer); |
347 | |
348 | 0 | while (size > 0) { |
349 | 0 | const size_t written = fwrite(ptr, 1, size, out_file); |
350 | 0 | if (written == 0) { |
351 | 0 | Py_UNREACHABLE(); // Write failure - should be very rare |
352 | 0 | break; |
353 | 0 | } |
354 | 0 | size -= written; |
355 | 0 | ptr += written; |
356 | 0 | } |
357 | 0 | } |
358 | | |
359 | | /* |
360 | | * Write the jitdump file header |
361 | | * |
362 | | * The header must be written exactly once at the beginning of each jitdump |
363 | | * file. It provides metadata that perf uses to parse the rest of the file. |
364 | | * |
365 | | * Args: |
366 | | * pid: Process ID to include in the header |
367 | | * out_file: File handle to write to (currently unused, uses global state) |
368 | | */ |
369 | 0 | static void perf_map_jit_write_header(int pid, FILE* out_file) { |
370 | 0 | Header header; |
371 | | |
372 | | /* Initialize header with required values */ |
373 | 0 | header.magic = 0x4A695444; // "JiTD" magic number |
374 | 0 | header.version = 1; // Current jitdump version |
375 | 0 | header.size = sizeof(Header); // Header size for validation |
376 | 0 | header.elf_mach_target = GetElfMachineArchitecture(); // Target architecture |
377 | 0 | header.process_id = pid; // Process identifier |
378 | 0 | header.time_stamp = get_current_time_microseconds(); // Creation time |
379 | 0 | header.flags = 0; // No special flags currently used |
380 | |
381 | 0 | perf_map_jit_write_fully(&header, sizeof(header)); |
382 | 0 | } |
383 | | |
384 | | // ============================================================================= |
385 | | // DWARF CONSTANTS AND UTILITIES |
386 | | // ============================================================================= |
387 | | |
388 | | /* |
389 | | * DWARF (Debug With Arbitrary Record Formats) constants |
390 | | * |
391 | | * DWARF is a debugging data format used to provide stack unwinding information. |
392 | | * These constants define the various encoding types and opcodes used in |
393 | | * DWARF Call Frame Information (CFI) records. |
394 | | */ |
395 | | |
396 | | /* DWARF Call Frame Information version */ |
397 | | #define DWRF_CIE_VERSION 1 |
398 | | |
399 | | /* DWARF CFA (Call Frame Address) opcodes */ |
400 | | enum { |
401 | | DWRF_CFA_nop = 0x0, // No operation |
402 | | DWRF_CFA_offset_extended = 0x5, // Extended offset instruction |
403 | | DWRF_CFA_def_cfa = 0xc, // Define CFA rule |
404 | | DWRF_CFA_def_cfa_offset = 0xe, // Define CFA offset |
405 | | DWRF_CFA_offset_extended_sf = 0x11, // Extended signed offset |
406 | | DWRF_CFA_advance_loc = 0x40, // Advance location counter |
407 | | DWRF_CFA_offset = 0x80 // Simple offset instruction |
408 | | }; |
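      | |
      | | /*
      | |  * Note (standard DWARF CFI encoding): DWRF_CFA_advance_loc and DWRF_CFA_offset
      | |  * are "packed" opcodes that carry their first operand in the low 6 bits of the
      | |  * opcode byte itself, e.g.
      | |  *     DWRF_CFA_advance_loc | 4        -> byte 0x44, advance the location by
      | |  *                                        4 * code_alignment_factor bytes
      | |  *     DWRF_CFA_offset | DWRF_REG_RA   -> RA is saved at the offset given by the
      | |  *                                        following ULEB128 operand, scaled by
      | |  *                                        the data_alignment_factor
      | |  * This is why the FDE emission further down ORs deltas and register numbers
      | |  * directly into these opcodes.
      | |  */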
409 | | |
410 | | /* DWARF Exception Handling pointer encodings */ |
411 | | enum { |
412 | | DWRF_EH_PE_absptr = 0x00, // Absolute pointer |
413 | | DWRF_EH_PE_omit = 0xff, // Omitted value |
414 | | |
415 | | /* Data type encodings */ |
416 | | DWRF_EH_PE_uleb128 = 0x01, // Unsigned LEB128 |
417 | | DWRF_EH_PE_udata2 = 0x02, // Unsigned 2-byte |
418 | | DWRF_EH_PE_udata4 = 0x03, // Unsigned 4-byte |
419 | | DWRF_EH_PE_udata8 = 0x04, // Unsigned 8-byte |
420 | | DWRF_EH_PE_sleb128 = 0x09, // Signed LEB128 |
421 | | DWRF_EH_PE_sdata2 = 0x0a, // Signed 2-byte |
422 | | DWRF_EH_PE_sdata4 = 0x0b, // Signed 4-byte |
423 | | DWRF_EH_PE_sdata8 = 0x0c, // Signed 8-byte |
424 | | DWRF_EH_PE_signed = 0x08, // Signed flag |
425 | | |
426 | | /* Reference type encodings */ |
427 | | DWRF_EH_PE_pcrel = 0x10, // PC-relative |
428 | | DWRF_EH_PE_textrel = 0x20, // Text-relative |
429 | | DWRF_EH_PE_datarel = 0x30, // Data-relative |
430 | | DWRF_EH_PE_funcrel = 0x40, // Function-relative |
431 | | DWRF_EH_PE_aligned = 0x50, // Aligned |
432 | | DWRF_EH_PE_indirect = 0x80 // Indirect |
433 | | }; |
434 | | |
435 | | /* Additional DWARF constants for debug information */ |
436 | | enum { DWRF_TAG_compile_unit = 0x11 }; |
437 | | enum { DWRF_children_no = 0, DWRF_children_yes = 1 }; |
438 | | enum { |
439 | | DWRF_AT_name = 0x03, // Name attribute |
440 | | DWRF_AT_stmt_list = 0x10, // Statement list |
441 | | DWRF_AT_low_pc = 0x11, // Low PC address |
442 | | DWRF_AT_high_pc = 0x12 // High PC address |
443 | | }; |
444 | | enum { |
445 | | DWRF_FORM_addr = 0x01, // Address form |
446 | | DWRF_FORM_data4 = 0x06, // 4-byte data |
447 | | DWRF_FORM_string = 0x08 // String form |
448 | | }; |
449 | | |
450 | | /* Line number program opcodes */ |
451 | | enum { |
452 | | DWRF_LNS_extended_op = 0, // Extended opcode |
453 | | DWRF_LNS_copy = 1, // Copy operation |
454 | | DWRF_LNS_advance_pc = 2, // Advance program counter |
455 | | DWRF_LNS_advance_line = 3 // Advance line number |
456 | | }; |
457 | | |
458 | | /* Line number extended opcodes */ |
459 | | enum { |
460 | | DWRF_LNE_end_sequence = 1, // End of sequence |
461 | | DWRF_LNE_set_address = 2 // Set address |
462 | | }; |
463 | | |
464 | | /* |
465 | | * Architecture-specific DWARF register numbers |
466 | | * |
467 | | * These constants define the register numbering scheme used by DWARF |
468 | | * for each supported architecture. The numbers must match the ABI |
469 | | * specification for proper stack unwinding. |
470 | | */ |
471 | | enum { |
472 | | #ifdef __x86_64__ |
473 | | /* x86_64 register numbering (note: order is defined by x86_64 ABI) */ |
474 | | DWRF_REG_AX, // RAX |
475 | | DWRF_REG_DX, // RDX |
476 | | DWRF_REG_CX, // RCX |
477 | | DWRF_REG_BX, // RBX |
478 | | DWRF_REG_SI, // RSI |
479 | | DWRF_REG_DI, // RDI |
480 | | DWRF_REG_BP, // RBP |
481 | | DWRF_REG_SP, // RSP |
482 | | DWRF_REG_8, // R8 |
483 | | DWRF_REG_9, // R9 |
484 | | DWRF_REG_10, // R10 |
485 | | DWRF_REG_11, // R11 |
486 | | DWRF_REG_12, // R12 |
487 | | DWRF_REG_13, // R13 |
488 | | DWRF_REG_14, // R14 |
489 | | DWRF_REG_15, // R15 |
490 | | DWRF_REG_RA, // Return address (RIP) |
491 | | #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) |
492 | | /* AArch64 register numbering */ |
493 | | DWRF_REG_FP = 29, // Frame Pointer |
494 | | DWRF_REG_RA = 30, // Link register (return address) |
495 | | DWRF_REG_SP = 31, // Stack pointer |
496 | | #else |
497 | | # error "Unsupported target architecture" |
498 | | #endif |
499 | | }; |
500 | | |
501 | | /* DWARF encoding constants used in EH frame headers */ |
502 | | static const uint8_t DwarfUData4 = 0x03; // Unsigned 4-byte data |
503 | | static const uint8_t DwarfSData4 = 0x0b; // Signed 4-byte data |
504 | | static const uint8_t DwarfPcRel = 0x10; // PC-relative encoding |
505 | | static const uint8_t DwarfDataRel = 0x30; // Data-relative encoding |
506 | | |
507 | | // ============================================================================= |
508 | | // ELF OBJECT CONTEXT |
509 | | // ============================================================================= |
510 | | |
511 | | /* |
512 | | * Context for building ELF/DWARF structures |
513 | | * |
514 | | * This structure maintains state while constructing DWARF unwind information. |
515 | | * It acts as a simple buffer manager with pointers to track current position |
516 | | * and important landmarks within the buffer. |
517 | | */ |
518 | | typedef struct ELFObjectContext { |
519 | | uint8_t* p; // Current write position in buffer |
520 | | uint8_t* startp; // Start of buffer (for offset calculations) |
521 | | uint8_t* eh_frame_p; // Start of EH frame data (for relative offsets) |
522 | | uint32_t code_size; // Size of the code being described |
523 | | } ELFObjectContext; |
524 | | |
525 | | /* |
526 | | * EH Frame Header structure for DWARF unwinding |
527 | | * |
528 | | * This structure provides metadata about the DWARF unwinding information |
529 | | * that follows. It's required by the perf jitdump format to enable proper |
530 | | * stack unwinding during profiling. |
531 | | */ |
532 | | typedef struct { |
533 | | unsigned char version; // EH frame version (always 1) |
534 | | unsigned char eh_frame_ptr_enc; // Encoding of EH frame pointer |
535 | | unsigned char fde_count_enc; // Encoding of FDE count |
536 | | unsigned char table_enc; // Encoding of table entries |
537 | | int32_t eh_frame_ptr; // Pointer to EH frame data |
538 | | int32_t eh_fde_count; // Number of FDEs (Frame Description Entries) |
539 | | int32_t from; // Start address of code range |
540 | | int32_t to; // End address of code range |
541 | | } EhFrameHeader; |
542 | | |
543 | | // ============================================================================= |
544 | | // DWARF GENERATION UTILITIES |
545 | | // ============================================================================= |
546 | | |
547 | | /* |
548 | | * Append a null-terminated string to the ELF context buffer |
549 | | * |
550 | | * Args: |
551 | | * ctx: ELF object context |
552 | | * str: String to append (must be null-terminated) |
553 | | * |
554 | | * Returns: Offset from start of buffer where string was written |
555 | | */ |
556 | 0 | static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) { |
557 | 0 | uint8_t* p = ctx->p; |
558 | 0 | uint32_t ofs = (uint32_t)(p - ctx->startp); |
559 | | |
560 | | /* Copy string including null terminator */ |
561 | 0 | do { |
562 | 0 | *p++ = (uint8_t)*str; |
563 | 0 | } while (*str++); |
564 | |
565 | 0 | ctx->p = p; |
566 | 0 | return ofs; |
567 | 0 | } |
568 | | |
569 | | /* |
570 | | * Append a SLEB128 (Signed Little Endian Base 128) value |
571 | | * |
572 | | * SLEB128 is a variable-length encoding used extensively in DWARF. |
573 | | * It efficiently encodes small numbers in fewer bytes. |
574 | | * |
575 | | * Args: |
576 | | * ctx: ELF object context |
577 | | * v: Signed value to encode |
578 | | */ |
579 | 0 | static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) { |
580 | 0 | uint8_t* p = ctx->p; |
581 | | |
582 | | /* Encode 7 bits at a time, with continuation bit in MSB */ |
583 | 0 | for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) { |
584 | 0 | *p++ = (uint8_t)((v & 0x7f) | 0x80); // Set continuation bit |
585 | 0 | } |
586 | 0 | *p++ = (uint8_t)(v & 0x7f); // Final byte without continuation bit |
587 | |
588 | 0 | ctx->p = p; |
589 | 0 | } |
590 | | |
591 | | /* |
592 | | * Append a ULEB128 (Unsigned Little Endian Base 128) value |
593 | | * |
594 | | * Similar to SLEB128 but for unsigned values. |
595 | | * |
596 | | * Args: |
597 | | * ctx: ELF object context |
598 | | * v: Unsigned value to encode |
599 | | */ |
600 | 0 | static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) { |
601 | 0 | uint8_t* p = ctx->p; |
602 | | |
603 | | /* Encode 7 bits at a time, with continuation bit in MSB */ |
604 | 0 | for (; v >= 0x80; v >>= 7) { |
605 | 0 | *p++ = (char)((v & 0x7f) | 0x80); // Set continuation bit |
606 | 0 | } |
607 | 0 | *p++ = (char)v; // Final byte without continuation bit |
608 | |
609 | 0 | ctx->p = p; |
610 | 0 | } |
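      | |
      | | /*
      | |  * Encoding examples (illustrative):
      | |  *     ULEB128(300) -> bytes 0xAC 0x02  (low 7 bits 0x2C plus continuation bit,
      | |  *                                       then the remaining bits, 0x02)
      | |  *     SLEB128(-8)  -> byte  0x78       (fits in one byte; bit 6 carries the sign)
      | |  * Unsigned values below 128 and signed values in [-64, 63] always fit in a
      | |  * single byte, which covers most operands emitted by this file.
      | |  */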
611 | | |
612 | | /* |
613 | | * Macros for generating DWARF structures |
614 | | * |
615 | | * These macros provide a convenient way to write various data types |
616 | | * to the DWARF buffer while automatically advancing the pointer. |
617 | | */ |
618 | | #define DWRF_U8(x) (*p++ = (x)) // Write unsigned 8-bit |
619 | | #define DWRF_I8(x) (*(int8_t*)p = (x), p++) // Write signed 8-bit |
620 | | #define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2) // Write unsigned 16-bit |
621 | | #define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4) // Write unsigned 32-bit |
622 | | #define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) // Write address |
623 | | #define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) // Write ULEB128 |
624 | | #define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) // Write SLEB128 |
625 | | #define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) // Write string |
626 | | |
627 | | /* Align to specified boundary with NOP instructions */ |
628 | | #define DWRF_ALIGNNOP(s) \ |
629 | | while ((uintptr_t)p & ((s)-1)) { \ |
630 | | *p++ = DWRF_CFA_nop; \ |
631 | | } |
632 | | |
633 | | /* Write a DWARF section with automatic size calculation */ |
634 | | #define DWRF_SECTION(name, stmt) \ |
635 | 0 | { \ |
636 | 0 | uint32_t* szp_##name = (uint32_t*)p; \ |
637 | 0 | p += 4; \ |
638 | 0 | stmt; \ |
639 | 0 | *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \ |
640 | 0 | } |
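      | |
      | | /*
      | |  * For example, DWRF_SECTION(CIE, ...) reserves a 4-byte length word, emits the
      | |  * body, then back-patches the word with the body size excluding the word
      | |  * itself, which is exactly the framing DWARF expects for CIE and FDE records.
      | |  */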
641 | | |
642 | | // ============================================================================= |
643 | | // DWARF EH FRAME GENERATION |
644 | | // ============================================================================= |
645 | | |
646 | | /* |
647 | | * Initialize DWARF .eh_frame section for a code region |
648 | | * |
649 | | * The .eh_frame section contains Call Frame Information (CFI) that describes |
650 | | * how to unwind the stack at any point in the code. This is essential for |
651 | | * proper profiling as it allows perf to generate accurate call graphs. |
652 | | * |
653 | | * The function generates two main components: |
654 | | * 1. CIE (Common Information Entry) - describes calling conventions |
655 | | * 2. FDE (Frame Description Entry) - describes specific function unwinding |
656 | | * |
657 | | * Args: |
658 | | * ctx: ELF object context containing code size and buffer pointers |
659 | | */ |
660 | 0 | static void elf_init_ehframe(ELFObjectContext* ctx) { |
661 | 0 | uint8_t* p = ctx->p; |
662 | 0 | uint8_t* framep = p; // Remember start of frame data |
663 | | |
664 | | /* |
665 | | * DWARF Unwind Table for Trampoline Function |
666 | | * |
667 | | * This section defines DWARF Call Frame Information (CFI) using encoded macros |
668 | | * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function |
669 | | * preserves and restores registers. This is used by profiling tools (e.g., `perf`) |
670 | | * and debuggers for stack unwinding in JIT-compiled code. |
671 | | * |
672 | | * ------------------------------------------------- |
673 | | * TO REGENERATE THIS TABLE FROM GCC OBJECTS: |
674 | | * ------------------------------------------------- |
675 | | * |
676 | | * 1. Create a trampoline source file (e.g., `trampoline.c`): |
677 | | * |
678 | | * #include <Python.h> |
679 | | * typedef PyObject* (*py_evaluator)(void*, void*, int); |
680 | | * PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) { |
681 | | * return evaluator(ts, f, throwflag); |
682 | | * } |
683 | | * |
684 | | * 2. Compile to an object file with frame pointer preservation: |
685 | | * |
686 | | * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c |
687 | | * |
688 | | * 3. Extract DWARF unwind info from the object file: |
689 | | * |
690 | | * readelf -w trampoline.o |
691 | | * |
692 | | * Example output from `.eh_frame`: |
693 | | * |
694 | | * 00000000 CIE |
695 | | * Version: 1 |
696 | | * Augmentation: "zR" |
697 | | * Code alignment factor: 4 |
698 | | * Data alignment factor: -8 |
699 | | * Return address column: 30 |
700 | | * DW_CFA_def_cfa: r31 (sp) ofs 0 |
701 | | * |
702 | | * 00000014 FDE cie=00000000 pc=0..14 |
703 | | * DW_CFA_advance_loc: 4 |
704 | | * DW_CFA_def_cfa_offset: 16 |
705 | | * DW_CFA_offset: r29 at cfa-16 |
706 | | * DW_CFA_offset: r30 at cfa-8 |
707 | | * DW_CFA_advance_loc: 12 |
708 | | * DW_CFA_restore: r30 |
709 | | * DW_CFA_restore: r29 |
710 | | * DW_CFA_def_cfa_offset: 0 |
711 | | * |
712 | | * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`. |
713 | | * |
714 | | * ---------------------------------- |
715 | | * HOW TO TRANSLATE TO DWRF_* MACROS: |
716 | | * ---------------------------------- |
717 | | * |
718 | | * After compiling your trampoline with: |
719 | | * |
720 | | * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c |
721 | | * |
722 | | * run: |
723 | | * |
724 | | * readelf -w trampoline.o |
725 | | * |
726 | | * to inspect the generated `.eh_frame` data. You will see two main components: |
727 | | * |
728 | | * 1. A CIE (Common Information Entry): shared configuration used by all FDEs. |
729 | | * 2. An FDE (Frame Description Entry): function-specific unwind instructions. |
730 | | * |
731 | | * --------------------- |
732 | | * Translating the CIE: |
733 | | * --------------------- |
734 | | * From `readelf -w`, you might see: |
735 | | * |
736 | | * 00000000 0000000000000010 00000000 CIE |
737 | | * Version: 1 |
738 | | * Augmentation: "zR" |
739 | | * Code alignment factor: 4 |
740 | | * Data alignment factor: -8 |
741 | | * Return address column: 30 |
742 | | * Augmentation data: 1b |
743 | | * DW_CFA_def_cfa: r31 (sp) ofs 0 |
744 | | * |
745 | | * Map this to: |
746 | | * |
747 | | * DWRF_SECTION(CIE, |
748 | | * DWRF_U32(0); // CIE ID (always 0 for CIEs) |
749 | | * DWRF_U8(DWRF_CIE_VERSION); // Version: 1 |
750 | | * DWRF_STR("zR"); // Augmentation string "zR" |
751 | | * DWRF_UV(4); // Code alignment factor = 4 |
752 | | * DWRF_SV(-8); // Data alignment factor = -8 |
753 | | * DWRF_U8(DWRF_REG_RA); // Return address register (e.g., x30 = 30) |
754 | | * DWRF_UV(1); // Augmentation data length = 1 |
755 | | * DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers |
756 | | * |
757 | | * DWRF_U8(DWRF_CFA_def_cfa); // DW_CFA_def_cfa |
758 | | * DWRF_UV(DWRF_REG_SP); // Register: SP (r31) |
759 | | * DWRF_UV(0); // Offset = 0 |
760 | | * |
761 | | * DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer size boundary |
762 | | * ) |
763 | | * |
764 | | * Notes: |
765 | | * - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128. |
766 | | * - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants. |
767 | | * |
768 | | * --------------------- |
769 | | * Translating the FDE: |
770 | | * --------------------- |
771 | | * From `readelf -w`: |
772 | | * |
773 | | * 00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014 |
774 | | * DW_CFA_advance_loc: 4 |
775 | | * DW_CFA_def_cfa_offset: 16 |
776 | | * DW_CFA_offset: r29 at cfa-16 |
777 | | * DW_CFA_offset: r30 at cfa-8 |
778 | | * DW_CFA_advance_loc: 12 |
779 | | * DW_CFA_restore: r30 |
780 | | * DW_CFA_restore: r29 |
781 | | * DW_CFA_def_cfa_offset: 0 |
782 | | * |
783 | | * Map the FDE header and instructions to: |
784 | | * |
785 | | * DWRF_SECTION(FDE, |
786 | | * DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (relative from here) |
787 | | * DWRF_U32(-0x30); // Initial PC-relative location of the code |
788 | | * DWRF_U32(ctx->code_size); // Code range covered by this FDE |
789 | | * DWRF_U8(0); // Augmentation data length (none) |
790 | | * |
791 | | * DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 unit (1 * 4 = 4 bytes) |
792 | | * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 |
793 | | * DWRF_UV(16); |
794 | | * |
795 | | * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer) |
796 | | * DWRF_UV(2); // At offset 2 * 8 = 16 bytes |
797 | | * |
798 | | * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address) |
799 | | * DWRF_UV(1); // At offset 1 * 8 = 8 bytes |
800 | | * |
801 | | * DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance location by 3 units (3 * 4 = 12 bytes) |
802 | | * |
803 | | * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore x30 |
804 | | * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore x29 |
805 | | * |
806 | | * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP |
807 | | * DWRF_UV(0); |
808 | | * ) |
809 | | * |
810 | | * To regenerate: |
811 | | * 1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE. |
812 | | * 2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as |
813 | | * the code is in a different address space every time. |
814 | | * 3. For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro: |
815 | | * - `DW_CFA_def_cfa_offset` → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value) |
816 | | * - `DW_CFA_offset: rX` → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset) |
817 | | * - `DW_CFA_restore: rX` → DWRF_U8(DWRF_CFA_offset | reg) // restore is same as reusing offset |
818 | | * - `DW_CFA_advance_loc: N` → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor)) |
819 | | * 4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers. |
820 | | * 5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment. |
821 | | */ |
822 | | |
823 | | /* |
824 | | * Emit DWARF EH CIE (Common Information Entry) |
825 | | * |
826 | | * The CIE describes the calling conventions and basic unwinding rules |
827 | | * that apply to all functions in this compilation unit. |
828 | | */ |
829 | 0 | DWRF_SECTION(CIE, |
830 | 0 | DWRF_U32(0); // CIE ID (0 indicates this is a CIE) |
831 | 0 | DWRF_U8(DWRF_CIE_VERSION); // CIE version (1) |
832 | 0 | DWRF_STR("zR"); // Augmentation string ("zR" = has LSDA) |
833 | 0 | DWRF_UV(1); // Code alignment factor |
834 | 0 | DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative) |
835 | 0 | DWRF_U8(DWRF_REG_RA); // Return address register number |
836 | 0 | DWRF_UV(1); // Augmentation data length |
837 | 0 | DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // FDE pointer encoding |
838 | | |
839 | | /* Initial CFI instructions - describe default calling convention */ |
840 | 0 | DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address) |
841 | 0 | DWRF_UV(DWRF_REG_SP); // CFA = SP register |
842 | 0 | DWRF_UV(sizeof(uintptr_t)); // CFA = SP + pointer_size |
843 | 0 | DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved |
844 | 0 | DWRF_UV(1); // At offset 1 from CFA |
845 | |
846 | 0 | DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary |
847 | 0 | ) |
848 | |
849 | 0 | ctx->eh_frame_p = p; // Remember start of FDE data |
850 | | |
851 | | /* |
852 | | * Emit DWARF EH FDE (Frame Description Entry) |
853 | | * |
854 | | * The FDE describes unwinding information specific to this function. |
855 | | * It references the CIE and provides function-specific CFI instructions. |
856 | | */ |
857 | 0 | DWRF_SECTION(FDE, |
858 | 0 | DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (backwards reference) |
859 | 0 | DWRF_U32(-0x30); // Machine code offset relative to .text |
860 | 0 | DWRF_U32(ctx->code_size); // Address range covered by this FDE (code length)
861 | 0 | DWRF_U8(0); // Augmentation data length (none) |
862 | | |
863 | | /* |
864 | | * Architecture-specific CFI instructions |
865 | | * |
866 | | * These instructions describe how registers are saved and restored |
867 | | * during function calls. Each architecture has different calling |
868 | | * conventions and register usage patterns. |
869 | | */ |
870 | 0 | #ifdef __x86_64__ |
871 | | /* x86_64 calling convention unwinding rules */ |
872 | | # if defined(__CET__) && (__CET__ & 1) |
873 | | DWRF_U8(DWRF_CFA_advance_loc | 8); // Advance location by 8 bytes when CET protection is enabled |
874 | | # else |
875 | 0 | DWRF_U8(DWRF_CFA_advance_loc | 4); // Advance location by 4 bytes |
876 | 0 | # endif |
877 | 0 | DWRF_U8(DWRF_CFA_def_cfa_offset); // Redefine CFA offset |
878 | 0 | DWRF_UV(16); // New offset: SP + 16 |
879 | 0 | DWRF_U8(DWRF_CFA_advance_loc | 6); // Advance location by 6 bytes |
880 | 0 | DWRF_U8(DWRF_CFA_def_cfa_offset); // Redefine CFA offset |
881 | 0 | DWRF_UV(8); // New offset: SP + 8 |
882 | | #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) |
883 | | /* AArch64 calling convention unwinding rules */ |
884 | | DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 instruction (stp x29, x30) |
885 | | DWRF_U8(DWRF_CFA_def_cfa_offset); // Redefine CFA offset |
886 | | DWRF_UV(16); // CFA = SP + 16 (stack pointer after push) |
887 | | DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Frame pointer (x29) saved |
888 | | DWRF_UV(2); // At offset 2 from CFA (2 * 8 = 16 bytes) |
889 | | DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Link register (x30) saved |
890 | | DWRF_UV(1); // At offset 1 from CFA (1 * 8 = 8 bytes) |
891 | | DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance by 3 instructions (mov x16, x3; mov x29, sp; ldp...) |
892 | | DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore frame pointer (x29) |
893 | | DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore link register (x30) |
894 | | DWRF_U8(DWRF_CFA_def_cfa_offset); // Final CFA adjustment |
895 | | DWRF_UV(0); // CFA = SP + 0 (stack restored) |
896 | | |
897 | | #else |
898 | | # error "Unsupported target architecture" |
899 | | #endif |
900 | |
901 | 0 | DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary |
902 | 0 | ) |
903 | |
904 | 0 | ctx->p = p; // Update context pointer to end of generated data |
905 | 0 | } |
906 | | |
907 | | // ============================================================================= |
908 | | // JITDUMP INITIALIZATION |
909 | | // ============================================================================= |
910 | | |
911 | | /* |
912 | | * Initialize the perf jitdump interface |
913 | | * |
914 | | * This function sets up everything needed to generate jitdump files: |
915 | | * 1. Creates the jitdump file with a unique name |
916 | | * 2. Maps the first page to signal perf that we're using the interface |
917 | | * 3. Writes the jitdump header |
918 | | * 4. Initializes synchronization primitives |
919 | | * |
920 | | * The memory mapping is crucial - perf detects jitdump files by scanning |
921 | | * for processes that have mapped files matching the pattern /tmp/jit-*.dump |
922 | | * |
923 | | * Returns: Pointer to initialized state, or NULL on failure |
924 | | */ |
925 | 0 | static void* perf_map_jit_init(void) { |
926 | 0 | char filename[100]; |
927 | 0 | int pid = getpid(); |
928 | | |
929 | | /* Create unique filename based on process ID */ |
930 | 0 | snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid); |
931 | | |
932 | | /* Create/open the jitdump file with appropriate permissions */ |
933 | 0 | const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666); |
934 | 0 | if (fd == -1) { |
935 | 0 | return NULL; // Failed to create file |
936 | 0 | } |
937 | | |
938 | | /* Get system page size for memory mapping */ |
939 | 0 | const long page_size = sysconf(_SC_PAGESIZE); |
940 | 0 | if (page_size == -1) { |
941 | 0 | close(fd); |
942 | 0 | return NULL; // Failed to get page size |
943 | 0 | } |
944 | | |
945 | | /* |
946 | | * Map the first page of the jitdump file |
947 | | * |
948 | | * This memory mapping serves as a signal to perf that this process |
949 | | * is generating JIT code. Perf scans /proc/.../maps looking for mapped |
950 | | * files that match the jitdump naming pattern. |
951 | | * |
952 | | * The mapping must be PROT_READ | PROT_EXEC to be detected by perf. |
953 | | */ |
954 | 0 | perf_jit_map_state.mapped_buffer = mmap( |
955 | 0 | NULL, // Let kernel choose address |
956 | 0 | page_size, // Map one page |
957 | 0 | PROT_READ | PROT_EXEC, // Read and execute permissions (required by perf) |
958 | 0 | MAP_PRIVATE, // Private mapping |
959 | 0 | fd, // File descriptor |
960 | 0 | 0 // Offset 0 (first page) |
961 | 0 | ); |
962 | |
963 | 0 | if (perf_jit_map_state.mapped_buffer == NULL) { |
964 | 0 | close(fd); |
965 | 0 | return NULL; // Memory mapping failed |
966 | 0 | } |
967 | | |
968 | 0 | perf_jit_map_state.mapped_size = page_size; |
969 | | |
970 | | /* Convert file descriptor to FILE* for easier I/O operations */ |
971 | 0 | perf_jit_map_state.perf_map = fdopen(fd, "w+"); |
972 | 0 | if (perf_jit_map_state.perf_map == NULL) { |
973 | 0 | close(fd); |
974 | 0 | return NULL; // Failed to create FILE* |
975 | 0 | } |
976 | | |
977 | | /* |
978 | | * Set up file buffering for better performance |
979 | | * |
980 | | * We use a large buffer (2MB) because jitdump files can be written |
981 | | * frequently during program execution. Buffering reduces system call |
982 | | * overhead and improves overall performance. |
983 | | */ |
984 | 0 | setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB); |
985 | | |
986 | | /* Write the jitdump file header */ |
987 | 0 | perf_map_jit_write_header(pid, perf_jit_map_state.perf_map); |
988 | | |
989 | | /* |
990 | | * Initialize thread synchronization lock |
991 | | * |
992 | | * Multiple threads may attempt to write to the jitdump file |
993 | | * simultaneously. This lock ensures thread-safe access to the |
994 | | * global jitdump state. |
995 | | */ |
996 | 0 | perf_jit_map_state.map_lock = PyThread_allocate_lock(); |
997 | 0 | if (perf_jit_map_state.map_lock == NULL) { |
998 | 0 | fclose(perf_jit_map_state.perf_map); |
999 | 0 | return NULL; // Failed to create lock |
1000 | 0 | } |
1001 | | |
1002 | | /* Initialize code ID counter */ |
1003 | 0 | perf_jit_map_state.code_id = 0; |
1004 | | |
1005 | | /* Configure trampoline API with padding information */ |
1006 | 0 | trampoline_api.code_padding = PERF_JIT_CODE_PADDING; |
1007 | |
1008 | 0 | return &perf_jit_map_state; |
1009 | 0 | } |
1010 | | |
1011 | | // ============================================================================= |
1012 | | // MAIN JITDUMP ENTRY WRITING |
1013 | | // ============================================================================= |
1014 | | |
1015 | | /* |
1016 | | * Write a complete jitdump entry for a Python function |
1017 | | * |
1018 | | * This is the main function called by Python's trampoline system whenever |
1019 | | * a new piece of JIT-compiled code needs to be recorded. It writes both |
1020 | | * the unwinding information and the code load event to the jitdump file. |
1021 | | * |
1022 | | * The function performs these steps: |
1023 | | * 1. Initialize jitdump system if not already done |
1024 | | * 2. Extract function name and filename from Python code object |
1025 | | * 3. Generate DWARF unwinding information |
1026 | | * 4. Write unwinding info event to jitdump file |
1027 | | * 5. Write code load event to jitdump file |
1028 | | * |
1029 | | * Args: |
1030 | | * state: Jitdump state (currently unused, uses global state) |
1031 | | * code_addr: Address where the compiled code resides |
1032 | | * code_size: Size of the compiled code in bytes |
1033 | | * co: Python code object containing metadata |
1034 | | * |
1035 | | * IMPORTANT: This function signature is part of Python's internal API |
1036 | | * and must not be changed without coordinating with core Python development. |
1037 | | */ |
1038 | | static void perf_map_jit_write_entry(void *state, const void *code_addr, |
1039 | | unsigned int code_size, PyCodeObject *co) |
1040 | 0 | { |
1041 | | /* Initialize jitdump system on first use */ |
1042 | 0 | if (perf_jit_map_state.perf_map == NULL) { |
1043 | 0 | void* ret = perf_map_jit_init(); |
1044 | 0 | if(ret == NULL){ |
1045 | 0 | return; // Initialization failed, silently abort |
1046 | 0 | } |
1047 | 0 | } |
1048 | | |
1049 | | /* |
1050 | | * Extract function information from Python code object |
1051 | | * |
1052 | | * We create a human-readable function name by combining the qualified |
1053 | | * name (includes class/module context) with the filename. This helps |
1054 | | * developers identify functions in perf reports. |
1055 | | */ |
1056 | 0 | const char *entry = ""; |
1057 | 0 | if (co->co_qualname != NULL) { |
1058 | 0 | entry = PyUnicode_AsUTF8(co->co_qualname); |
1059 | 0 | } |
1060 | |
1061 | 0 | const char *filename = ""; |
1062 | 0 | if (co->co_filename != NULL) { |
1063 | 0 | filename = PyUnicode_AsUTF8(co->co_filename); |
1064 | 0 | } |
1065 | | |
1066 | | /* |
1067 | | * Create formatted function name for perf display |
1068 | | * |
1069 | | * Format: "py::<function_name>:<filename>" |
1070 | | * The "py::" prefix helps identify Python functions in mixed-language |
1071 | | * profiles (e.g., when profiling C extensions alongside Python code). |
1072 | | */ |
1073 | 0 | size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1; |
1074 | 0 | char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size); |
1075 | 0 | if (perf_map_entry == NULL) { |
1076 | 0 | return; // Memory allocation failed |
1077 | 0 | } |
1078 | 0 | snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename); |
1079 | |
1080 | 0 | const size_t name_length = strlen(perf_map_entry); |
1081 | 0 | uword base = (uword)code_addr; |
1082 | 0 | uword size = code_size; |
1083 | | |
1084 | | /* |
1085 | | * Generate DWARF unwinding information |
1086 | | * |
1087 | | * DWARF data is essential for proper stack unwinding during profiling. |
1088 | | * Without it, perf cannot generate accurate call graphs, especially |
1089 | | * in optimized code where frame pointers may be omitted. |
1090 | | */ |
1091 | 0 | ELFObjectContext ctx; |
1092 | 0 | char buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) |
1093 | 0 | ctx.code_size = code_size; |
1094 | 0 | ctx.startp = ctx.p = (uint8_t*)buffer; |
1095 | | |
1096 | | /* Generate EH frame (Exception Handling frame) data */ |
1097 | 0 | elf_init_ehframe(&ctx); |
1098 | 0 | int eh_frame_size = ctx.p - ctx.startp; |
1099 | | |
1100 | | /* |
1101 | | * Write Code Unwinding Information Event |
1102 | | * |
1103 | | * This event must be written before the code load event to ensure |
1104 | | * perf has the unwinding information available when it processes |
1105 | | * the code region. |
1106 | | */ |
1107 | 0 | CodeUnwindingInfoEvent ev2; |
1108 | 0 | ev2.base.event = PerfUnwindingInfo; |
1109 | 0 | ev2.base.time_stamp = get_current_monotonic_ticks(); |
1110 | 0 | ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; |
1111 | | |
1112 | | /* Verify we don't exceed our padding budget */ |
1113 | 0 | assert(ev2.unwind_data_size <= PERF_JIT_CODE_PADDING); |
1114 | |
1115 | 0 | ev2.eh_frame_hdr_size = sizeof(EhFrameHeader); |
1116 | 0 | ev2.mapped_size = round_up(ev2.unwind_data_size, 16); // 16-byte alignment |
1117 | | |
1118 | | /* Calculate total event size with padding */ |
1119 | 0 | int content_size = sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size; |
1120 | 0 | int padding_size = round_up(content_size, 8) - content_size; // 8-byte align |
1121 | 0 | ev2.base.size = content_size + padding_size; |
1122 | | |
1123 | | /* Write the unwinding info event header */ |
1124 | 0 | perf_map_jit_write_fully(&ev2, sizeof(ev2)); |
1125 | | |
1126 | | /* |
1127 | | * Write EH Frame Header |
1128 | | * |
1129 | | * The EH frame header provides metadata about the DWARF unwinding |
1130 | | * information that follows. It includes pointers and counts that |
1131 | | * help perf navigate the unwinding data efficiently. |
1132 | | */ |
1133 | 0 | EhFrameHeader f; |
1134 | 0 | f.version = 1; |
1135 | 0 | f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel; // PC-relative signed 4-byte |
1136 | 0 | f.fde_count_enc = DwarfUData4; // Unsigned 4-byte count |
1137 | 0 | f.table_enc = DwarfSData4 | DwarfDataRel; // Data-relative signed 4-byte |
1138 | | |
1139 | | /* Calculate relative offsets for EH frame navigation */ |
1140 | 0 | f.eh_frame_ptr = -(eh_frame_size + 4 * sizeof(unsigned char)); |
1141 | 0 | f.eh_fde_count = 1; // We generate exactly one FDE per function |
1142 | 0 | f.from = -(round_up(code_size, 8) + eh_frame_size); |
1143 | |
1144 | 0 | int cie_size = ctx.eh_frame_p - ctx.startp; |
1145 | 0 | f.to = -(eh_frame_size - cie_size); |
1146 | | |
1147 | | /* Write EH frame data and header */ |
1148 | 0 | perf_map_jit_write_fully(ctx.startp, eh_frame_size); |
1149 | 0 | perf_map_jit_write_fully(&f, sizeof(f)); |
1150 | | |
1151 | | /* Write padding to maintain alignment */ |
1152 | 0 | char padding_bytes[] = "\0\0\0\0\0\0\0\0"; |
1153 | 0 | perf_map_jit_write_fully(&padding_bytes, padding_size); |
1154 | | |
1155 | | /* |
1156 | | * Write Code Load Event |
1157 | | * |
1158 | | * This event tells perf about the new code region. It includes: |
1159 | | * - Memory addresses and sizes |
1160 | | * - Process and thread identification |
1161 | | * - Function name for symbol resolution |
1162 | | * - The actual machine code bytes |
1163 | | */ |
1164 | 0 | CodeLoadEvent ev; |
1165 | 0 | ev.base.event = PerfLoad; |
1166 | 0 | ev.base.size = sizeof(ev) + (name_length+1) + size; |
1167 | 0 | ev.base.time_stamp = get_current_monotonic_ticks(); |
1168 | 0 | ev.process_id = getpid(); |
1169 | 0 | ev.thread_id = syscall(SYS_gettid); // Get thread ID via system call |
1170 | 0 | ev.vma = base; // Virtual memory address |
1171 | 0 | ev.code_address = base; // Same as VMA for our use case |
1172 | 0 | ev.code_size = size; |
1173 | | |
1174 | | /* Assign unique code ID and increment counter */ |
1175 | 0 | perf_jit_map_state.code_id += 1; |
1176 | 0 | ev.code_id = perf_jit_map_state.code_id; |
1177 | | |
1178 | | /* Write code load event and associated data */ |
1179 | 0 | perf_map_jit_write_fully(&ev, sizeof(ev)); |
1180 | 0 | perf_map_jit_write_fully(perf_map_entry, name_length+1); // Include null terminator |
1181 | 0 | perf_map_jit_write_fully((void*)(base), size); // Copy actual machine code |
1182 | | |
1183 | | /* Clean up allocated memory */ |
1184 | 0 | PyMem_RawFree(perf_map_entry); |
1185 | 0 | } |
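      | |
      | | /*
      | |  * Resulting on-disk layout for one trampoline entry (summary of the two events
      | |  * written above):
      | |  *
      | |  *     [CodeUnwindingInfoEvent][eh_frame: CIE + FDE][EhFrameHeader][pad to 8]
      | |  *     [CodeLoadEvent]["py::<qualname>:<filename>\0"][machine code bytes]
      | |  */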
1186 | | |
1187 | | // ============================================================================= |
1188 | | // CLEANUP AND FINALIZATION |
1189 | | // ============================================================================= |
1190 | | |
1191 | | /* |
1192 | | * Finalize and cleanup the perf jitdump system |
1193 | | * |
1194 | | * This function is called when Python is shutting down or when the |
1195 | | * perf trampoline system is being disabled. It ensures all resources |
1196 | | * are properly released and all buffered data is flushed to disk. |
1197 | | * |
1198 | | * Args: |
1199 | | * state: Jitdump state (currently unused, uses global state) |
1200 | | * |
1201 | | * Returns: 0 on success |
1202 | | * |
1203 | | * IMPORTANT: This function signature is part of Python's internal API |
1204 | | * and must not be changed without coordinating with core Python development. |
1205 | | */ |
1206 | 0 | static int perf_map_jit_fini(void* state) { |
1207 | | /* |
1208 | | * Close jitdump file with proper synchronization |
1209 | | * |
1210 | | * We need to acquire the lock to ensure no other threads are |
1211 | | * writing to the file when we close it. This prevents corruption |
1212 | | * and ensures all data is properly flushed. |
1213 | | */ |
1214 | 0 | if (perf_jit_map_state.perf_map != NULL) { |
1215 | 0 | PyThread_acquire_lock(perf_jit_map_state.map_lock, 1); |
1216 | 0 | fclose(perf_jit_map_state.perf_map); // This also flushes buffers |
1217 | 0 | PyThread_release_lock(perf_jit_map_state.map_lock); |
1218 | | |
1219 | | /* Clean up synchronization primitive */ |
1220 | 0 | PyThread_free_lock(perf_jit_map_state.map_lock); |
1221 | 0 | perf_jit_map_state.perf_map = NULL; |
1222 | 0 | } |
1223 | | |
1224 | | /* |
1225 | | * Unmap the memory region |
1226 | | * |
1227 | | * This removes the signal to perf that we were generating JIT code. |
1228 | | * After this point, perf will no longer detect this process as |
1229 | | * having JIT capabilities. |
1230 | | */ |
1231 | 0 | if (perf_jit_map_state.mapped_buffer != NULL) { |
1232 | 0 | munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size); |
1233 | 0 | perf_jit_map_state.mapped_buffer = NULL; |
1234 | 0 | } |
1235 | | |
1236 | | /* Clear global state reference */ |
1237 | 0 | trampoline_api.state = NULL; |
1238 | |
1239 | 0 | return 0; // Success |
1240 | 0 | } |
1241 | | |
1242 | | // ============================================================================= |
1243 | | // PUBLIC API EXPORT |
1244 | | // ============================================================================= |
1245 | | |
1246 | | /* |
1247 | | * Python Perf Callbacks Structure |
1248 | | * |
1249 | | * This structure defines the callback interface that Python's trampoline |
1250 | | * system uses to integrate with perf profiling. It contains function |
1251 | | * pointers for initialization, event writing, and cleanup. |
1252 | | * |
1253 | | * CRITICAL: This structure and its contents are part of Python's internal |
1254 | | * API. The function signatures and behavior must remain stable to maintain |
1255 | | * compatibility with the Python interpreter's perf integration system. |
1256 | | * |
1257 | | * Used by: Python's _PyPerf_Callbacks system in pycore_ceval.h |
1258 | | */ |
1259 | | _PyPerf_Callbacks _Py_perfmap_jit_callbacks = { |
1260 | | &perf_map_jit_init, // Initialization function |
1261 | | &perf_map_jit_write_entry, // Event writing function |
1262 | | &perf_map_jit_fini, // Cleanup function |
1263 | | }; |
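      | |
      | | /*
      | |  * These callbacks are selected when the JIT-dump flavour of the perf trampoline
      | |  * is enabled from Python. As of recent CPython versions (3.13-era option names;
      | |  * check the documentation of the interpreter actually in use) that is, e.g.:
      | |  *
      | |  *     PYTHON_PERF_JIT_SUPPORT=1 python my_script.py
      | |  *     python -X perf_jit my_script.py
      | |  */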
1264 | | |
1265 | | #endif /* PY_HAVE_PERF_TRAMPOLINE */ |