/src/cpython/Python/perf_jit_trampoline.c
Line | Count | Source |
1 | | /* |
2 | | * Python Perf Trampoline Support - JIT Dump Implementation |
3 | | * |
4 | | * This file implements the perf jitdump API for Python's performance profiling |
5 | | * integration. It allows perf (Linux performance analysis tool) to understand |
6 | | * and profile dynamically generated Python bytecode by creating JIT dump files |
7 | | * that perf can inject into its analysis. |
8 | | * |
9 | | * |
10 | | * IMPORTANT: This file exports specific callback functions that are part of |
11 | | * Python's internal API. Do not modify the function signatures or behavior |
12 | | * of exported functions without coordinating with the Python core team. |
13 | | * |
14 | | * Usually the binary and its libraries are mapped in separate regions, like below:
15 | | * |
16 | | * address -> |
17 | | * --+---------------------+--//--+---------------------+-- |
18 | | * | .text | .data | ... | | .text | .data | ... | |
19 | | * --+---------------------+--//--+---------------------+-- |
20 | | * myprog libc.so |
21 | | * |
22 | | * So it is easy and straightforward to find the mapped binary or library that
23 | | * contains a given address.
24 | | * |
25 | | * For JIT code, however, the code arena only cares about the code section, while
26 | | * the resulting DSOs (generated by perf inject -j) contain ELF headers and
27 | | * unwind info too. That produces the following address space with synthesized
28 | | * MMAP events. Let's say there is a sample between addresses B and C.
29 | | * |
30 | | *                                              sample
31 | | *                                                |
32 | | * address ->           A          B             v              C
33 | | * --------------------------------------------------------------------------------------------------- |
34 | | * /tmp/jitted-PID-0.so | (headers) | .text | unwind info | |
35 | | * /tmp/jitted-PID-1.so | (headers) | .text | unwind info | |
36 | | * /tmp/jitted-PID-2.so | (headers) | .text | unwind info | |
37 | | * ... |
38 | | * --------------------------------------------------------------------------------------------------- |
39 | | * |
40 | | * If perf mapped only the .text sections, it would find jitted-PID-1.so but could
41 | | * not see its unwind info. If it mapped both the .text and unwind sections, the
42 | | * sample could fall into either jitted-PID-0.so or jitted-PID-1.so and it would be
43 | | * ambiguous which one is right. So to make perf happy we use non-overlapping
44 | | * address ranges for each DSO:
45 | | * |
46 | | * address -> |
47 | | * ------------------------------------------------------------------------------------------------------- |
48 | | * /tmp/jitted-PID-0.so | (headers) | .text | unwind info |
49 | | * /tmp/jitted-PID-1.so                      | (headers) | .text | unwind info |
50 | | * /tmp/jitted-PID-2.so                                            | (headers) | .text | unwind info |
51 | | * ... |
52 | | * ------------------------------------------------------------------------------------------------------- |
53 | | * |
54 | | * As the trampolines are constant, we add a constant padding, but in general the padding needs to be
55 | | * the size of the unwind info rounded up to 16 bytes. For our trampolines this is typically 0x50.
56 | | */ |
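| | /*
| |  * Typical workflow (a sketch; the interpreter option name is assumed here to be
| |  * -X perf_jit on recent CPython versions, check the docs for your version):
| |  *
| |  *   # record with a monotonic clock so timestamps match the jitdump events
| |  *   perf record -F 999 --call-graph dwarf -k 1 python -X perf_jit script.py
| |  *
| |  *   # merge the jitdump into the recording; this synthesizes the
| |  *   # /tmp/jitted-PID-N.so DSOs described above
| |  *   perf inject -j -i perf.data -o perf.jit.data
| |  *
| |  *   # report with Python frames resolved
| |  *   perf report -i perf.jit.data
| |  */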
57 | | |
58 | | |
59 | | |
60 | | #include "Python.h" |
61 | | #include "pycore_ceval.h" // _PyPerf_Callbacks |
62 | | #include "pycore_frame.h" |
63 | | #include "pycore_interp.h" |
64 | | #include "pycore_runtime.h" // _PyRuntime |
65 | | |
66 | | #ifdef PY_HAVE_PERF_TRAMPOLINE |
67 | | |
68 | | /* Standard library includes for perf jitdump implementation */ |
69 | | #include <elf.h> // ELF architecture constants |
70 | | #include <fcntl.h> // File control operations |
71 | | #include <stdio.h> // Standard I/O operations |
72 | | #include <stdlib.h> // Standard library functions |
73 | | #include <sys/mman.h> // Memory mapping functions (mmap) |
74 | | #include <sys/types.h> // System data types |
75 | | #include <unistd.h> // System calls (sysconf, getpid) |
76 | | #include <sys/time.h> // Time functions (gettimeofday) |
77 | | #include <sys/syscall.h> // System call interface |
78 | | |
79 | | // ============================================================================= |
80 | | // CONSTANTS AND CONFIGURATION |
81 | | // ============================================================================= |
82 | | |
83 | | /* |
84 | | * Memory layout considerations for perf jitdump: |
85 | | * |
86 | | * Perf expects non-overlapping memory regions for each JIT-compiled function. |
87 | | * When perf processes the jitdump file, it creates synthetic DSO (Dynamic |
88 | | * Shared Object) files that contain: |
89 | | * - ELF headers |
90 | | * - .text section (actual machine code) |
91 | | * - Unwind information (for stack traces) |
92 | | * |
93 | | * To ensure proper address space layout, we add padding between code regions. |
94 | | * This prevents address conflicts when perf maps the synthesized DSOs. |
95 | | * |
96 | | * Memory layout example: |
97 | | * /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding] |
98 | | * /tmp/jitted-PID-1.so: [headers][.text][unwind_info][padding] |
99 | | * |
100 | | * The padding size is now calculated automatically during initialization |
101 | | * based on the actual unwind information requirements. |
102 | | */ |
103 | | |
104 | | /* Convenient access to the global trampoline API state */ |
105 | 0 | #define trampoline_api _PyRuntime.ceval.perf.trampoline_api |
106 | | |
107 | | /* Type aliases for clarity and portability */ |
108 | | typedef uint64_t uword; // Word-sized unsigned integer |
109 | | typedef const char* CodeComments; // Code comment strings |
110 | | |
111 | | /* Memory size constants */ |
112 | 0 | #define MB (1024 * 1024) // 1 Megabyte for buffer sizing |
113 | | |
114 | | // ============================================================================= |
115 | | // ARCHITECTURE-SPECIFIC DEFINITIONS |
116 | | // ============================================================================= |
117 | | |
118 | | /* |
119 | | * Returns the ELF machine architecture constant for the current platform. |
120 | | * This is required for the jitdump header to correctly identify the target |
121 | | * architecture for perf processing. |
122 | | * |
123 | | */ |
124 | 0 | static uint64_t GetElfMachineArchitecture(void) { |
125 | 0 | #if defined(__x86_64__) || defined(_M_X64) |
126 | 0 | return EM_X86_64; |
127 | | #elif defined(__i386__) || defined(_M_IX86) |
128 | | return EM_386; |
129 | | #elif defined(__aarch64__) |
130 | | return EM_AARCH64; |
131 | | #elif defined(__arm__) || defined(_M_ARM) |
132 | | return EM_ARM; |
133 | | #elif defined(__riscv) |
134 | | return EM_RISCV; |
135 | | #else |
136 | | Py_UNREACHABLE(); // Unsupported architecture - should never reach here |
137 | | return 0; |
138 | | #endif |
139 | 0 | } |
140 | | |
141 | | // ============================================================================= |
142 | | // PERF JITDUMP DATA STRUCTURES |
143 | | // ============================================================================= |
144 | | |
145 | | /* |
146 | | * Perf jitdump file format structures |
147 | | * |
148 | | * These structures define the binary format that perf expects for JIT dump files. |
149 | | * The format is documented in the Linux perf tools source code and must match |
150 | | * exactly for proper perf integration. |
151 | | */ |
152 | | |
153 | | /* |
154 | | * Jitdump file header - written once at the beginning of each jitdump file |
155 | | * Contains metadata about the process and jitdump format version |
156 | | */ |
157 | | typedef struct { |
158 | | uint32_t magic; // Magic number (0x4A695444 = "JiTD") |
159 | | uint32_t version; // Jitdump format version (currently 1) |
160 | | uint32_t size; // Size of this header structure |
161 | | uint32_t elf_mach_target; // Target architecture (from GetElfMachineArchitecture) |
162 | | uint32_t reserved; // Reserved field (must be 0) |
163 | | uint32_t process_id; // Process ID of the JIT compiler |
164 | | uint64_t time_stamp; // Timestamp when jitdump was created |
165 | | uint64_t flags; // Feature flags (currently unused) |
166 | | } Header; |
167 | | |
168 | | /* |
169 | | * Perf event types supported by the jitdump format |
170 | | * Each event type has a corresponding structure format |
171 | | */ |
172 | | enum PerfEvent { |
173 | | PerfLoad = 0, // Code load event (new JIT function) |
174 | | PerfMove = 1, // Code move event (function relocated) |
175 | | PerfDebugInfo = 2, // Debug information event |
176 | | PerfClose = 3, // JIT session close event |
177 | | PerfUnwindingInfo = 4 // Stack unwinding information event |
178 | | }; |
179 | | |
180 | | /* |
181 | | * Base event structure - common header for all perf events |
182 | | * Every event in the jitdump file starts with this structure |
183 | | */ |
184 | | struct BaseEvent { |
185 | | uint32_t event; // Event type (from PerfEvent enum) |
186 | | uint32_t size; // Total size of this event including payload |
187 | | uint64_t time_stamp; // Timestamp when event occurred |
188 | | }; |
189 | | |
190 | | /* |
191 | | * Code load event - indicates a new JIT-compiled function is available |
192 | | * This is the most important event type for Python profiling |
193 | | */ |
194 | | typedef struct { |
195 | | struct BaseEvent base; // Common event header |
196 | | uint32_t process_id; // Process ID where code was generated |
197 | | uint32_t thread_id; // Thread ID where code was generated |
198 | | uint64_t vma; // Virtual memory address where code is loaded |
199 | | uint64_t code_address; // Address of the actual machine code |
200 | | uint64_t code_size; // Size of the machine code in bytes |
201 | | uint64_t code_id; // Unique identifier for this code region |
202 | | /* Followed by: |
203 | | * - null-terminated function name string |
204 | | * - raw machine code bytes |
205 | | */ |
206 | | } CodeLoadEvent; |
207 | | |
208 | | /* |
209 | | * Code unwinding information event - provides DWARF data for stack traces |
210 | | * Essential for proper stack unwinding during profiling |
211 | | */ |
212 | | typedef struct { |
213 | | struct BaseEvent base; // Common event header |
214 | | uint64_t unwind_data_size; // Size of the unwinding data |
215 | | uint64_t eh_frame_hdr_size; // Size of the EH frame header |
216 | | uint64_t mapped_size; // Total mapped size (with padding) |
217 | | /* Followed by: |
218 | | * - EH frame header |
219 | | * - DWARF unwinding information |
220 | | * - Padding to alignment boundary |
221 | | */ |
222 | | } CodeUnwindingInfoEvent; |
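| | /*
| |  * Taken together, a jitdump file produced by this implementation is a single
| |  * Header followed by a stream of events: one PerfUnwindingInfo event and one
| |  * PerfLoad event per trampoline region (see perf_map_jit_write_entry below).
| |  */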
223 | | |
224 | | // ============================================================================= |
225 | | // GLOBAL STATE MANAGEMENT |
226 | | // ============================================================================= |
227 | | |
228 | | /* |
229 | | * Global state for the perf jitdump implementation |
230 | | * |
231 | | * This structure maintains all the state needed for generating jitdump files. |
232 | | * It's designed as a singleton since there's typically only one jitdump file |
233 | | * per Python process. |
234 | | */ |
235 | | typedef struct { |
236 | | FILE* perf_map; // File handle for the jitdump file |
237 | | PyThread_type_lock map_lock; // Thread synchronization lock |
238 | | void* mapped_buffer; // Memory-mapped region (signals perf we're active) |
239 | | size_t mapped_size; // Size of the mapped region |
240 | | int code_id; // Counter for unique code region identifiers |
241 | | } PerfMapJitState; |
242 | | |
243 | | /* Global singleton instance */ |
244 | | static PerfMapJitState perf_jit_map_state; |
245 | | |
246 | | // ============================================================================= |
247 | | // TIME UTILITIES |
248 | | // ============================================================================= |
249 | | |
250 | | /* Time conversion constant */ |
251 | | static const intptr_t nanoseconds_per_second = 1000000000; |
252 | | |
253 | | /* |
254 | | * Get current monotonic time in nanoseconds |
255 | | * |
256 | | * Monotonic time is preferred for event timestamps because it's not affected |
257 | | * by system clock adjustments. This ensures consistent timing relationships |
258 | | * between events even if the system clock is changed. |
259 | | * |
260 | | * Returns: Current monotonic time in nanoseconds since an arbitrary epoch |
261 | | */ |
262 | 0 | static int64_t get_current_monotonic_ticks(void) { |
263 | 0 | struct timespec ts; |
264 | 0 | if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) { |
265 | 0 | Py_UNREACHABLE(); // Should never fail on supported systems |
266 | 0 | return 0; |
267 | 0 | } |
268 | | |
269 | | /* Convert to nanoseconds for maximum precision */ |
270 | 0 | int64_t result = ts.tv_sec; |
271 | 0 | result *= nanoseconds_per_second; |
272 | 0 | result += ts.tv_nsec; |
273 | 0 | return result; |
274 | 0 | } |
275 | | |
276 | | /* |
277 | | * Get current wall clock time in microseconds |
278 | | * |
279 | | * Used for the jitdump file header timestamp. Unlike monotonic time, |
280 | | * this represents actual wall clock time that can be correlated with |
281 | | * other system events. |
282 | | * |
283 | | * Returns: Current time in microseconds since Unix epoch |
284 | | */ |
285 | 0 | static int64_t get_current_time_microseconds(void) { |
286 | 0 | struct timeval tv; |
287 | 0 | if (gettimeofday(&tv, NULL) < 0) { |
288 | 0 | Py_UNREACHABLE(); // Should never fail on supported systems |
289 | 0 | return 0; |
290 | 0 | } |
291 | 0 | return ((int64_t)(tv.tv_sec) * 1000000) + tv.tv_usec; |
292 | 0 | } |
293 | | |
294 | | // ============================================================================= |
295 | | // UTILITY FUNCTIONS |
296 | | // ============================================================================= |
297 | | |
298 | | /* |
299 | | * Round up a value to the next multiple of a given number |
300 | | * |
301 | | * This is essential for maintaining proper alignment requirements in the |
302 | | * jitdump format. Many structures need to be aligned to specific boundaries |
303 | | * (typically 8 or 16 bytes) for efficient processing by perf. |
304 | | * |
305 | | * Args: |
306 | | * value: The value to round up |
307 | | * multiple: The multiple to round up to |
308 | | * |
309 | | * Returns: The smallest value >= input that is a multiple of 'multiple' |
310 | | */ |
311 | 0 | static size_t round_up(int64_t value, int64_t multiple) { |
312 | 0 | if (multiple == 0) { |
313 | 0 | return value; // Avoid division by zero |
314 | 0 | } |
315 | | |
316 | 0 | int64_t remainder = value % multiple; |
317 | 0 | if (remainder == 0) { |
318 | 0 | return value; // Already aligned |
319 | 0 | } |
320 | | |
321 | | /* Calculate how much to add to reach the next multiple */ |
322 | 0 | int64_t difference = multiple - remainder; |
323 | 0 | int64_t rounded_up_value = value + difference; |
324 | |
325 | 0 | return rounded_up_value; |
326 | 0 | } |
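| | /*
| |  * Examples: round_up(5, 8) == 8 and round_up(16, 8) == 16. An unwind blob of
| |  * anywhere between 0x41 and 0x50 bytes rounds up to 0x50, which is where the
| |  * figure quoted in the file header comment comes from.
| |  */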
327 | | |
328 | | // ============================================================================= |
329 | | // FILE I/O UTILITIES |
330 | | // ============================================================================= |
331 | | |
332 | | /* |
333 | | * Write data to the jitdump file with error handling |
334 | | * |
335 | | * This function ensures that all data is written to the file, handling |
336 | | * partial writes that can occur with large buffers or when the system |
337 | | * is under load. |
338 | | * |
339 | | * Args: |
340 | | * buffer: Pointer to data to write |
341 | | * size: Number of bytes to write |
342 | | */ |
343 | 0 | static void perf_map_jit_write_fully(const void* buffer, size_t size) { |
344 | 0 | FILE* out_file = perf_jit_map_state.perf_map; |
345 | 0 | const char* ptr = (const char*)(buffer); |
346 | |
347 | 0 | while (size > 0) { |
348 | 0 | const size_t written = fwrite(ptr, 1, size, out_file); |
349 | 0 | if (written == 0) { |
350 | 0 | Py_UNREACHABLE(); // Write failure - should be very rare |
351 | 0 | break; |
352 | 0 | } |
353 | 0 | size -= written; |
354 | 0 | ptr += written; |
355 | 0 | } |
356 | 0 | } |
357 | | |
358 | | /* |
359 | | * Write the jitdump file header |
360 | | * |
361 | | * The header must be written exactly once at the beginning of each jitdump |
362 | | * file. It provides metadata that perf uses to parse the rest of the file. |
363 | | * |
364 | | * Args: |
365 | | * pid: Process ID to include in the header |
366 | | * out_file: File handle to write to (currently unused, uses global state) |
367 | | */ |
368 | 0 | static void perf_map_jit_write_header(int pid, FILE* out_file) { |
369 | 0 | Header header; |
370 | | |
371 | | /* Initialize header with required values */ |
372 | 0 | header.magic = 0x4A695444; // "JiTD" magic number |
373 | 0 | header.version = 1; // Current jitdump version |
374 | 0 | header.size = sizeof(Header); // Header size for validation |
375 | 0 | header.elf_mach_target = GetElfMachineArchitecture(); // Target architecture |
376 | 0 | header.process_id = pid; // Process identifier |
377 | 0 | header.time_stamp = get_current_time_microseconds(); // Creation time |
378 | 0 | header.flags = 0; // No special flags currently used |
379 | |
380 | 0 | perf_map_jit_write_fully(&header, sizeof(header)); |
381 | 0 | } |
382 | | |
383 | | // ============================================================================= |
384 | | // DWARF CONSTANTS AND UTILITIES |
385 | | // ============================================================================= |
386 | | |
387 | | /* |
388 | | * DWARF (Debug With Arbitrary Record Formats) constants |
389 | | * |
390 | | * DWARF is a debugging data format used to provide stack unwinding information. |
391 | | * These constants define the various encoding types and opcodes used in |
392 | | * DWARF Call Frame Information (CFI) records. |
393 | | */ |
394 | | |
395 | | /* DWARF Call Frame Information version */ |
396 | | #define DWRF_CIE_VERSION 1 |
397 | | |
398 | | /* DWARF CFA (Call Frame Address) opcodes */ |
399 | | enum { |
400 | | DWRF_CFA_nop = 0x0, // No operation |
401 | | DWRF_CFA_offset_extended = 0x5, // Extended offset instruction |
402 | | DWRF_CFA_def_cfa = 0xc, // Define CFA rule |
403 | | DWRF_CFA_def_cfa_register = 0xd, // Define CFA register |
404 | | DWRF_CFA_def_cfa_offset = 0xe, // Define CFA offset |
405 | | DWRF_CFA_offset_extended_sf = 0x11, // Extended signed offset |
406 | | DWRF_CFA_advance_loc = 0x40, // Advance location counter |
407 | | DWRF_CFA_offset = 0x80, // Simple offset instruction |
408 | | DWRF_CFA_restore = 0xc0 // Restore register |
409 | | }; |
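| | /*
| |  * Note: DWRF_CFA_advance_loc, DWRF_CFA_offset and DWRF_CFA_restore are
| |  * "primary" opcodes, so their operand lives in the low 6 bits of the same
| |  * byte. For example, (DWRF_CFA_advance_loc | 3) advances the location counter
| |  * by 3 * code_alignment_factor, and (DWRF_CFA_offset | reg) is followed by a
| |  * ULEB128 factored offset for that register; DWRF_CFA_restore takes no
| |  * operand at all.
| |  */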
410 | | |
411 | | /* DWARF Exception Handling pointer encodings */ |
412 | | enum { |
413 | | DWRF_EH_PE_absptr = 0x00, // Absolute pointer |
414 | | DWRF_EH_PE_omit = 0xff, // Omitted value |
415 | | |
416 | | /* Data type encodings */ |
417 | | DWRF_EH_PE_uleb128 = 0x01, // Unsigned LEB128 |
418 | | DWRF_EH_PE_udata2 = 0x02, // Unsigned 2-byte |
419 | | DWRF_EH_PE_udata4 = 0x03, // Unsigned 4-byte |
420 | | DWRF_EH_PE_udata8 = 0x04, // Unsigned 8-byte |
421 | | DWRF_EH_PE_sleb128 = 0x09, // Signed LEB128 |
422 | | DWRF_EH_PE_sdata2 = 0x0a, // Signed 2-byte |
423 | | DWRF_EH_PE_sdata4 = 0x0b, // Signed 4-byte |
424 | | DWRF_EH_PE_sdata8 = 0x0c, // Signed 8-byte |
425 | | DWRF_EH_PE_signed = 0x08, // Signed flag |
426 | | |
427 | | /* Reference type encodings */ |
428 | | DWRF_EH_PE_pcrel = 0x10, // PC-relative |
429 | | DWRF_EH_PE_textrel = 0x20, // Text-relative |
430 | | DWRF_EH_PE_datarel = 0x30, // Data-relative |
431 | | DWRF_EH_PE_funcrel = 0x40, // Function-relative |
432 | | DWRF_EH_PE_aligned = 0x50, // Aligned |
433 | | DWRF_EH_PE_indirect = 0x80 // Indirect |
434 | | }; |
435 | | |
436 | | /* Additional DWARF constants for debug information */ |
437 | | enum { DWRF_TAG_compile_unit = 0x11 }; |
438 | | enum { DWRF_children_no = 0, DWRF_children_yes = 1 }; |
439 | | enum { |
440 | | DWRF_AT_name = 0x03, // Name attribute |
441 | | DWRF_AT_stmt_list = 0x10, // Statement list |
442 | | DWRF_AT_low_pc = 0x11, // Low PC address |
443 | | DWRF_AT_high_pc = 0x12 // High PC address |
444 | | }; |
445 | | enum { |
446 | | DWRF_FORM_addr = 0x01, // Address form |
447 | | DWRF_FORM_data4 = 0x06, // 4-byte data |
448 | | DWRF_FORM_string = 0x08 // String form |
449 | | }; |
450 | | |
451 | | /* Line number program opcodes */ |
452 | | enum { |
453 | | DWRF_LNS_extended_op = 0, // Extended opcode |
454 | | DWRF_LNS_copy = 1, // Copy operation |
455 | | DWRF_LNS_advance_pc = 2, // Advance program counter |
456 | | DWRF_LNS_advance_line = 3 // Advance line number |
457 | | }; |
458 | | |
459 | | /* Line number extended opcodes */ |
460 | | enum { |
461 | | DWRF_LNE_end_sequence = 1, // End of sequence |
462 | | DWRF_LNE_set_address = 2 // Set address |
463 | | }; |
464 | | |
465 | | /* |
466 | | * Architecture-specific DWARF register numbers |
467 | | * |
468 | | * These constants define the register numbering scheme used by DWARF |
469 | | * for each supported architecture. The numbers must match the ABI |
470 | | * specification for proper stack unwinding. |
471 | | */ |
472 | | enum { |
473 | | #ifdef __x86_64__ |
474 | | /* x86_64 register numbering (note: order is defined by x86_64 ABI) */ |
475 | | DWRF_REG_AX, // RAX |
476 | | DWRF_REG_DX, // RDX |
477 | | DWRF_REG_CX, // RCX |
478 | | DWRF_REG_BX, // RBX |
479 | | DWRF_REG_SI, // RSI |
480 | | DWRF_REG_DI, // RDI |
481 | | DWRF_REG_BP, // RBP |
482 | | DWRF_REG_SP, // RSP |
483 | | DWRF_REG_8, // R8 |
484 | | DWRF_REG_9, // R9 |
485 | | DWRF_REG_10, // R10 |
486 | | DWRF_REG_11, // R11 |
487 | | DWRF_REG_12, // R12 |
488 | | DWRF_REG_13, // R13 |
489 | | DWRF_REG_14, // R14 |
490 | | DWRF_REG_15, // R15 |
491 | | DWRF_REG_RA, // Return address (RIP) |
492 | | #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) |
493 | | /* AArch64 register numbering */ |
494 | | DWRF_REG_FP = 29, // Frame Pointer |
495 | | DWRF_REG_RA = 30, // Link register (return address) |
496 | | DWRF_REG_SP = 31, // Stack pointer |
497 | | #else |
498 | | # error "Unsupported target architecture" |
499 | | #endif |
500 | | }; |
501 | | |
502 | | /* DWARF encoding constants used in EH frame headers */ |
503 | | static const uint8_t DwarfUData4 = 0x03; // Unsigned 4-byte data |
504 | | static const uint8_t DwarfSData4 = 0x0b; // Signed 4-byte data |
505 | | static const uint8_t DwarfPcRel = 0x10; // PC-relative encoding |
506 | | static const uint8_t DwarfDataRel = 0x30; // Data-relative encoding |
507 | | |
508 | | // ============================================================================= |
509 | | // ELF OBJECT CONTEXT |
510 | | // ============================================================================= |
511 | | |
512 | | /* |
513 | | * Context for building ELF/DWARF structures |
514 | | * |
515 | | * This structure maintains state while constructing DWARF unwind information. |
516 | | * It acts as a simple buffer manager with pointers to track current position |
517 | | * and important landmarks within the buffer. |
518 | | */ |
519 | | typedef struct ELFObjectContext { |
520 | | uint8_t* p; // Current write position in buffer |
521 | | uint8_t* startp; // Start of buffer (for offset calculations) |
522 | | uint8_t* eh_frame_p; // Start of EH frame data (for relative offsets) |
523 | | uint8_t* fde_p; // Start of FDE data (for PC-relative calculations) |
524 | | uint32_t code_size; // Size of the code being described |
525 | | } ELFObjectContext; |
526 | | |
527 | | /* |
528 | | * EH Frame Header structure for DWARF unwinding |
529 | | * |
530 | | * This structure provides metadata about the DWARF unwinding information |
531 | | * that follows. It's required by the perf jitdump format to enable proper |
532 | | * stack unwinding during profiling. |
533 | | */ |
534 | | typedef struct { |
535 | | unsigned char version; // EH frame version (always 1) |
536 | | unsigned char eh_frame_ptr_enc; // Encoding of EH frame pointer |
537 | | unsigned char fde_count_enc; // Encoding of FDE count |
538 | | unsigned char table_enc; // Encoding of table entries |
539 | | int32_t eh_frame_ptr; // Pointer to EH frame data |
540 | | int32_t eh_fde_count; // Number of FDEs (Frame Description Entries) |
541 | | int32_t from; // Start address of code range |
542 | | int32_t to; // End address of code range |
543 | | } EhFrameHeader; |
544 | | |
545 | | // ============================================================================= |
546 | | // DWARF GENERATION UTILITIES |
547 | | // ============================================================================= |
548 | | |
549 | | /* |
550 | | * Append a null-terminated string to the ELF context buffer |
551 | | * |
552 | | * Args: |
553 | | * ctx: ELF object context |
554 | | * str: String to append (must be null-terminated) |
555 | | * |
556 | | * Returns: Offset from start of buffer where string was written |
557 | | */ |
558 | 0 | static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) { |
559 | 0 | uint8_t* p = ctx->p; |
560 | 0 | uint32_t ofs = (uint32_t)(p - ctx->startp); |
561 | | |
562 | | /* Copy string including null terminator */ |
563 | 0 | do { |
564 | 0 | *p++ = (uint8_t)*str; |
565 | 0 | } while (*str++); |
566 | |
567 | 0 | ctx->p = p; |
568 | 0 | return ofs; |
569 | 0 | } |
570 | | |
571 | | /* |
572 | | * Append a SLEB128 (Signed Little Endian Base 128) value |
573 | | * |
574 | | * SLEB128 is a variable-length encoding used extensively in DWARF. |
575 | | * It efficiently encodes small numbers in fewer bytes. |
576 | | * |
577 | | * Args: |
578 | | * ctx: ELF object context |
579 | | * v: Signed value to encode |
580 | | */ |
581 | 0 | static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) { |
582 | 0 | uint8_t* p = ctx->p; |
583 | | |
584 | | /* Encode 7 bits at a time, with continuation bit in MSB */ |
585 | 0 | for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) { |
586 | 0 | *p++ = (uint8_t)((v & 0x7f) | 0x80); // Set continuation bit |
587 | 0 | } |
588 | 0 | *p++ = (uint8_t)(v & 0x7f); // Final byte without continuation bit |
589 | |
590 | 0 | ctx->p = p; |
591 | 0 | } |
592 | | |
593 | | /* |
594 | | * Append a ULEB128 (Unsigned Little Endian Base 128) value |
595 | | * |
596 | | * Similar to SLEB128 but for unsigned values. |
597 | | * |
598 | | * Args: |
599 | | * ctx: ELF object context |
600 | | * v: Unsigned value to encode |
601 | | */ |
602 | 0 | static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) { |
603 | 0 | uint8_t* p = ctx->p; |
604 | | |
605 | | /* Encode 7 bits at a time, with continuation bit in MSB */ |
606 | 0 | for (; v >= 0x80; v >>= 7) { |
607 | 0 | *p++ = (char)((v & 0x7f) | 0x80); // Set continuation bit |
608 | 0 | } |
609 | 0 | *p++ = (char)v; // Final byte without continuation bit |
610 | |
611 | 0 | ctx->p = p; |
612 | 0 | } |
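| | /*
| |  * Encoding examples: elfctx_append_uleb128(ctx, 300) emits the two bytes
| |  * 0xAC 0x02 (low 7 bits 0x2C plus the continuation bit, then 0x02), while
| |  * elfctx_append_sleb128(ctx, -8) (the data alignment factor used below on
| |  * 64-bit targets) fits in the single byte 0x78.
| |  */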
613 | | |
614 | | /* |
615 | | * Macros for generating DWARF structures |
616 | | * |
617 | | * These macros provide a convenient way to write various data types |
618 | | * to the DWARF buffer while automatically advancing the pointer. |
619 | | */ |
620 | | #define DWRF_U8(x) (*p++ = (x)) // Write unsigned 8-bit |
621 | | #define DWRF_I8(x) (*(int8_t*)p = (x), p++) // Write signed 8-bit |
622 | | #define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2) // Write unsigned 16-bit |
623 | | #define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4) // Write unsigned 32-bit |
624 | | #define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) // Write address |
625 | | #define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) // Write ULEB128 |
626 | | #define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) // Write SLEB128 |
627 | | #define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) // Write string |
628 | | |
629 | | /* Pad to the requested alignment boundary with DW_CFA_nop opcodes */
630 | | #define DWRF_ALIGNNOP(s) \ |
631 | | while ((uintptr_t)p & ((s)-1)) { \ |
632 | | *p++ = DWRF_CFA_nop; \ |
633 | | } |
634 | | |
635 | | /* Write a DWARF section with automatic size calculation */ |
636 | | #define DWRF_SECTION(name, stmt) \ |
637 | 0 | { \ |
638 | 0 | uint32_t* szp_##name = (uint32_t*)p; \ |
639 | 0 | p += 4; \ |
640 | 0 | stmt; \ |
641 | 0 | *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \ |
642 | 0 | } |
643 | | |
644 | | // ============================================================================= |
645 | | // DWARF EH FRAME GENERATION |
646 | | // ============================================================================= |
647 | | |
648 | | static void elf_init_ehframe(ELFObjectContext* ctx); |
649 | | |
650 | | /* |
651 | | * Initialize DWARF .eh_frame section for a code region |
652 | | * |
653 | | * The .eh_frame section contains Call Frame Information (CFI) that describes |
654 | | * how to unwind the stack at any point in the code. This is essential for |
655 | | * proper profiling as it allows perf to generate accurate call graphs. |
656 | | * |
657 | | * The function generates two main components: |
658 | | * 1. CIE (Common Information Entry) - describes calling conventions |
659 | | * 2. FDE (Frame Description Entry) - describes specific function unwinding |
660 | | * |
661 | | * Args: |
662 | | * ctx: ELF object context containing code size and buffer pointers |
663 | | */ |
664 | 0 | static size_t calculate_eh_frame_size(void) { |
665 | | /* Calculate the EH frame size for the trampoline function */ |
666 | 0 | extern void *_Py_trampoline_func_start; |
667 | 0 | extern void *_Py_trampoline_func_end; |
668 | |
669 | 0 | size_t code_size = (char*)&_Py_trampoline_func_end - (char*)&_Py_trampoline_func_start; |
670 | |
671 | 0 | ELFObjectContext ctx; |
672 | 0 | char buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) |
673 | 0 | ctx.code_size = code_size; |
674 | 0 | ctx.startp = ctx.p = (uint8_t*)buffer; |
675 | 0 | ctx.fde_p = NULL; |
676 | |
677 | 0 | elf_init_ehframe(&ctx); |
678 | 0 | return ctx.p - ctx.startp; |
679 | 0 | } |
680 | | |
681 | 0 | static void elf_init_ehframe(ELFObjectContext* ctx) { |
682 | 0 | uint8_t* p = ctx->p; |
683 | 0 | uint8_t* framep = p; // Remember start of frame data |
684 | | |
685 | | /* |
686 | | * DWARF Unwind Table for Trampoline Function |
687 | | * |
688 | | * This section defines DWARF Call Frame Information (CFI) using encoded macros |
689 | | * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function |
690 | | * preserves and restores registers. This is used by profiling tools (e.g., `perf`) |
691 | | * and debuggers for stack unwinding in JIT-compiled code. |
692 | | * |
693 | | * ------------------------------------------------- |
694 | | * TO REGENERATE THIS TABLE FROM GCC OBJECTS: |
695 | | * ------------------------------------------------- |
696 | | * |
697 | | * 1. Create a trampoline source file (e.g., `trampoline.c`): |
698 | | * |
699 | | * #include <Python.h> |
700 | | * typedef PyObject* (*py_evaluator)(void*, void*, int); |
701 | | * PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) { |
702 | | * return evaluator(ts, f, throwflag); |
703 | | * } |
704 | | * |
705 | | * 2. Compile to an object file with frame pointer preservation: |
706 | | * |
707 | | * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c |
708 | | * |
709 | | * 3. Extract DWARF unwind info from the object file: |
710 | | * |
711 | | * readelf -w trampoline.o |
712 | | * |
713 | | * Example output from `.eh_frame`: |
714 | | * |
715 | | * 00000000 CIE |
716 | | * Version: 1 |
717 | | * Augmentation: "zR" |
718 | | * Code alignment factor: 4 |
719 | | * Data alignment factor: -8 |
720 | | * Return address column: 30 |
721 | | * DW_CFA_def_cfa: r31 (sp) ofs 0 |
722 | | * |
723 | | * 00000014 FDE cie=00000000 pc=0..14 |
724 | | * DW_CFA_advance_loc: 4 |
725 | | * DW_CFA_def_cfa_offset: 16 |
726 | | * DW_CFA_offset: r29 at cfa-16 |
727 | | * DW_CFA_offset: r30 at cfa-8 |
728 | | * DW_CFA_advance_loc: 12 |
729 | | * DW_CFA_restore: r30 |
730 | | * DW_CFA_restore: r29 |
731 | | * DW_CFA_def_cfa_offset: 0 |
732 | | * |
733 | | * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`. |
734 | | * |
735 | | * ---------------------------------- |
736 | | * HOW TO TRANSLATE TO DWRF_* MACROS: |
737 | | * ---------------------------------- |
738 | | * |
739 | | * After compiling your trampoline with: |
740 | | * |
741 | | * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c |
742 | | * |
743 | | * run: |
744 | | * |
745 | | * readelf -w trampoline.o |
746 | | * |
747 | | * to inspect the generated `.eh_frame` data. You will see two main components: |
748 | | * |
749 | | * 1. A CIE (Common Information Entry): shared configuration used by all FDEs. |
750 | | * 2. An FDE (Frame Description Entry): function-specific unwind instructions. |
751 | | * |
752 | | * --------------------- |
753 | | * Translating the CIE: |
754 | | * --------------------- |
755 | | * From `readelf -w`, you might see: |
756 | | * |
757 | | * 00000000 0000000000000010 00000000 CIE |
758 | | * Version: 1 |
759 | | * Augmentation: "zR" |
760 | | * Code alignment factor: 4 |
761 | | * Data alignment factor: -8 |
762 | | * Return address column: 30 |
763 | | * Augmentation data: 1b |
764 | | * DW_CFA_def_cfa: r31 (sp) ofs 0 |
765 | | * |
766 | | * Map this to: |
767 | | * |
768 | | * DWRF_SECTION(CIE, |
769 | | * DWRF_U32(0); // CIE ID (always 0 for CIEs) |
770 | | * DWRF_U8(DWRF_CIE_VERSION); // Version: 1 |
771 | | * DWRF_STR("zR"); // Augmentation string "zR" |
772 | | * DWRF_UV(4); // Code alignment factor = 4 |
773 | | * DWRF_SV(-8); // Data alignment factor = -8 |
774 | | * DWRF_U8(DWRF_REG_RA); // Return address register (e.g., x30 = 30) |
775 | | * DWRF_UV(1); // Augmentation data length = 1 |
776 | | * DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers |
777 | | * |
778 | | * DWRF_U8(DWRF_CFA_def_cfa); // DW_CFA_def_cfa |
779 | | * DWRF_UV(DWRF_REG_SP); // Register: SP (r31) |
780 | | * DWRF_UV(0); // Offset = 0 |
781 | | * |
782 | | * DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer size boundary |
783 | | * ) |
784 | | * |
785 | | * Notes: |
786 | | * - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128. |
787 | | * - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants. |
788 | | * |
789 | | * --------------------- |
790 | | * Translating the FDE: |
791 | | * --------------------- |
792 | | * From `readelf -w`: |
793 | | * |
794 | | * 00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014 |
795 | | * DW_CFA_advance_loc: 4 |
796 | | * DW_CFA_def_cfa_offset: 16 |
797 | | * DW_CFA_offset: r29 at cfa-16 |
798 | | * DW_CFA_offset: r30 at cfa-8 |
799 | | * DW_CFA_advance_loc: 12 |
800 | | * DW_CFA_restore: r30 |
801 | | * DW_CFA_restore: r29 |
802 | | * DW_CFA_def_cfa_offset: 0 |
803 | | * |
804 | | * Map the FDE header and instructions to: |
805 | | * |
806 | | * DWRF_SECTION(FDE, |
807 | | * DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (relative from here) |
808 | | * DWRF_U32(pc_relative_offset); // PC-relative location of the code (calculated dynamically) |
809 | | * DWRF_U32(ctx->code_size); // Code range covered by this FDE |
810 | | * DWRF_U8(0); // Augmentation data length (none) |
811 | | * |
812 | | * DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 unit (1 * 4 = 4 bytes) |
813 | | * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 |
814 | | * DWRF_UV(16); |
815 | | * |
816 | | * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer) |
817 | | * DWRF_UV(2); // At offset 2 * 8 = 16 bytes |
818 | | * |
819 | | * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address) |
820 | | * DWRF_UV(1); // At offset 1 * 8 = 8 bytes |
821 | | * |
822 | | * DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance location by 3 units (3 * 4 = 12 bytes) |
823 | | * |
824 | | * DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA); // Restore x30 (no operand follows)
825 | | * DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP); // Restore x29 (no operand follows)
826 | | * |
827 | | * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP |
828 | | * DWRF_UV(0); |
829 | | * ) |
830 | | * |
831 | | * To regenerate: |
832 | | * 1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE. |
833 | | * 2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as |
834 | | * the code is in a different address space every time. |
835 | | * 3. For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro: |
836 | | * - `DW_CFA_def_cfa_offset` → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value) |
837 | | * - `DW_CFA_offset: rX` → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset) |
838 | | * - `DW_CFA_restore: rX` → DWRF_U8(DWRF_CFA_restore | reg) // primary opcode, register in the low 6 bits, no operand
839 | | * - `DW_CFA_advance_loc: N` → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor)) |
840 | | * 4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers. |
841 | | * 5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment. |
842 | | */ |
843 | | |
844 | | /* |
845 | | * Emit DWARF EH CIE (Common Information Entry) |
846 | | * |
847 | | * The CIE describes the calling conventions and basic unwinding rules |
848 | | * that apply to all functions in this compilation unit. |
849 | | */ |
850 | 0 | DWRF_SECTION(CIE, |
851 | 0 | DWRF_U32(0); // CIE ID (0 indicates this is a CIE) |
852 | 0 | DWRF_U8(DWRF_CIE_VERSION); // CIE version (1) |
853 | 0 | DWRF_STR("zR"); // Augmentation string ('z' = augmentation data present, 'R' = FDE pointer encoding follows)
854 | 0 | #ifdef __x86_64__ |
855 | 0 | DWRF_UV(1); // Code alignment factor (x86_64: 1 byte) |
856 | | #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) |
857 | | DWRF_UV(4); // Code alignment factor (AArch64: 4 bytes per instruction) |
858 | | #endif |
859 | 0 | DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative) |
860 | 0 | DWRF_U8(DWRF_REG_RA); // Return address register number |
861 | 0 | DWRF_UV(1); // Augmentation data length |
862 | 0 | DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // FDE pointer encoding |
863 | | |
864 | | /* Initial CFI instructions - describe default calling convention */ |
865 | 0 | #ifdef __x86_64__ |
866 | | /* x86_64 initial CFI state */ |
867 | 0 | DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address) |
868 | 0 | DWRF_UV(DWRF_REG_SP); // CFA = SP register |
869 | 0 | DWRF_UV(sizeof(uintptr_t)); // CFA = SP + pointer_size |
870 | 0 | DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved |
871 | 0 | DWRF_UV(1); // At offset 1 from CFA |
872 | | #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) |
873 | | /* AArch64 initial CFI state */ |
874 | | DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address) |
875 | | DWRF_UV(DWRF_REG_SP); // CFA = SP register |
876 | | DWRF_UV(0); // CFA = SP + 0 (AArch64 starts with offset 0) |
877 | | // No initial register saves in AArch64 CIE |
878 | | #endif |
879 | 0 | DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary |
880 | 0 | ) |
881 | |
882 | 0 | ctx->eh_frame_p = p; // Remember start of FDE data |
883 | | |
884 | | /* |
885 | | * Emit DWARF EH FDE (Frame Description Entry) |
886 | | * |
887 | | * The FDE describes unwinding information specific to this function. |
888 | | * It references the CIE and provides function-specific CFI instructions. |
889 | | * |
890 | | * The PC-relative offset is calculated after the entire EH frame is built |
891 | | * to ensure accurate positioning relative to the synthesized DSO layout. |
892 | | */ |
893 | 0 | DWRF_SECTION(FDE, |
894 | 0 | DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (backwards reference) |
895 | 0 | ctx->fde_p = p; // Remember where PC offset field is located for later calculation |
896 | 0 | DWRF_U32(0); // Placeholder for PC-relative offset (calculated at end of elf_init_ehframe) |
897 | 0 | DWRF_U32(ctx->code_size); // Address range covered by this FDE (code length) |
898 | 0 | DWRF_U8(0); // Augmentation data length (none) |
899 | | |
900 | | /* |
901 | | * Architecture-specific CFI instructions |
902 | | * |
903 | | * These instructions describe how registers are saved and restored |
904 | | * during function calls. Each architecture has different calling |
905 | | * conventions and register usage patterns. |
906 | | */ |
907 | 0 | #ifdef __x86_64__ |
908 | | /* x86_64 calling convention unwinding rules with frame pointer */ |
909 | | # if defined(__CET__) && (__CET__ & 1) |
910 | | DWRF_U8(DWRF_CFA_advance_loc | 4); // Advance past endbr64 (4 bytes) |
911 | | # endif |
912 | 0 | DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance past push %rbp (1 byte) |
913 | 0 | DWRF_U8(DWRF_CFA_def_cfa_offset); // def_cfa_offset 16 |
914 | 0 | DWRF_UV(16); // New offset: SP + 16 |
915 | 0 | DWRF_U8(DWRF_CFA_offset | DWRF_REG_BP); // offset r6 at cfa-16 |
916 | 0 | DWRF_UV(2); // Offset factor: 2 * 8 = 16 bytes |
917 | 0 | DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance past mov %rsp,%rbp (3 bytes) |
918 | 0 | DWRF_U8(DWRF_CFA_def_cfa_register); // def_cfa_register r6 |
919 | 0 | DWRF_UV(DWRF_REG_BP); // Use base pointer register |
920 | 0 | DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance past call *%rcx (2 bytes) + pop %rbp (1 byte) = 3 |
921 | 0 | DWRF_U8(DWRF_CFA_def_cfa); // def_cfa r7 ofs 8 |
922 | 0 | DWRF_UV(DWRF_REG_SP); // Use stack pointer register |
923 | 0 | DWRF_UV(8); // New offset: SP + 8 |
924 | | #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) |
925 | | /* AArch64 calling convention unwinding rules */ |
926 | | DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance by 1 instruction (4 bytes) |
927 | | DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 |
928 | | DWRF_UV(16); // Stack pointer moved by 16 bytes |
929 | | DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // x29 (frame pointer) saved |
930 | | DWRF_UV(2); // At CFA-16 (2 * 8 = 16 bytes from CFA) |
931 | | DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // x30 (link register) saved |
932 | | DWRF_UV(1); // At CFA-8 (1 * 8 = 8 bytes from CFA) |
933 | | DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance by 3 instructions (12 bytes) |
934 | | DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA); // Restore x30 - NO DWRF_UV() after this! |
935 | | DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP); // Restore x29 - NO DWRF_UV() after this! |
936 | | DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 0 (stack restored) |
937 | | DWRF_UV(0); // Back to original stack position |
938 | | #else |
939 | | # error "Unsupported target architecture" |
940 | | #endif |
941 | |
942 | 0 | DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary |
943 | 0 | ) |
944 | |
945 | 0 | ctx->p = p; // Update context pointer to end of generated data |
946 | | |
947 | | /* Calculate and update the PC-relative offset in the FDE |
948 | | * |
949 | | * When perf processes the jitdump, it creates a synthesized DSO with this layout: |
950 | | * |
951 | | * Synthesized DSO Memory Layout: |
952 | | * ┌─────────────────────────────────────────────────────────────┐ < code_start |
953 | | * │ Code Section │ |
954 | | * │ (round_up(code_size, 8) bytes) │ |
955 | | * ├─────────────────────────────────────────────────────────────┤ < start of EH frame data |
956 | | * │ EH Frame Data │ |
957 | | * │ ┌─────────────────────────────────────────────────────┐ │ |
958 | | * │ │ CIE data │ │ |
959 | | * │ └─────────────────────────────────────────────────────┘ │ |
960 | | * │ ┌─────────────────────────────────────────────────────┐ │ |
961 | | * │ │ FDE Header: │ │ |
962 | | * │ │ - CIE offset (4 bytes) │ │ |
963 | | * │ │ - PC offset (4 bytes) <─ fde_offset_in_frame ─────┼────┼─> points to code_start |
964 | | * │ │ - address range (4 bytes) │ │ (this specific field) |
965 | | * │ │ CFI Instructions... │ │ |
966 | | * │ └─────────────────────────────────────────────────────┘ │ |
967 | | * ├─────────────────────────────────────────────────────────────┤ < reference_point |
968 | | * │ EhFrameHeader │ |
969 | | * │ (navigation metadata) │ |
970 | | * └─────────────────────────────────────────────────────────────┘ |
971 | | * |
972 | | * The PC offset field in the FDE must contain the distance from itself to code_start: |
973 | | * |
974 | | * distance = code_start - fde_pc_field |
975 | | * |
976 | | * Where: |
977 | | * fde_pc_field_location = reference_point - eh_frame_size + fde_offset_in_frame |
978 | | * code_start_location = reference_point - eh_frame_size - round_up(code_size, 8) |
979 | | * |
980 | | * Therefore: |
981 | | * distance = code_start_location - fde_pc_field_location |
982 | | * = (ref - eh_frame_size - rounded_code_size) - (ref - eh_frame_size + fde_offset_in_frame) |
983 | | * = -rounded_code_size - fde_offset_in_frame |
984 | | * = -(round_up(code_size, 8) + fde_offset_in_frame) |
985 | | * |
986 | | * Note: fde_offset_in_frame is the offset from EH frame start to the PC offset field, |
987 | | * |
988 | | */ |
989 | 0 | if (ctx->fde_p != NULL) { |
990 | 0 | int32_t fde_offset_in_frame = (ctx->fde_p - ctx->startp); |
991 | 0 | int32_t rounded_code_size = round_up(ctx->code_size, 8); |
992 | 0 | int32_t pc_relative_offset = -(rounded_code_size + fde_offset_in_frame); |
993 | | |
994 | | |
995 | | // Update the PC-relative offset in the FDE |
996 | 0 | *(int32_t*)ctx->fde_p = pc_relative_offset; |
997 | 0 | } |
998 | 0 | } |
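| | /*
| |  * Worked example with made-up sizes: for a 0x46-byte trampoline,
| |  * round_up(0x46, 8) == 0x48; if the PC offset field sits 0x1c bytes into the
| |  * EH frame, the stored value is -(0x48 + 0x1c) == -0x64, i.e. the FDE points
| |  * back past the EH frame and the rounded code blob to code_start.
| |  */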
999 | | |
1000 | | // ============================================================================= |
1001 | | // JITDUMP INITIALIZATION |
1002 | | // ============================================================================= |
1003 | | |
1004 | | /* |
1005 | | * Initialize the perf jitdump interface |
1006 | | * |
1007 | | * This function sets up everything needed to generate jitdump files: |
1008 | | * 1. Creates the jitdump file with a unique name |
1009 | | * 2. Maps the first page to signal perf that we're using the interface |
1010 | | * 3. Writes the jitdump header |
1011 | | * 4. Initializes synchronization primitives |
1012 | | * |
1013 | | * The memory mapping is crucial - perf detects jitdump files by scanning |
1014 | | * for processes that have mapped files matching the pattern /tmp/jit-*.dump |
1015 | | * |
1016 | | * Returns: Pointer to initialized state, or NULL on failure |
1017 | | */ |
1018 | 0 | static void* perf_map_jit_init(void) { |
1019 | 0 | char filename[100]; |
1020 | 0 | int pid = getpid(); |
1021 | | |
1022 | | /* Create unique filename based on process ID */ |
1023 | 0 | snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid); |
1024 | | |
1025 | | /* Create/open the jitdump file with appropriate permissions */ |
1026 | 0 | const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666); |
1027 | 0 | if (fd == -1) { |
1028 | 0 | return NULL; // Failed to create file |
1029 | 0 | } |
1030 | | |
1031 | | /* Get system page size for memory mapping */ |
1032 | 0 | const long page_size = sysconf(_SC_PAGESIZE); |
1033 | 0 | if (page_size == -1) { |
1034 | 0 | close(fd); |
1035 | 0 | return NULL; // Failed to get page size |
1036 | 0 | } |
1037 | | |
1038 | | /* |
1039 | | * Map the first page of the jitdump file |
1040 | | * |
1041 | | * This memory mapping serves as a signal to perf that this process |
1042 | | * is generating JIT code. Perf scans /proc/.../maps looking for mapped |
1043 | | * files that match the jitdump naming pattern. |
1044 | | * |
1045 | | * The mapping must be PROT_READ | PROT_EXEC to be detected by perf. |
1046 | | */ |
1047 | 0 | perf_jit_map_state.mapped_buffer = mmap( |
1048 | 0 | NULL, // Let kernel choose address |
1049 | 0 | page_size, // Map one page |
1050 | 0 | PROT_READ | PROT_EXEC, // Read and execute permissions (required by perf) |
1051 | 0 | MAP_PRIVATE, // Private mapping |
1052 | 0 | fd, // File descriptor |
1053 | 0 | 0 // Offset 0 (first page) |
1054 | 0 | ); |
1055 | |
1056 | 0 | if (perf_jit_map_state.mapped_buffer == MAP_FAILED) { // mmap() reports failure as MAP_FAILED, not NULL
1057 | 0 | close(fd); |
1058 | 0 | return NULL; // Memory mapping failed |
1059 | 0 | } |
1060 | | |
1061 | 0 | perf_jit_map_state.mapped_size = page_size; |
1062 | | |
1063 | | /* Convert file descriptor to FILE* for easier I/O operations */ |
1064 | 0 | perf_jit_map_state.perf_map = fdopen(fd, "w+"); |
1065 | 0 | if (perf_jit_map_state.perf_map == NULL) { |
1066 | 0 | close(fd); |
1067 | 0 | return NULL; // Failed to create FILE* |
1068 | 0 | } |
1069 | | |
1070 | | /* |
1071 | | * Set up file buffering for better performance |
1072 | | * |
1073 | | * We use a large buffer (2MB) because jitdump files can be written |
1074 | | * frequently during program execution. Buffering reduces system call |
1075 | | * overhead and improves overall performance. |
1076 | | */ |
1077 | 0 | setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB); |
1078 | | |
1079 | | /* Write the jitdump file header */ |
1080 | 0 | perf_map_jit_write_header(pid, perf_jit_map_state.perf_map); |
1081 | | |
1082 | | /* |
1083 | | * Initialize thread synchronization lock |
1084 | | * |
1085 | | * Multiple threads may attempt to write to the jitdump file |
1086 | | * simultaneously. This lock ensures thread-safe access to the |
1087 | | * global jitdump state. |
1088 | | */ |
1089 | 0 | perf_jit_map_state.map_lock = PyThread_allocate_lock(); |
1090 | 0 | if (perf_jit_map_state.map_lock == NULL) { |
1091 | 0 | fclose(perf_jit_map_state.perf_map); |
1092 | 0 | return NULL; // Failed to create lock |
1093 | 0 | } |
1094 | | |
1095 | | /* Initialize code ID counter */ |
1096 | 0 | perf_jit_map_state.code_id = 0; |
1097 | | |
1098 | | /* Calculate padding size based on actual unwind info requirements */ |
1099 | 0 | size_t eh_frame_size = calculate_eh_frame_size(); |
1100 | 0 | size_t unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; |
1101 | 0 | trampoline_api.code_padding = round_up(unwind_data_size, 16); |
1102 | 0 | trampoline_api.code_alignment = 32; |
1103 | |
1104 | 0 | return &perf_jit_map_state; |
1105 | 0 | } |
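| | /*
| |  * Padding arithmetic, with illustrative sizes: sizeof(EhFrameHeader) is 20
| |  * bytes on the supported targets, so an EH frame of, say, 0x3c bytes gives
| |  * unwind_data_size = 20 + 0x3c = 0x50 and code_padding = round_up(0x50, 16)
| |  * = 0x50, consistent with the constant mentioned in the file header comment.
| |  */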
1106 | | |
1107 | | // ============================================================================= |
1108 | | // MAIN JITDUMP ENTRY WRITING |
1109 | | // ============================================================================= |
1110 | | |
1111 | | /* |
1112 | | * Write a complete jitdump entry for a Python function |
1113 | | * |
1114 | | * This is the main function called by Python's trampoline system whenever |
1115 | | * a new piece of JIT-compiled code needs to be recorded. It writes both |
1116 | | * the unwinding information and the code load event to the jitdump file. |
1117 | | * |
1118 | | * The function performs these steps: |
1119 | | * 1. Initialize jitdump system if not already done |
1120 | | * 2. Extract function name and filename from Python code object |
1121 | | * 3. Generate DWARF unwinding information |
1122 | | * 4. Write unwinding info event to jitdump file |
1123 | | * 5. Write code load event to jitdump file |
1124 | | * |
1125 | | * Args: |
1126 | | * state: Jitdump state (currently unused, uses global state) |
1127 | | * code_addr: Address where the compiled code resides |
1128 | | * code_size: Size of the compiled code in bytes |
1129 | | * co: Python code object containing metadata |
1130 | | * |
1131 | | * IMPORTANT: This function signature is part of Python's internal API |
1132 | | * and must not be changed without coordinating with core Python development. |
1133 | | */ |
1134 | | static void perf_map_jit_write_entry(void *state, const void *code_addr, |
1135 | | unsigned int code_size, PyCodeObject *co) |
1136 | 0 | { |
1137 | | /* Initialize jitdump system on first use */ |
1138 | 0 | if (perf_jit_map_state.perf_map == NULL) { |
1139 | 0 | void* ret = perf_map_jit_init(); |
1140 | 0 | if(ret == NULL){ |
1141 | 0 | return; // Initialization failed, silently abort |
1142 | 0 | } |
1143 | 0 | } |
1144 | | |
1145 | | /* |
1146 | | * Extract function information from Python code object |
1147 | | * |
1148 | | * We create a human-readable function name by combining the qualified |
1149 | | * name (includes class/module context) with the filename. This helps |
1150 | | * developers identify functions in perf reports. |
1151 | | */ |
1152 | 0 | const char *entry = ""; |
1153 | 0 | if (co->co_qualname != NULL) { |
1154 | 0 | entry = PyUnicode_AsUTF8(co->co_qualname); |
1155 | 0 | } |
1156 | |
1157 | 0 | const char *filename = ""; |
1158 | 0 | if (co->co_filename != NULL) { |
1159 | 0 | filename = PyUnicode_AsUTF8(co->co_filename); |
1160 | 0 | } |
1161 | | |
1162 | | /* |
1163 | | * Create formatted function name for perf display |
1164 | | * |
1165 | | * Format: "py::<function_name>:<filename>" |
1166 | | * The "py::" prefix helps identify Python functions in mixed-language |
1167 | | * profiles (e.g., when profiling C extensions alongside Python code). |
1168 | | */ |
1169 | 0 | size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1; |
1170 | 0 | char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size); |
1171 | 0 | if (perf_map_entry == NULL) { |
1172 | 0 | return; // Memory allocation failed |
1173 | 0 | } |
1174 | 0 | snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename); |
1175 | |
1176 | 0 | const size_t name_length = strlen(perf_map_entry); |
1177 | 0 | uword base = (uword)code_addr; |
1178 | 0 | uword size = code_size; |
1179 | | |
1180 | | /* |
1181 | | * Generate DWARF unwinding information |
1182 | | * |
1183 | | * DWARF data is essential for proper stack unwinding during profiling. |
1184 | | * Without it, perf cannot generate accurate call graphs, especially |
1185 | | * in optimized code where frame pointers may be omitted. |
1186 | | */ |
1187 | 0 | ELFObjectContext ctx; |
1188 | 0 | char buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) |
1189 | 0 | ctx.code_size = code_size; |
1190 | 0 | ctx.startp = ctx.p = (uint8_t*)buffer; |
1191 | 0 | ctx.fde_p = NULL; // Initialize to NULL, will be set when FDE is written |
1192 | | |
1193 | | /* Generate EH frame (Exception Handling frame) data */ |
1194 | 0 | elf_init_ehframe(&ctx); |
1195 | 0 | int eh_frame_size = ctx.p - ctx.startp; |
1196 | | |
1197 | | /* |
1198 | | * Write Code Unwinding Information Event |
1199 | | * |
1200 | | * This event must be written before the code load event to ensure |
1201 | | * perf has the unwinding information available when it processes |
1202 | | * the code region. |
1203 | | */ |
1204 | 0 | CodeUnwindingInfoEvent ev2; |
1205 | 0 | ev2.base.event = PerfUnwindingInfo; |
1206 | 0 | ev2.base.time_stamp = get_current_monotonic_ticks(); |
1207 | 0 | ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; |
1208 | | |
1209 | | /* Verify we don't exceed our padding budget */ |
1210 | 0 | assert(ev2.unwind_data_size <= (uint64_t)trampoline_api.code_padding); |
1211 | |
1212 | 0 | ev2.eh_frame_hdr_size = sizeof(EhFrameHeader); |
1213 | 0 | ev2.mapped_size = round_up(ev2.unwind_data_size, 16); // 16-byte alignment |
1214 | | |
1215 | | /* Calculate total event size with padding */ |
1216 | 0 | int content_size = sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size; |
1217 | 0 | int padding_size = round_up(content_size, 8) - content_size; // 8-byte align |
1218 | 0 | ev2.base.size = content_size + padding_size; |
1219 | | |
1220 | | /* Write the unwinding info event header */ |
1221 | 0 | perf_map_jit_write_fully(&ev2, sizeof(ev2)); |
1222 | | |
1223 | | /* |
1224 | | * Write EH Frame Header |
1225 | | * |
1226 | | * The EH frame header provides metadata about the DWARF unwinding |
1227 | | * information that follows. It includes pointers and counts that |
1228 | | * help perf navigate the unwinding data efficiently. |
1229 | | */ |
1230 | 0 | EhFrameHeader f; |
1231 | 0 | f.version = 1; |
1232 | 0 | f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel; // PC-relative signed 4-byte |
1233 | 0 | f.fde_count_enc = DwarfUData4; // Unsigned 4-byte count |
1234 | 0 | f.table_enc = DwarfSData4 | DwarfDataRel; // Data-relative signed 4-byte |
1235 | | |
1236 | | /* Calculate relative offsets for EH frame navigation */ |
1237 | 0 | f.eh_frame_ptr = -(eh_frame_size + 4 * sizeof(unsigned char)); |
1238 | 0 | f.eh_fde_count = 1; // We generate exactly one FDE per function |
1239 | 0 | f.from = -(round_up(code_size, 8) + eh_frame_size); |
1240 | |
1241 | 0 | int cie_size = ctx.eh_frame_p - ctx.startp; |
1242 | 0 | f.to = -(eh_frame_size - cie_size); |
1243 | | |
1244 | | /* Write EH frame data and header */ |
1245 | 0 | perf_map_jit_write_fully(ctx.startp, eh_frame_size); |
1246 | 0 | perf_map_jit_write_fully(&f, sizeof(f)); |
1247 | | |
1248 | | /* Write padding to maintain alignment */ |
1249 | 0 | char padding_bytes[] = "\0\0\0\0\0\0\0\0"; |
1250 | 0 | perf_map_jit_write_fully(&padding_bytes, padding_size); |
1251 | | |
1252 | | /* |
1253 | | * Write Code Load Event |
1254 | | * |
1255 | | * This event tells perf about the new code region. It includes: |
1256 | | * - Memory addresses and sizes |
1257 | | * - Process and thread identification |
1258 | | * - Function name for symbol resolution |
1259 | | * - The actual machine code bytes |
1260 | | */ |
1261 | 0 | CodeLoadEvent ev; |
1262 | 0 | ev.base.event = PerfLoad; |
1263 | 0 | ev.base.size = sizeof(ev) + (name_length+1) + size; |
1264 | 0 | ev.base.time_stamp = get_current_monotonic_ticks(); |
1265 | 0 | ev.process_id = getpid(); |
1266 | 0 | ev.thread_id = syscall(SYS_gettid); // Get thread ID via system call |
1267 | 0 | ev.vma = base; // Virtual memory address |
1268 | 0 | ev.code_address = base; // Same as VMA for our use case |
1269 | 0 | ev.code_size = size; |
1270 | | |
1271 | | /* Assign unique code ID and increment counter */ |
1272 | 0 | perf_jit_map_state.code_id += 1; |
1273 | 0 | ev.code_id = perf_jit_map_state.code_id; |
1274 | | |
1275 | | /* Write code load event and associated data */ |
1276 | 0 | perf_map_jit_write_fully(&ev, sizeof(ev)); |
1277 | 0 | perf_map_jit_write_fully(perf_map_entry, name_length+1); // Include null terminator |
1278 | 0 | perf_map_jit_write_fully((void*)(base), size); // Copy actual machine code |
1279 | | |
1280 | | /* Clean up allocated memory */ |
1281 | 0 | PyMem_RawFree(perf_map_entry); |
1282 | 0 | } |
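| | /*
| |  * For reference, the data written by a single call above lands in the file
| |  * in this order:
| |  *
| |  *   [CodeUnwindingInfoEvent ev2]
| |  *   [DWARF eh_frame emitted by elf_init_ehframe]
| |  *   [EhFrameHeader f]
| |  *   [zero padding up to an 8-byte boundary]
| |  *   [CodeLoadEvent ev]
| |  *   ["py::<qualname>:<filename>\0"]
| |  *   [machine code bytes copied from code_addr]
| |  */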
1283 | | |
1284 | | // ============================================================================= |
1285 | | // CLEANUP AND FINALIZATION |
1286 | | // ============================================================================= |
1287 | | |
1288 | | /* |
1289 | | * Finalize and cleanup the perf jitdump system |
1290 | | * |
1291 | | * This function is called when Python is shutting down or when the |
1292 | | * perf trampoline system is being disabled. It ensures all resources |
1293 | | * are properly released and all buffered data is flushed to disk. |
1294 | | * |
1295 | | * Args: |
1296 | | * state: Jitdump state (currently unused, uses global state) |
1297 | | * |
1298 | | * Returns: 0 on success |
1299 | | * |
1300 | | * IMPORTANT: This function signature is part of Python's internal API |
1301 | | * and must not be changed without coordinating with core Python development. |
1302 | | */ |
1303 | 0 | static int perf_map_jit_fini(void* state) { |
1304 | | /* |
1305 | | * Close jitdump file with proper synchronization |
1306 | | * |
1307 | | * We need to acquire the lock to ensure no other threads are |
1308 | | * writing to the file when we close it. This prevents corruption |
1309 | | * and ensures all data is properly flushed. |
1310 | | */ |
1311 | 0 | if (perf_jit_map_state.perf_map != NULL) { |
1312 | 0 | PyThread_acquire_lock(perf_jit_map_state.map_lock, 1); |
1313 | 0 | fclose(perf_jit_map_state.perf_map); // This also flushes buffers |
1314 | 0 | PyThread_release_lock(perf_jit_map_state.map_lock); |
1315 | | |
1316 | | /* Clean up synchronization primitive */ |
1317 | 0 | PyThread_free_lock(perf_jit_map_state.map_lock); |
1318 | 0 | perf_jit_map_state.perf_map = NULL; |
1319 | 0 | } |
1320 | | |
1321 | | /* |
1322 | | * Unmap the memory region |
1323 | | * |
1324 | | * This removes the signal to perf that we were generating JIT code. |
1325 | | * After this point, perf will no longer detect this process as |
1326 | | * having JIT capabilities. |
1327 | | */ |
1328 | 0 | if (perf_jit_map_state.mapped_buffer != NULL) { |
1329 | 0 | munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size); |
1330 | 0 | perf_jit_map_state.mapped_buffer = NULL; |
1331 | 0 | } |
1332 | | |
1333 | | /* Clear global state reference */ |
1334 | 0 | trampoline_api.state = NULL; |
1335 | |
1336 | 0 | return 0; // Success |
1337 | 0 | } |
1338 | | |
1339 | | // ============================================================================= |
1340 | | // PUBLIC API EXPORT |
1341 | | // ============================================================================= |
1342 | | |
1343 | | /* |
1344 | | * Python Perf Callbacks Structure |
1345 | | * |
1346 | | * This structure defines the callback interface that Python's trampoline |
1347 | | * system uses to integrate with perf profiling. It contains function |
1348 | | * pointers for initialization, event writing, and cleanup. |
1349 | | * |
1350 | | * CRITICAL: This structure and its contents are part of Python's internal |
1351 | | * API. The function signatures and behavior must remain stable to maintain |
1352 | | * compatibility with the Python interpreter's perf integration system. |
1353 | | * |
1354 | | * Used by: Python's _PyPerf_Callbacks system in pycore_ceval.h |
1355 | | */ |
1356 | | _PyPerf_Callbacks _Py_perfmap_jit_callbacks = { |
1357 | | &perf_map_jit_init, // Initialization function |
1358 | | &perf_map_jit_write_entry, // Event writing function |
1359 | | &perf_map_jit_fini, // Cleanup function |
1360 | | }; |
1361 | | |
1362 | | #endif /* PY_HAVE_PERF_TRAMPOLINE */ |