Coverage Report

Created: 2026-02-26 06:53

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Python/optimizer.c
Line
Count
Source
1
#include "Python.h"
2
3
#ifdef _Py_TIER2
4
5
#include "opcode.h"
6
#include "pycore_interp.h"
7
#include "pycore_backoff.h"
8
#include "pycore_bitutils.h"        // _Py_popcount32()
9
#include "pycore_ceval.h"       // _Py_set_eval_breaker_bit
10
#include "pycore_code.h"            // _Py_GetBaseCodeUnit
11
#include "pycore_interpframe.h"
12
#include "pycore_object.h"          // _PyObject_GC_UNTRACK()
13
#include "pycore_opcode_metadata.h" // _PyOpcode_OpName[]
14
#include "pycore_opcode_utils.h"  // MAX_REAL_OPCODE
15
#include "pycore_optimizer.h"     // _Py_uop_analyze_and_optimize()
16
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
17
#include "pycore_tuple.h"         // _PyTuple_FromArraySteal
18
#include "pycore_unicodeobject.h" // _PyUnicode_FromASCII
19
#include "pycore_uop_ids.h"
20
#include "pycore_jit.h"
21
#include <stdbool.h>
22
#include <stdint.h>
23
#include <stddef.h>
24
25
#define NEED_OPCODE_METADATA
26
#include "pycore_uop_metadata.h" // Uop tables
27
#undef NEED_OPCODE_METADATA
28
29
#define MAX_EXECUTORS_SIZE 256
30
31
// Trace too short, no progress:
32
// _START_EXECUTOR
33
// _MAKE_WARM
34
// _CHECK_VALIDITY
35
// _SET_IP
36
// is 4-5 instructions.
37
#define CODE_SIZE_NO_PROGRESS 5
38
// We start with _START_EXECUTOR, _MAKE_WARM
39
#define CODE_SIZE_EMPTY 2
40
41
#define _PyExecutorObject_CAST(op)  ((_PyExecutorObject *)(op))
42
43
#ifndef Py_GIL_DISABLED
44
static bool
45
has_space_for_executor(PyCodeObject *code, _Py_CODEUNIT *instr)
46
{
47
    if (code == (PyCodeObject *)&_Py_InitCleanup) {
48
        return false;
49
    }
50
    if (instr->op.code == ENTER_EXECUTOR) {
51
        return true;
52
    }
53
    if (code->co_executors == NULL) {
54
        return true;
55
    }
56
    return code->co_executors->size < MAX_EXECUTORS_SIZE;
57
}
58
59
static int32_t
get_index_for_executor(PyCodeObject *code, _Py_CODEUNIT *instr)
{
    /* Return the co_executors slot index to use for `instr`, growing the
     * executor array if it is full.  Returns -1 on memory error; no
     * exception is set here (the caller treats -1 as best-effort OOM). */
    if (instr->op.code == ENTER_EXECUTOR) {
        // Replacing an existing executor: reuse its slot.
        return instr->op.arg;
    }
    _PyExecutorArray *old = code->co_executors;
    int size = 0;
    int capacity = 0;
    if (old != NULL) {
        size = old->size;
        capacity = old->capacity;
        assert(size < MAX_EXECUTORS_SIZE);
    }
    assert(size <= capacity);
    if (size == capacity) {
        /* Array is full. Grow array */
        int new_capacity = capacity ? capacity * 2 : 4;
        // Note: `old` may be NULL here; PyMem_Realloc(NULL, n) allocates.
        _PyExecutorArray *new = PyMem_Realloc(
            old,
            offsetof(_PyExecutorArray, executors) +
            new_capacity * sizeof(_PyExecutorObject *));
        if (new == NULL) {
            // `old` is still valid and still owned by the code object.
            return -1;
        }
        new->capacity = new_capacity;
        new->size = size;
        code->co_executors = new;
    }
    assert(size < code->co_executors->capacity);
    return size;
}
91
92
static void
insert_executor(PyCodeObject *code, _Py_CODEUNIT *instr, int index, _PyExecutorObject *executor)
{
    /* Install `executor` into `code` at slot `index` and patch `instr`
     * into an ENTER_EXECUTOR instruction.  Takes a new reference to the
     * executor (owned by the code object's executor array). */
    Py_INCREF(executor);
    if (instr->op.code == ENTER_EXECUTOR) {
        // Replacing: detach the executor previously occupying this slot.
        assert(index == instr->op.arg);
        _Py_ExecutorDetach(code->co_executors->executors[index]);
    }
    else {
        // Appending: index must be the next free slot.
        assert(code->co_executors->size == index);
        assert(code->co_executors->capacity > index);
        code->co_executors->size++;
    }
    // Remember the original instruction so it can be restored on detach.
    executor->vm_data.opcode = instr->op.code;
    executor->vm_data.oparg = instr->op.arg;
    executor->vm_data.code = code;
    executor->vm_data.index = (int)(instr - _PyCode_CODE(code));
    code->co_executors->executors[index] = executor;
    assert(index < MAX_EXECUTORS_SIZE);
    // Patch the bytecode last, after all executor bookkeeping is in place.
    instr->op.code = ENTER_EXECUTOR;
    instr->op.arg = index;
}
114
#endif // Py_GIL_DISABLED
115
116
static _PyExecutorObject *
117
make_executor_from_uops(_PyThreadStateImpl *tstate, _PyUOpInstruction *buffer, int length, const _PyBloomFilter *dependencies);
118
119
static int
120
uop_optimize(_PyInterpreterFrame *frame, PyThreadState *tstate,
121
             _PyExecutorObject **exec_ptr,
122
             bool progress_needed);
123
124
/* Returns 1 if optimized, 0 if not optimized, and -1 for an error.
125
 * If optimized, *executor_ptr contains a new reference to the executor
126
 */
127
// gh-137573: inlining this function causes stack overflows
128
Py_NO_INLINE int
_PyOptimizer_Optimize(
    _PyInterpreterFrame *frame, PyThreadState *tstate)
{
    /* Compile the recorded trace into an executor and (when progress is
     * required) insert it into the bytecode.  Returns 1 if optimized,
     * 0 if not, -1 on error.  On success with progress_needed the code
     * object owns the executor; otherwise the side exit holds the
     * reference. */
    _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
    PyInterpreterState *interp = _PyInterpreterState_GET();
    if (!interp->jit) {
        // gh-140936: It is possible that interp->jit will become false during
        // interpreter finalization. However, the specialized JUMP_BACKWARD_JIT
        // instruction may still be present. In this case, we should
        // return immediately without optimization.
        return 0;
    }
    _PyExecutorObject *prev_executor = _tstate->jit_tracer_state->initial_state.executor;
    if (prev_executor != NULL && !prev_executor->vm_data.valid) {
        // gh-143604: If we are a side exit executor and the original executor is no
        // longer valid, don't compile to prevent a reference leak.
        return 0;
    }
    assert(!interp->compiling);
    assert(_tstate->jit_tracer_state->initial_state.stack_depth >= 0);
#ifndef Py_GIL_DISABLED
    assert(_tstate->jit_tracer_state->initial_state.func != NULL);
    interp->compiling = true;
    // The first executor in a chain and the MAX_CHAIN_DEPTH'th executor *must*
    // make progress in order to avoid infinite loops or excessively-long
    // side-exit chains. We can only insert the executor into the bytecode if
    // this is true, since a deopt won't infinitely re-enter the executor:
    int chain_depth = _tstate->jit_tracer_state->initial_state.chain_depth;
    chain_depth %= MAX_CHAIN_DEPTH;
    bool progress_needed = chain_depth == 0;
    PyCodeObject *code = (PyCodeObject *)_tstate->jit_tracer_state->initial_state.code;
    _Py_CODEUNIT *start = _tstate->jit_tracer_state->initial_state.start_instr;
    if (progress_needed && !has_space_for_executor(code, start)) {
        interp->compiling = false;
        return 0;
    }
    _PyExecutorObject *executor;
    int err = uop_optimize(frame, tstate, &executor, progress_needed);
    if (err <= 0) {
        // 0 = not optimized, -1 = error; either way nothing to install.
        interp->compiling = false;
        return err;
    }
    assert(executor != NULL);
    if (progress_needed) {
        int index = get_index_for_executor(code, start);
        if (index < 0) {
            /* Out of memory. Don't raise and assume that the
             * error will show up elsewhere.
             *
             * If an optimizer has already produced an executor,
             * it might get confused by the executor disappearing,
             * but there is not much we can do about that here. */
            Py_DECREF(executor);
            interp->compiling = false;
            return 0;
        }
        insert_executor(code, start, index, executor);
    }
    executor->vm_data.chain_depth = chain_depth;
    assert(executor->vm_data.valid);
    _PyExitData *exit = _tstate->jit_tracer_state->initial_state.exit;
    if (exit != NULL && !progress_needed) {
        // Attach to the parent executor's side exit (exit now owns the ref).
        exit->executor = executor;
    }
    else {
        // An executor inserted into the code object now has a strong reference
        // to it from the code object. Thus, we don't need this reference anymore.
        Py_DECREF(executor);
    }
    interp->compiling = false;
    return 1;
#else
    // Free-threaded build: executor insertion is not supported.
    return 0;
#endif
}
204
205
static _PyExecutorObject *
206
get_executor_lock_held(PyCodeObject *code, int offset)
207
{
208
    int code_len = (int)Py_SIZE(code);
209
    for (int i = 0 ; i < code_len;) {
210
        if (_PyCode_CODE(code)[i].op.code == ENTER_EXECUTOR && i*2 == offset) {
211
            int oparg = _PyCode_CODE(code)[i].op.arg;
212
            _PyExecutorObject *res = code->co_executors->executors[oparg];
213
            Py_INCREF(res);
214
            return res;
215
        }
216
        i += _PyInstruction_GetLength(code, i);
217
    }
218
    PyErr_SetString(PyExc_ValueError, "no executor at given byte offset");
219
    return NULL;
220
}
221
222
_PyExecutorObject *
_Py_GetExecutor(PyCodeObject *code, int offset)
{
    /* Public lookup: return a new reference to the executor installed at
     * byte `offset` in `code`, holding the code object's critical section
     * for free-threaded safety.  Returns NULL with ValueError set if no
     * executor is installed there. */
    _PyExecutorObject *executor;
    Py_BEGIN_CRITICAL_SECTION(code);
    executor = get_executor_lock_held(code, offset);
    Py_END_CRITICAL_SECTION();
    return executor;
}
231
232
static PyObject *
is_valid(PyObject *self, PyObject *Py_UNUSED(ignored))
{
    /* Executor method: True while vm_data.valid is set (i.e. the executor
     * has not been invalidated).  Uses _PyExecutorObject_CAST for
     * consistency with the rest of this file. */
    return PyBool_FromLong(_PyExecutorObject_CAST(self)->vm_data.valid);
}
237
238
static PyObject *
get_opcode(PyObject *self, PyObject *Py_UNUSED(ignored))
{
    /* Executor method: the opcode recorded in vm_data (the instruction
     * this executor replaced).  Uses _PyExecutorObject_CAST for
     * consistency with the rest of this file. */
    return PyLong_FromUnsignedLong(_PyExecutorObject_CAST(self)->vm_data.opcode);
}
243
244
static PyObject *
get_oparg(PyObject *self, PyObject *Py_UNUSED(ignored))
{
    /* Executor method: the oparg recorded in vm_data.  Uses
     * _PyExecutorObject_CAST for consistency with the rest of this file. */
    return PyLong_FromUnsignedLong(_PyExecutorObject_CAST(self)->vm_data.oparg);
}
249
250
///////////////////// Experimental UOp Optimizer /////////////////////
251
252
static int executor_clear(PyObject *executor);
253
254
void
_PyExecutor_Free(_PyExecutorObject *self)
{
    /* Final teardown of an executor: release any JIT-compiled machine
     * code first, then the object itself.  Called only once the executor
     * is unreachable (see _Py_ClearExecutorDeletionList). */
#ifdef _Py_JIT
    _PyJIT_Free(self);
#endif
    PyObject_GC_Del(self);
}
262
263
static void executor_invalidate(PyObject *op);
264
265
static void
266
executor_clear_exits(_PyExecutorObject *executor)
267
{
268
    _PyExecutorObject *cold = _PyExecutor_GetColdExecutor();
269
    _PyExecutorObject *cold_dynamic = _PyExecutor_GetColdDynamicExecutor();
270
    for (uint32_t i = 0; i < executor->exit_count; i++) {
271
        _PyExitData *exit = &executor->exits[i];
272
        exit->temperature = initial_unreachable_backoff_counter();
273
        _PyExecutorObject *old = executor->exits[i].executor;
274
        exit->executor = exit->is_dynamic ? cold_dynamic : cold;
275
        Py_DECREF(old);
276
    }
277
}
278
279
280
void
_Py_ClearExecutorDeletionList(PyInterpreterState *interp)
{
    /* Sweep the pending-deletion list: free executors whose refcount has
     * dropped to zero, keep the rest for a later sweep.  Executors that
     * are currently running on some thread are pinned with an extra
     * reference for the duration of the sweep so they cannot be freed
     * out from under that thread. */
    if (interp->executor_deletion_list_head == NULL) {
        return;
    }
    _PyRuntimeState *runtime = &_PyRuntime;
    HEAD_LOCK(runtime);
    PyThreadState* ts = PyInterpreterState_ThreadHead(interp);
    while (ts) {
        // Pin each thread's currently-executing executor (may be NULL).
        _PyExecutorObject *current = (_PyExecutorObject *)ts->current_executor;
        Py_XINCREF(current);
        ts = ts->next;
    }
    HEAD_UNLOCK(runtime);
    _PyExecutorObject *keep_list = NULL;
    do {
        _PyExecutorObject *exec = interp->executor_deletion_list_head;
        interp->executor_deletion_list_head = exec->vm_data.links.next;
        if (Py_REFCNT(exec) == 0) {
            _PyExecutor_Free(exec);
        } else {
            // Still referenced: push onto the list of survivors.
            exec->vm_data.links.next = keep_list;
            keep_list = exec;
        }
    } while (interp->executor_deletion_list_head != NULL);
    interp->executor_deletion_list_head = keep_list;
    HEAD_LOCK(runtime);
    ts = PyInterpreterState_ThreadHead(interp);
    while (ts) {
        // Drop the pinning references taken above.
        _PyExecutorObject *current = (_PyExecutorObject *)ts->current_executor;
        if (current != NULL) {
            Py_DECREF((PyObject *)current);
        }
        ts = ts->next;
    }
    HEAD_UNLOCK(runtime);
}
318
319
static void
add_to_pending_deletion_list(_PyExecutorObject *self)
{
    /* Queue an executor for deferred freeing by
     * _Py_ClearExecutorDeletionList.  Idempotent: an executor already on
     * the list is not added twice. */
    if (self->vm_data.pending_deletion) {
        return;
    }
    self->vm_data.pending_deletion = 1;
    PyInterpreterState *interp = PyInterpreterState_Get();
    // Singly-linked push onto the interpreter's deletion list head.
    self->vm_data.links.previous = NULL;
    self->vm_data.links.next = interp->executor_deletion_list_head;
    interp->executor_deletion_list_head = self;
}
331
332
static void
uop_dealloc(PyObject *op) {
    /* tp_dealloc: invalidate (detaching from code and dependencies), then
     * defer the actual free via the pending-deletion list — another thread
     * may still be executing this executor's code. */
    _PyExecutorObject *self = _PyExecutorObject_CAST(op);
    executor_invalidate(op);
    assert(self->vm_data.code == NULL);
    add_to_pending_deletion_list(self);
}
339
340
const char *
341
_PyUOpName(int index)
342
{
343
    if (index < 0 || index > MAX_UOP_REGS_ID) {
344
        return NULL;
345
    }
346
    return _PyOpcode_uop_name[index];
347
}
348
349
#ifdef Py_DEBUG
350
void
351
_PyUOpPrint(const _PyUOpInstruction *uop)
352
{
353
    const char *name = _PyUOpName(uop->opcode);
354
    if (name == NULL) {
355
        printf("<uop %d>", uop->opcode);
356
    }
357
    else {
358
        printf("%s", name);
359
    }
360
    switch(uop->format) {
361
        case UOP_FORMAT_TARGET:
362
            printf(" (%d, target=%d, operand0=%#" PRIx64 ", operand1=%#" PRIx64,
363
                uop->oparg,
364
                uop->target,
365
                (uint64_t)uop->operand0,
366
                (uint64_t)uop->operand1);
367
            break;
368
        case UOP_FORMAT_JUMP:
369
            printf(" (%d, jump_target=%d, operand0=%#" PRIx64 ", operand1=%#" PRIx64,
370
                uop->oparg,
371
                uop->jump_target,
372
                (uint64_t)uop->operand0,
373
                (uint64_t)uop->operand1);
374
            break;
375
        default:
376
            printf(" (%d, Unknown format)", uop->oparg);
377
    }
378
    if (_PyUop_Flags[_PyUop_Uncached[uop->opcode]] & HAS_ERROR_FLAG) {
379
        printf(", error_target=%d", uop->error_target);
380
    }
381
382
    printf(")");
383
}
384
#endif
385
386
static Py_ssize_t
387
uop_len(PyObject *op)
388
{
389
    _PyExecutorObject *self = _PyExecutorObject_CAST(op);
390
    return self->code_size;
391
}
392
393
static PyObject *
394
uop_item(PyObject *op, Py_ssize_t index)
395
{
396
    _PyExecutorObject *self = _PyExecutorObject_CAST(op);
397
    Py_ssize_t len = uop_len(op);
398
    if (index < 0 || index >= len) {
399
        PyErr_SetNone(PyExc_IndexError);
400
        return NULL;
401
    }
402
    int opcode = self->trace[index].opcode;
403
    int base_opcode = _PyUop_Uncached[opcode];
404
    const char *name = _PyUOpName(base_opcode);
405
    if (name == NULL) {
406
        name = "<nil>";
407
    }
408
    PyObject *oname = _PyUnicode_FromASCII(name, strlen(name));
409
    if (oname == NULL) {
410
        return NULL;
411
    }
412
    PyObject *oparg = PyLong_FromUnsignedLong(self->trace[index].oparg);
413
    if (oparg == NULL) {
414
        Py_DECREF(oname);
415
        return NULL;
416
    }
417
    PyObject *target = PyLong_FromUnsignedLong(self->trace[index].target);
418
    if (target == NULL) {
419
        Py_DECREF(oparg);
420
        Py_DECREF(oname);
421
        return NULL;
422
    }
423
    PyObject *operand = PyLong_FromUnsignedLongLong(self->trace[index].operand0);
424
    if (operand == NULL) {
425
        Py_DECREF(target);
426
        Py_DECREF(oparg);
427
        Py_DECREF(oname);
428
        return NULL;
429
    }
430
    PyObject *args[4] = { oname, oparg, target, operand };
431
    return _PyTuple_FromArraySteal(args, 4);
432
}
433
434
/* Sequence protocol for executors: len() and indexing yield uop tuples. */
PySequenceMethods uop_as_sequence = {
    .sq_length = uop_len,
    .sq_item = uop_item,
};
438
439
static int
executor_traverse(PyObject *o, visitproc visit, void *arg)
{
    /* tp_traverse: the only PyObject references an executor owns are the
     * executors attached to its side exits. */
    _PyExecutorObject *executor = _PyExecutorObject_CAST(o);
    for (uint32_t i = 0; i < executor->exit_count; i++) {
        Py_VISIT(executor->exits[i].executor);
    }
    return 0;
}
448
449
static PyObject *
get_jit_code(PyObject *self, PyObject *Py_UNUSED(ignored))
{
    /* Executor method: return the JIT-compiled machine code as bytes,
     * None if nothing was compiled, or raise RuntimeError when the
     * interpreter was built without JIT support. */
#ifndef _Py_JIT
    PyErr_SetString(PyExc_RuntimeError, "JIT support not enabled.");
    return NULL;
#else
    _PyExecutorObject *executor = _PyExecutorObject_CAST(self);
    if (executor->jit_code == NULL || executor->jit_size == 0) {
        Py_RETURN_NONE;
    }
    return PyBytes_FromStringAndSize(executor->jit_code, executor->jit_size);
#endif
}
463
464
/* Introspection methods exposed on executor objects. */
static PyMethodDef uop_executor_methods[] = {
    { "is_valid", is_valid, METH_NOARGS, NULL },
    { "get_jit_code", get_jit_code, METH_NOARGS, NULL},
    { "get_opcode", get_opcode, METH_NOARGS, NULL },
    { "get_oparg", get_oparg, METH_NOARGS, NULL },
    { NULL, NULL },  // sentinel
};
471
472
static int
473
executor_is_gc(PyObject *o)
474
{
475
    return !_Py_IsImmortal(o);
476
}
477
478
/* Type object for Tier-2 uop executors.  Variable-size: the exit array
 * follows the fixed header (tp_basicsize up to `exits`, tp_itemsize 1
 * byte so the allocator can size the trailing data exactly). */
PyTypeObject _PyUOpExecutor_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    .tp_name = "uop_executor",
    .tp_basicsize = offsetof(_PyExecutorObject, exits),
    .tp_itemsize = 1,
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC,
    .tp_dealloc = uop_dealloc,
    .tp_as_sequence = &uop_as_sequence,
    .tp_methods = uop_executor_methods,
    .tp_traverse = executor_traverse,
    .tp_clear = executor_clear,
    .tp_is_gc = executor_is_gc,
};
491
492
/* TO DO -- Generate these tables */
493
/* Tier-1 uops that must be swapped for a tier-2-specific variant when
 * they are copied into a trace (indexed by original uop id). */
static const uint16_t
_PyUOp_Replacements[MAX_UOP_ID + 1] = {
    [_ITER_JUMP_RANGE] = _GUARD_NOT_EXHAUSTED_RANGE,
    [_ITER_JUMP_LIST] = _GUARD_NOT_EXHAUSTED_LIST,
    [_ITER_JUMP_TUPLE] = _GUARD_NOT_EXHAUSTED_TUPLE,
    [_FOR_ITER] = _FOR_ITER_TIER_TWO,
    [_ITER_NEXT_LIST] = _ITER_NEXT_LIST_TIER_TWO,
    [_CHECK_PERIODIC_AT_END] = _TIER2_RESUME_CHECK,
};
502
503
/* Set of uops that act as the exhaustion test of a for-iter loop
 * (indexed by uop id; 1 = is such a test). */
static const uint8_t
is_for_iter_test[MAX_UOP_ID + 1] = {
    [_GUARD_NOT_EXHAUSTED_RANGE] = 1,
    [_GUARD_NOT_EXHAUSTED_LIST] = 1,
    [_GUARD_NOT_EXHAUSTED_TUPLE] = 1,
    [_FOR_ITER_TIER_TWO] = 1,
};
510
511
/* Map a conditional branch opcode (offset from POP_JUMP_IF_FALSE) and
 * whether the jump was taken ([0] = not taken, [1] = taken) to the guard
 * uop asserting that the same direction is taken again. */
static const uint16_t
BRANCH_TO_GUARD[4][2] = {
    [POP_JUMP_IF_FALSE - POP_JUMP_IF_FALSE][0] = _GUARD_IS_TRUE_POP,
    [POP_JUMP_IF_FALSE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_FALSE_POP,
    [POP_JUMP_IF_TRUE - POP_JUMP_IF_FALSE][0] = _GUARD_IS_FALSE_POP,
    [POP_JUMP_IF_TRUE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_TRUE_POP,
    [POP_JUMP_IF_NONE - POP_JUMP_IF_FALSE][0] = _GUARD_IS_NOT_NONE_POP,
    [POP_JUMP_IF_NONE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_NONE_POP,
    [POP_JUMP_IF_NOT_NONE - POP_JUMP_IF_FALSE][0] = _GUARD_IS_NONE_POP,
    [POP_JUMP_IF_NOT_NONE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_NOT_NONE_POP,
};
522
523
/* Map frame-transfer uops to the variant that also guards the
 * instruction pointer (indexed by uop id; 0 = no IP-guarding variant). */
static const uint16_t
guard_ip_uop[MAX_UOP_ID + 1] = {
    [_PUSH_FRAME] = _GUARD_IP__PUSH_FRAME,
    [_RETURN_GENERATOR] = _GUARD_IP_RETURN_GENERATOR,
    [_RETURN_VALUE] = _GUARD_IP_RETURN_VALUE,
    [_YIELD_VALUE] = _GUARD_IP_YIELD_VALUE,
};
530
531
532
#define CONFIDENCE_RANGE 1000
533
#define CONFIDENCE_CUTOFF 333
534
535
#ifdef Py_DEBUG
// Wrapped in do { } while (0) so that DPRINTF(...) always expands to a
// single statement: the previous bare-`if` expansion could capture a
// following `else` (dangling-else hazard) when used unbraced.
#define DPRINTF(level, ...) \
    do { \
        if (lltrace >= (level)) { \
            printf(__VA_ARGS__); \
        } \
    } while (0)
#else
#define DPRINTF(level, ...) do { } while (0)
#endif
541
542
543
static inline void
544
add_to_trace(
545
    _PyJitUopBuffer *trace,
546
    uint16_t opcode,
547
    uint16_t oparg,
548
    uint64_t operand,
549
    uint32_t target)
550
{
551
    _PyUOpInstruction *inst = trace->next;
552
    inst->opcode = opcode;
553
    inst->format = UOP_FORMAT_TARGET;
554
    inst->target = target;
555
    inst->oparg = oparg;
556
    inst->operand0 = operand;
557
#ifdef Py_STATS
558
    inst->execution_count = 0;
559
#endif
560
    trace->next++;
561
}
562
563
564
#ifdef Py_DEBUG
/* Append a uop to the trace and, at lltrace >= 2, echo it to stdout.
 * NOTE(review): deliberately multi-statement without do/while(0) — call
 * sites in this file always use it as a full braced statement. */
#define ADD_TO_TRACE(OPCODE, OPARG, OPERAND, TARGET) \
    add_to_trace(trace, (OPCODE), (OPARG), (OPERAND), (TARGET)); \
    if (lltrace >= 2) { \
        printf("%4d ADD_TO_TRACE: ", uop_buffer_length(trace)); \
        _PyUOpPrint(uop_buffer_last(trace)); \
        printf("\n"); \
    }
#else
#define ADD_TO_TRACE(OPCODE, OPARG, OPERAND, TARGET) \
    add_to_trace(trace, (OPCODE), (OPARG), (OPERAND), (TARGET))
#endif
576
577
#define INSTR_IP(INSTR, CODE) \
578
    ((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive)))
579
580
581
static int
582
is_terminator(const _PyUOpInstruction *uop)
583
{
584
    int opcode = _PyUop_Uncached[uop->opcode];
585
    return (
586
        opcode == _EXIT_TRACE ||
587
        opcode == _DEOPT ||
588
        opcode == _JUMP_TO_TOP ||
589
        opcode == _DYNAMIC_EXIT
590
    );
591
}
592
593
/* Returns 1 on success (added to trace), 0 on trace end.
594
 */
595
// gh-142543: inlining this function causes stack overflows
596
Py_NO_INLINE int
597
_PyJit_translate_single_bytecode_to_trace(
598
    PyThreadState *tstate,
599
    _PyInterpreterFrame *frame,
600
    _Py_CODEUNIT *next_instr,
601
    int stop_tracing_opcode)
602
{
603
604
#ifdef Py_DEBUG
605
    char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
606
    int lltrace = 0;
607
    if (python_lltrace != NULL && *python_lltrace >= '0') {
608
        lltrace = *python_lltrace - '0';  // TODO: Parse an int and all that
609
    }
610
#endif
611
    _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
612
    _PyJitTracerState *tracer = _tstate->jit_tracer_state;
613
    PyCodeObject *old_code = tracer->prev_state.instr_code;
614
    bool progress_needed = (tracer->initial_state.chain_depth % MAX_CHAIN_DEPTH) == 0;
615
    _PyJitUopBuffer *trace = &tracer->code_buffer;
616
617
    _Py_CODEUNIT *this_instr =  tracer->prev_state.instr;
618
    _Py_CODEUNIT *target_instr = this_instr;
619
    uint32_t target = 0;
620
621
    target = Py_IsNone((PyObject *)old_code)
622
        ? (uint32_t)(target_instr - _Py_INTERPRETER_TRAMPOLINE_INSTRUCTIONS_PTR)
623
        : INSTR_IP(target_instr, old_code);
624
625
    // Rewind EXTENDED_ARG so that we see the whole thing.
626
    // We must point to the first EXTENDED_ARG when deopting.
627
    int oparg = tracer->prev_state.instr_oparg;
628
    int opcode = this_instr->op.code;
629
    int rewind_oparg = oparg;
630
    while (rewind_oparg > 255) {
631
        rewind_oparg >>= 8;
632
        target--;
633
    }
634
635
    if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]] > 0) {
636
        uint16_t backoff = (this_instr + 1)->counter.value_and_backoff;
637
        // adaptive_counter_cooldown is a fresh specialization.
638
        // trigger_backoff_counter is what we set during tracing.
639
        // All tracing backoffs should be freshly specialized or untouched.
640
        // If not, that indicates a deopt during tracing, and
641
        // thus the "actual" instruction executed is not the one that is
642
        // in the instruction stream, but rather the deopt.
643
        // It's important we check for this, as some specializations might make
644
        // no progress (they can immediately deopt after specializing).
645
        // We do this to improve performance, as otherwise a compiled trace
646
        // will just deopt immediately.
647
        if (backoff != adaptive_counter_cooldown().value_and_backoff &&
648
            backoff != trigger_backoff_counter().value_and_backoff) {
649
            OPT_STAT_INC(trace_immediately_deopts);
650
            opcode = _PyOpcode_Deopt[opcode];
651
        }
652
    }
653
654
    // Strange control-flow
655
    bool has_dynamic_jump_taken = OPCODE_HAS_UNPREDICTABLE_JUMP(opcode) &&
656
        (next_instr != this_instr + 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]]);
657
658
    /* Special case the first instruction,
659
    * so that we can guarantee forward progress */
660
    if (progress_needed && uop_buffer_length(&tracer->code_buffer) < CODE_SIZE_NO_PROGRESS) {
661
        if (OPCODE_HAS_EXIT(opcode) || OPCODE_HAS_DEOPT(opcode)) {
662
            opcode = _PyOpcode_Deopt[opcode];
663
        }
664
        assert(!OPCODE_HAS_EXIT(opcode));
665
        assert(!OPCODE_HAS_DEOPT(opcode));
666
    }
667
668
    bool needs_guard_ip = OPCODE_HAS_NEEDS_GUARD_IP(opcode);
669
    if (has_dynamic_jump_taken && !needs_guard_ip) {
670
        DPRINTF(2, "Unsupported: dynamic jump taken %s\n", _PyOpcode_OpName[opcode]);
671
        goto unsupported;
672
    }
673
674
    int is_sys_tracing = (tstate->c_tracefunc != NULL) || (tstate->c_profilefunc != NULL);
675
    if (is_sys_tracing) {
676
        goto done;
677
    }
678
679
    if (stop_tracing_opcode == _DEOPT) {
680
        // gh-143183: It's important we rewind to the last known proper target.
681
        // The current target might be garbage as stop tracing usually indicates
682
        // we are in something that we can't trace.
683
        DPRINTF(2, "Told to stop tracing\n");
684
        goto unsupported;
685
    }
686
    else if (stop_tracing_opcode != 0) {
687
        assert(stop_tracing_opcode == _EXIT_TRACE);
688
        ADD_TO_TRACE(stop_tracing_opcode, 0, 0, target);
689
        goto done;
690
    }
691
692
    DPRINTF(2, "%p %d: %s(%d) %d\n", old_code, target, _PyOpcode_OpName[opcode], oparg, needs_guard_ip);
693
694
#ifdef Py_DEBUG
695
    if (oparg > 255) {
696
        assert(_Py_GetBaseCodeUnit(old_code, target).op.code == EXTENDED_ARG);
697
    }
698
#endif
699
700
    // This happens when a recursive call happens that we can't trace. Such as Python -> C -> Python calls
701
    // If we haven't guarded the IP, then it's untraceable.
702
    if (frame != tracer->prev_state.instr_frame && !needs_guard_ip) {
703
        DPRINTF(2, "Unsupported: unguardable jump taken\n");
704
        goto unsupported;
705
    }
706
707
    if (oparg > 0xFFFF) {
708
        DPRINTF(2, "Unsupported: oparg too large\n");
709
        unsupported:
710
        {
711
            // Rewind to previous instruction and replace with _EXIT_TRACE.
712
            _PyUOpInstruction *curr = uop_buffer_last(trace);
713
            while (curr->opcode != _SET_IP && uop_buffer_length(trace) > 2) {
714
                trace->next--;
715
                curr = uop_buffer_last(trace);
716
            }
717
            assert(curr->opcode == _SET_IP || uop_buffer_length(trace) == 2);
718
            if (curr->opcode == _SET_IP) {
719
                int32_t old_target = (int32_t)uop_get_target(curr);
720
                curr->opcode = _DEOPT;
721
                curr->format = UOP_FORMAT_TARGET;
722
                curr->target = old_target;
723
            }
724
            goto done;
725
        }
726
    }
727
728
    if (opcode == NOP) {
729
        return 1;
730
    }
731
732
    if (opcode == JUMP_FORWARD) {
733
        return 1;
734
    }
735
736
    if (opcode == EXTENDED_ARG) {
737
        return 1;
738
    }
739
740
    // One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT
741
    trace->end -= 2;
742
743
    const struct opcode_macro_expansion *expansion = &_PyOpcode_macro_expansion[opcode];
744
745
    assert(opcode != ENTER_EXECUTOR && opcode != EXTENDED_ARG);
746
    assert(!_PyErr_Occurred(tstate));
747
748
749
    if (OPCODE_HAS_EXIT(opcode)) {
750
        // Make space for side exit
751
        trace->end--;
752
    }
753
    if (OPCODE_HAS_ERROR(opcode)) {
754
        // Make space for error stub
755
        trace->end--;
756
    }
757
    if (OPCODE_HAS_DEOPT(opcode)) {
758
        // Make space for side exit
759
        trace->end--;
760
    }
761
762
    // _GUARD_IP leads to an exit.
763
    trace->end -= needs_guard_ip;
764
765
    int space_needed = expansion->nuops + needs_guard_ip + 2 + (!OPCODE_HAS_NO_SAVE_IP(opcode));
766
    if (uop_buffer_remaining_space(trace) < space_needed) {
767
        DPRINTF(2, "No room for expansions and guards (need %d, got %d)\n",
768
                space_needed, uop_buffer_remaining_space(trace));
769
        OPT_STAT_INC(trace_too_long);
770
        goto done;
771
    }
772
773
    ADD_TO_TRACE(_CHECK_VALIDITY, 0, 0, target);
774
775
    if (!OPCODE_HAS_NO_SAVE_IP(opcode)) {
776
        ADD_TO_TRACE(_SET_IP, 0, (uintptr_t)target_instr, target);
777
    }
778
779
    switch (opcode) {
780
        case POP_JUMP_IF_NONE:
781
        case POP_JUMP_IF_NOT_NONE:
782
        case POP_JUMP_IF_FALSE:
783
        case POP_JUMP_IF_TRUE:
784
        {
785
            _Py_CODEUNIT *computed_next_instr_without_modifiers = target_instr + 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]];
786
            _Py_CODEUNIT *computed_next_instr = computed_next_instr_without_modifiers + (computed_next_instr_without_modifiers->op.code == NOT_TAKEN);
787
            _Py_CODEUNIT *computed_jump_instr = computed_next_instr_without_modifiers + oparg;
788
            assert(next_instr == computed_next_instr || next_instr == computed_jump_instr);
789
            int jump_happened = computed_jump_instr == next_instr;
790
            assert(jump_happened == (target_instr[1].cache & 1));
791
            uint32_t uopcode = BRANCH_TO_GUARD[opcode - POP_JUMP_IF_FALSE][jump_happened];
792
            ADD_TO_TRACE(uopcode, 0, 0, INSTR_IP(jump_happened ? computed_next_instr : computed_jump_instr, old_code));
793
            break;
794
        }
795
        case JUMP_BACKWARD_JIT:
796
            // This is possible as the JIT might have re-activated after it was disabled
797
        case JUMP_BACKWARD_NO_JIT:
798
        case JUMP_BACKWARD:
799
            ADD_TO_TRACE(_CHECK_PERIODIC, 0, 0, target);
800
            _Py_FALLTHROUGH;
801
        case JUMP_BACKWARD_NO_INTERRUPT:
802
        {
803
            if ((next_instr != tracer->initial_state.close_loop_instr) &&
804
                (next_instr != tracer->initial_state.start_instr) &&
805
                uop_buffer_length(&tracer->code_buffer) > CODE_SIZE_NO_PROGRESS &&
806
                // For side exits, we don't want to terminate them early.
807
                tracer->initial_state.exit == NULL &&
808
                // These are coroutines, and we want to unroll those usually.
809
                opcode != JUMP_BACKWARD_NO_INTERRUPT) {
810
                // We encountered a JUMP_BACKWARD but not to the top of our own loop.
811
                // We don't want to continue tracing as we might get stuck in the
812
                // inner loop. Instead, end the trace where the executor of the
813
                // inner loop might start and let the traces rejoin.
814
                OPT_STAT_INC(inner_loop);
815
                ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
816
                uop_buffer_last(trace)->operand1 = true; // is_control_flow
817
                DPRINTF(2, "JUMP_BACKWARD not to top ends trace %p %p %p\n", next_instr,
818
                    tracer->initial_state.close_loop_instr, tracer->initial_state.start_instr);
819
                goto done;
820
            }
821
            break;
822
        }
823
824
        case RESUME:
825
        case RESUME_CHECK:
826
            /* Use a special tier 2 version of RESUME_CHECK to allow traces to
827
             *  start with RESUME_CHECK */
828
            ADD_TO_TRACE(_TIER2_RESUME_CHECK, 0, 0, target);
829
            break;
830
        default:
831
        {
832
            const struct opcode_macro_expansion *expansion = &_PyOpcode_macro_expansion[opcode];
833
            // Reserve space for nuops (+ _SET_IP + _EXIT_TRACE)
834
            int nuops = expansion->nuops;
835
            if (nuops == 0) {
836
                DPRINTF(2, "Unsupported opcode %s\n", _PyOpcode_OpName[opcode]);
837
                goto unsupported;
838
            }
839
            assert(nuops > 0);
840
            uint32_t orig_oparg = oparg;  // For OPARG_TOP/BOTTOM
841
            uint32_t orig_target = target;
842
            for (int i = 0; i < nuops; i++) {
843
                oparg = orig_oparg;
844
                target = orig_target;
845
                uint32_t uop = expansion->uops[i].uop;
846
                uint64_t operand = 0;
847
                // Add one to account for the actual opcode/oparg pair:
848
                int offset = expansion->uops[i].offset + 1;
849
                switch (expansion->uops[i].size) {
850
                    case OPARG_SIMPLE:
851
                        assert(opcode != _JUMP_BACKWARD_NO_INTERRUPT && opcode != JUMP_BACKWARD);
852
                        break;
853
                    case OPARG_CACHE_1:
854
                        operand = read_u16(&this_instr[offset].cache);
855
                        break;
856
                    case OPARG_CACHE_2:
857
                        operand = read_u32(&this_instr[offset].cache);
858
                        break;
859
                    case OPARG_CACHE_4:
860
                        operand = read_u64(&this_instr[offset].cache);
861
                        break;
862
                    case OPARG_TOP:  // First half of super-instr
863
                        assert(orig_oparg <= 255);
864
                        oparg = orig_oparg >> 4;
865
                        break;
866
                    case OPARG_BOTTOM:  // Second half of super-instr
867
                        assert(orig_oparg <= 255);
868
                        oparg = orig_oparg & 0xF;
869
                        break;
870
                    case OPARG_SAVE_RETURN_OFFSET:  // op=_SAVE_RETURN_OFFSET; oparg=return_offset
871
                        oparg = offset;
872
                        assert(uop == _SAVE_RETURN_OFFSET);
873
                        break;
874
                    case OPARG_REPLACED:
875
                        uop = _PyUOp_Replacements[uop];
876
                        assert(uop != 0);
877
878
                        uint32_t next_inst = target + 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]];
879
                        if (uop == _TIER2_RESUME_CHECK) {
880
                            target = next_inst;
881
                        }
882
                        else {
883
                            int extended_arg = orig_oparg > 255;
884
                            uint32_t jump_target = next_inst + orig_oparg + extended_arg;
885
                            assert(_Py_GetBaseCodeUnit(old_code, jump_target).op.code == END_FOR);
886
                            assert(_Py_GetBaseCodeUnit(old_code, jump_target+1).op.code == POP_ITER);
887
                            if (is_for_iter_test[uop]) {
888
                                target = jump_target + 1;
889
                            }
890
                        }
891
                        break;
892
                    case OPERAND1_1:
893
                        assert(uop_buffer_last(trace)->opcode == uop);
894
                        operand = read_u16(&this_instr[offset].cache);
895
                        uop_buffer_last(trace)->operand1 = operand;
896
                        continue;
897
                    case OPERAND1_2:
898
                        assert(uop_buffer_last(trace)->opcode == uop);
899
                        operand = read_u32(&this_instr[offset].cache);
900
                        uop_buffer_last(trace)->operand1 = operand;
901
                        continue;
902
                    case OPERAND1_4:
903
                        assert(uop_buffer_last(trace)->opcode == uop);
904
                        operand = read_u64(&this_instr[offset].cache);
905
                        uop_buffer_last(trace)->operand1 = operand;
906
                        continue;
907
                    default:
908
                        fprintf(stderr,
909
                                "opcode=%d, oparg=%d; nuops=%d, i=%d; size=%d, offset=%d\n",
910
                                opcode, oparg, nuops, i,
911
                                expansion->uops[i].size,
912
                                expansion->uops[i].offset);
913
                        Py_FatalError("garbled expansion");
914
                }
915
                if (uop == _BINARY_OP_INPLACE_ADD_UNICODE) {
916
                    assert(i + 1 == nuops);
917
                    _Py_CODEUNIT *next = target_instr + 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]];
918
                    assert(next->op.code == STORE_FAST);
919
                    operand = next->op.arg;
920
                }
921
                else if (_PyUop_Flags[uop] & HAS_RECORDS_VALUE_FLAG) {
922
                    PyObject *recorded_value = tracer->prev_state.recorded_value;
923
                    tracer->prev_state.recorded_value = NULL;
924
                    operand = (uintptr_t)recorded_value;
925
                }
926
                // All other instructions
927
                ADD_TO_TRACE(uop, oparg, operand, target);
928
            }
929
            break;
930
        }  // End default
931
932
    }  // End switch (opcode)
933
934
    if (needs_guard_ip) {
935
        uint16_t guard_ip = guard_ip_uop[uop_buffer_last(trace)->opcode];
936
        if (guard_ip == 0) {
937
            DPRINTF(1, "Unknown uop needing guard ip %s\n", _PyOpcode_uop_name[uop_buffer_last(trace)->opcode]);
938
            Py_UNREACHABLE();
939
        }
940
        PyObject *code = PyStackRef_AsPyObjectBorrow(frame->f_executable);
941
        Py_INCREF(code);
942
        ADD_TO_TRACE(_RECORD_CODE, 0, (uintptr_t)code, 0);
943
        ADD_TO_TRACE(guard_ip, 0, (uintptr_t)next_instr, 0);
944
        if (PyCode_Check(code)) {
945
            /* Record stack depth, in operand1 */
946
            int stack_depth = (int)(frame->stackpointer - _PyFrame_Stackbase(frame));
947
            uop_buffer_last(trace)->operand1 = stack_depth;
948
            ADD_TO_TRACE(_GUARD_CODE_VERSION, 0, ((PyCodeObject *)code)->co_version, 0);
949
        }
950
    }
951
    // Loop back to the start
952
    int is_first_instr = tracer->initial_state.close_loop_instr == next_instr ||
953
        tracer->initial_state.start_instr == next_instr;
954
    if (is_first_instr && uop_buffer_length(trace) > CODE_SIZE_NO_PROGRESS) {
955
        if (needs_guard_ip) {
956
            ADD_TO_TRACE(_SET_IP, 0, (uintptr_t)next_instr, 0);
957
        }
958
        ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0);
959
        goto done;
960
    }
961
    DPRINTF(2, "Trace continuing\n");
962
    return 1;
963
done:
964
    DPRINTF(2, "Trace done\n");
965
    if (!is_terminator(uop_buffer_last(trace))) {
966
        ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
967
        uop_buffer_last(trace)->operand1 = true; // is_control_flow
968
    }
969
    return 0;
970
}
971
972
// Try to begin a Tier-2 tracing session on this thread.
// Returns 0 for do not enter tracing, 1 on enter tracing.
// On success, the tracer holds strong references to the code object, the
// function, and (optionally) the current executor; they are released in
// _PyJit_FinalizeTracing.
// gh-142543: inlining this function causes stack overflows
Py_NO_INLINE int
_PyJit_TryInitializeTracing(
    PyThreadState *tstate, _PyInterpreterFrame *frame, _Py_CODEUNIT *curr_instr,
    _Py_CODEUNIT *start_instr, _Py_CODEUNIT *close_loop_instr, _PyStackRef *stack_pointer, int chain_depth,
    _PyExitData *exit, int oparg, _PyExecutorObject *current_executor)
{
    _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
    // Lazily allocate the per-thread tracer state on first use.
    if (_tstate->jit_tracer_state == NULL) {
        _tstate->jit_tracer_state = (_PyJitTracerState *)_PyObject_VirtualAlloc(sizeof(_PyJitTracerState));
        if (_tstate->jit_tracer_state == NULL) {
            // Don't error, just go to next instruction.
            return 0;
        }
        _tstate->jit_tracer_state->is_tracing = false;
    }
    _PyJitTracerState *tracer = _tstate->jit_tracer_state;
    // A recursive trace.
    if (tracer->is_tracing) {
        return 0;
    }
    // Oversized opargs cannot be represented in a uop; refuse to trace.
    if (oparg > 0xFFFF) {
        return 0;
    }
    // Only trace frames backed by an actual Python function object.
    PyObject *func = PyStackRef_AsPyObjectBorrow(frame->f_funcobj);
    if (func == NULL || !PyFunction_Check(func)) {
        return 0;
    }
    PyCodeObject *code = _PyFrame_GetCode(frame);
#ifdef Py_DEBUG
    char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
    int lltrace = 0;
    if (python_lltrace != NULL && *python_lltrace >= '0') {
        lltrace = *python_lltrace - '0';  // TODO: Parse an int and all that
    }
    DPRINTF(2,
        "Tracing %s (%s:%d) at byte offset %d at chain depth %d\n",
        PyUnicode_AsUTF8(code->co_qualname),
        PyUnicode_AsUTF8(code->co_filename),
        code->co_firstlineno,
        2 * INSTR_IP(close_loop_instr, code),
        chain_depth);
#endif
    /* Set up tracing buffer*/
    _PyJitUopBuffer *trace = &tracer->code_buffer;
    uop_buffer_init(trace, &tracer->uop_array[0], UOP_MAX_TRACE_LENGTH);
    // Every trace begins with _START_EXECUTOR and _MAKE_WARM
    // (see CODE_SIZE_EMPTY).
    ADD_TO_TRACE(_START_EXECUTOR, 0, (uintptr_t)start_instr, INSTR_IP(start_instr, code));
    ADD_TO_TRACE(_MAKE_WARM, 0, 0, 0);

    // Snapshot the initial state; all references taken here are dropped
    // in _PyJit_FinalizeTracing.
    tracer->initial_state.start_instr = start_instr;
    tracer->initial_state.close_loop_instr = close_loop_instr;
    tracer->initial_state.code = (PyCodeObject *)Py_NewRef(code);
    tracer->initial_state.func = (PyFunctionObject *)Py_NewRef(func);
    tracer->initial_state.executor = (_PyExecutorObject *)Py_XNewRef(current_executor);
    tracer->initial_state.exit = exit;
    tracer->initial_state.stack_depth = (int)(stack_pointer - _PyFrame_Stackbase(frame));
    tracer->initial_state.chain_depth = chain_depth;
    tracer->prev_state.instr_code = (PyCodeObject *)Py_NewRef(_PyFrame_GetCode(frame));
    tracer->prev_state.instr = curr_instr;
    tracer->prev_state.instr_frame = frame;
    tracer->prev_state.instr_oparg = oparg;
    tracer->prev_state.instr_stacklevel = tracer->initial_state.stack_depth;
    tracer->prev_state.recorded_value = NULL;
    // If the starting instruction records a runtime value, capture it now.
    uint8_t record_func_index = _PyOpcode_RecordFunctionIndices[curr_instr->op.code];
    if (record_func_index) {
        _Py_RecordFuncPtr record_func = _PyOpcode_RecordFunctions[record_func_index];
        record_func(frame, stack_pointer, oparg, &tracer->prev_state.recorded_value);
    }
    assert(curr_instr->op.code == JUMP_BACKWARD_JIT || (exit != NULL));
    tracer->initial_state.jump_backward_instr = curr_instr;

    // Put the loop-closing instruction into the triggered state so that its
    // counter does not fire again while we are already tracing.
    if (_PyOpcode_Caches[_PyOpcode_Deopt[close_loop_instr->op.code]]) {
        close_loop_instr[1].counter = trigger_backoff_counter();
    }
    tracer->is_tracing = true;
    return 1;
}
1050
1051
// End the current tracing session and release all state it holds.
// `err > 0` means an executor was successfully created (reset backoff
// counters to their initial values); `err <= 0` means failure (restart the
// backoff so we do not immediately retry).
Py_NO_INLINE void
_PyJit_FinalizeTracing(PyThreadState *tstate, int err)
{
    _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
    _PyJitTracerState *tracer = _tstate->jit_tracer_state;
    // Deal with backoffs
    assert(tracer != NULL);
    _PyExitData *exit = tracer->initial_state.exit;
    if (exit == NULL) {
        // Tracing was started from a JUMP_BACKWARD_JIT instruction:
        // adjust that instruction's inline counter.
        // We hold a strong reference to the code object, so the instruction won't be freed.
        if (err <= 0) {
            _Py_BackoffCounter counter = tracer->initial_state.jump_backward_instr[1].counter;
            tracer->initial_state.jump_backward_instr[1].counter = restart_backoff_counter(counter);
        }
        else {
            tracer->initial_state.jump_backward_instr[1].counter = initial_jump_backoff_counter(&tstate->interp->opt_config);
        }
    }
    else if (tracer->initial_state.executor->vm_data.valid) {
        // Tracing was started from a side exit: adjust its temperature.
        // Likewise, we hold a strong reference to the executor containing this exit, so the exit is guaranteed
        // to be valid to access.
        if (err <= 0) {
            exit->temperature = restart_backoff_counter(exit->temperature);
        }
        else {
            exit->temperature = initial_temperature_backoff_counter(&tstate->interp->opt_config);
        }
    }
    // Drop every reference taken in _PyJit_TryInitializeTracing and reset
    // the uop buffer so the tracer is ready for the next session.
    Py_CLEAR(tracer->initial_state.code);
    Py_CLEAR(tracer->initial_state.func);
    Py_CLEAR(tracer->initial_state.executor);
    Py_CLEAR(tracer->prev_state.instr_code);
    Py_CLEAR(tracer->prev_state.recorded_value);
    uop_buffer_init(&tracer->code_buffer, &tracer->uop_array[0], UOP_MAX_TRACE_LENGTH);
    tracer->is_tracing = false;
}
1087
1088
void
1089
_PyJit_TracerFree(_PyThreadStateImpl *_tstate)
1090
{
1091
    if (_tstate->jit_tracer_state != NULL) {
1092
        _PyObject_VirtualFree(_tstate->jit_tracer_state, sizeof(_PyJitTracerState));
1093
        _tstate->jit_tracer_state = NULL;
1094
    }
1095
}
1096
1097
#undef RESERVE
1098
#undef INSTR_IP
1099
#undef ADD_TO_TRACE
1100
#undef DPRINTF
1101
1102
#define UNSET_BIT(array, bit) (array[(bit)>>5] &= ~(1<<((bit)&31)))
1103
#define SET_BIT(array, bit) (array[(bit)>>5] |= (1<<((bit)&31)))
1104
#define BIT_IS_SET(array, bit) (array[(bit)>>5] & (1<<((bit)&31)))
1105
1106
/* Count the number of unused uops and exits
1107
*/
1108
static int
1109
count_exits(_PyUOpInstruction *buffer, int length)
1110
{
1111
    int exit_count = 0;
1112
    for (int i = 0; i < length; i++) {
1113
        uint16_t base_opcode = _PyUop_Uncached[buffer[i].opcode];
1114
        if (base_opcode == _EXIT_TRACE || base_opcode == _DYNAMIC_EXIT) {
1115
            exit_count++;
1116
        }
1117
    }
1118
    return exit_count;
1119
}
1120
1121
/* The number of cached registers at any exit (`EXIT_IF` or `DEOPT_IF`)
1122
 * This is the number of cached at entries at start, unless the uop is
1123
 * marked as `exit_depth_is_output` in which case it is the number of
1124
 * cached entries at the end */
1125
static int
1126
get_cached_entries_for_side_exit(_PyUOpInstruction *inst)
1127
{
1128
    // Maybe add another generated table for this?
1129
    int base_opcode = _PyUop_Uncached[inst->opcode];
1130
    assert(base_opcode != 0);
1131
    for (int i = 0; i <= MAX_CACHED_REGISTER; i++) {
1132
        const _PyUopTOSentry *entry = &_PyUop_Caching[base_opcode].entries[i];
1133
        if (entry->opcode == inst->opcode) {
1134
            return entry->exit;
1135
        }
1136
    }
1137
    Py_UNREACHABLE();
1138
}
1139
1140
/* Fill `inst` with a synthetic exit stub (exit/deopt/error handler).
 * The stub uses UOP_FORMAT_TARGET and carries no oparg or operand0;
 * operand1 records whether the exit is a control-flow exit. */
static void make_exit(_PyUOpInstruction *inst, int opcode, int target, bool is_control_flow)
{
    assert(opcode > MAX_UOP_ID && opcode <= MAX_UOP_REGS_ID);
    inst->format = UOP_FORMAT_TARGET;
    inst->opcode = opcode;
    inst->target = target;
    inst->oparg = 0;
    inst->operand0 = 0;
    inst->operand1 = is_control_flow;
#ifdef Py_STATS
    inst->execution_count = 0;
#endif
}
1153
1154
/* Convert implicit exits, errors and deopts
 * into explicit ones.
 *
 * Works in two passes over `buffer`:
 *  1. Compact away _NOPs in place.
 *  2. For every uop that can exit/deopt or raise, append a shared exit or
 *     error stub after the trace (at `next_spare`) and point the uop at it
 *     via UOP_FORMAT_JUMP. Consecutive uops with the same target reuse the
 *     same stub.
 * Returns the new total length (trace + appended stubs). */
static int
prepare_for_execution(_PyUOpInstruction *buffer, int length)
{
    // "current_*" track the most recently emitted stub so it can be shared.
    int32_t current_jump = -1;
    int32_t current_jump_target = -1;
    int32_t current_error = -1;
    int32_t current_error_target = -1;
    int32_t current_popped = -1;
    int32_t current_exit_op = -1;
    /* Leaving in NOPs slows down the interpreter and messes up the stats */
    _PyUOpInstruction *copy_to = &buffer[0];
    for (int i = 0; i < length; i++) {
        _PyUOpInstruction *inst = &buffer[i];
        if (inst->opcode != _NOP) {
            if (copy_to != inst) {
                *copy_to = *inst;
            }
            copy_to++;
        }
    }
    length = (int)(copy_to - buffer);
    // Stubs are appended after the compacted trace.
    int next_spare = length;
    for (int i = 0; i < length; i++) {
        _PyUOpInstruction *inst = &buffer[i];
        int base_opcode = _PyUop_Uncached[inst->opcode];
        assert(inst->opcode != _NOP);
        int32_t target = (int32_t)uop_get_target(inst);
        uint16_t exit_flags = _PyUop_Flags[base_opcode] & (HAS_EXIT_FLAG | HAS_DEOPT_FLAG | HAS_PERIODIC_FLAG);
        if (exit_flags) {
            // Choose the kind of exit stub this uop deopts/exits to.
            uint16_t base_exit_op = _EXIT_TRACE;
            if (exit_flags & HAS_DEOPT_FLAG) {
                base_exit_op = _DEOPT;
            }
            else if (exit_flags & HAS_PERIODIC_FLAG) {
                base_exit_op = _HANDLE_PENDING_AND_DEOPT;
            }
            int32_t jump_target = target;
            // IP guards exit to a target only known at runtime.
            if (
                base_opcode == _GUARD_IP__PUSH_FRAME ||
                base_opcode == _GUARD_IP_RETURN_VALUE ||
                base_opcode == _GUARD_IP_YIELD_VALUE ||
                base_opcode == _GUARD_IP_RETURN_GENERATOR ||
                base_opcode == _GUARD_CODE_VERSION
            ) {
                base_exit_op = _DYNAMIC_EXIT;
            }
            // Pick the stub variant matching the number of cached registers
            // live at the exit.
            int exit_depth = get_cached_entries_for_side_exit(inst);
            assert(_PyUop_Caching[base_exit_op].entries[exit_depth].opcode > 0);
            int16_t exit_op = _PyUop_Caching[base_exit_op].entries[exit_depth].opcode;
            bool is_control_flow = (base_opcode == _GUARD_IS_FALSE_POP || base_opcode == _GUARD_IS_TRUE_POP || is_for_iter_test[base_opcode]);
            // Only emit a new stub when target or stub kind changed.
            if (jump_target != current_jump_target || current_exit_op != exit_op) {
                make_exit(&buffer[next_spare], exit_op, jump_target, is_control_flow);
                current_exit_op = exit_op;
                current_jump_target = jump_target;
                current_jump = next_spare;
                next_spare++;
            }
            buffer[i].jump_target = current_jump;
            buffer[i].format = UOP_FORMAT_JUMP;
        }
        if (_PyUop_Flags[base_opcode] & HAS_ERROR_FLAG) {
            // _ERROR_POP_N stubs are shared only when both the target and
            // the number of popped values match.
            int popped = (_PyUop_Flags[base_opcode] & HAS_ERROR_NO_POP_FLAG) ?
                0 : _PyUop_num_popped(base_opcode, inst->oparg);
            if (target != current_error_target || popped != current_popped) {
                current_popped = popped;
                current_error = next_spare;
                current_error_target = target;
                make_exit(&buffer[next_spare], _ERROR_POP_N_r00, 0, false);
                buffer[next_spare].operand0 = target;
                next_spare++;
            }
            buffer[i].error_target = current_error;
            if (buffer[i].format == UOP_FORMAT_TARGET) {
                buffer[i].format = UOP_FORMAT_JUMP;
                buffer[i].jump_target = 0;
            }
        }
        if (base_opcode == _JUMP_TO_TOP) {
            // Jump back to index 1, just past the leading _START_EXECUTOR.
            assert(_PyUop_Uncached[buffer[0].opcode] == _START_EXECUTOR);
            buffer[i].format = UOP_FORMAT_JUMP;
            buffer[i].jump_target = 1;
        }
    }
    return next_spare;
}
1241
1242
/* Executor side exits */
1243
1244
static _PyExecutorObject *
1245
allocate_executor(int exit_count, int length)
1246
{
1247
    int size = exit_count*sizeof(_PyExitData) + length*sizeof(_PyUOpInstruction);
1248
    _PyExecutorObject *res = PyObject_GC_NewVar(_PyExecutorObject, &_PyUOpExecutor_Type, size);
1249
    if (res == NULL) {
1250
        return NULL;
1251
    }
1252
    res->trace = (_PyUOpInstruction *)(res->exits + exit_count);
1253
    res->code_size = length;
1254
    res->exit_count = exit_count;
1255
    return res;
1256
}
1257
1258
#ifdef Py_DEBUG
1259
1260
#define CHECK(PRED) \
1261
if (!(PRED)) { \
1262
    printf(#PRED " at %d\n", i); \
1263
    assert(0); \
1264
}
1265
1266
static int
1267
target_unused(int opcode)
1268
{
1269
    return (_PyUop_Flags[opcode] & (HAS_ERROR_FLAG | HAS_EXIT_FLAG | HAS_DEOPT_FLAG)) == 0;
1270
}
1271
1272
/* Debug-only consistency check on a finished executor:
 * exit targets fit in 25 bits, the trace starts with a valid entry uop,
 * every instruction's format/targets are well-formed, a terminator is
 * reached, and everything after the terminator is an exit/error stub. */
static void
sanity_check(_PyExecutorObject *executor)
{
    for (uint32_t i = 0; i < executor->exit_count; i++) {
        _PyExitData *exit = &executor->exits[i];
        CHECK(exit->target < (1 << 25));
    }
    bool ended = false;
    uint32_t i = 0;
    CHECK(_PyUop_Uncached[executor->trace[0].opcode] == _START_EXECUTOR ||
        _PyUop_Uncached[executor->trace[0].opcode] == _COLD_EXIT ||
        _PyUop_Uncached[executor->trace[0].opcode] == _COLD_DYNAMIC_EXIT);
    for (; i < executor->code_size; i++) {
        const _PyUOpInstruction *inst = &executor->trace[i];
        uint16_t opcode = inst->opcode;
        uint16_t base_opcode = _PyUop_Uncached[opcode];
        // Executors only contain register-cached variants of uops.
        CHECK(opcode > MAX_UOP_ID);
        CHECK(opcode <= MAX_UOP_REGS_ID);
        CHECK(base_opcode <= MAX_UOP_ID);
        CHECK(base_opcode != 0);
        switch(inst->format) {
            case UOP_FORMAT_TARGET:
                CHECK(target_unused(base_opcode));
                break;
            case UOP_FORMAT_JUMP:
                CHECK(inst->jump_target < executor->code_size);
                break;
        }
        if (_PyUop_Flags[base_opcode] & HAS_ERROR_FLAG) {
            // prepare_for_execution converts all error-capable uops to
            // UOP_FORMAT_JUMP with an explicit error_target.
            CHECK(inst->format == UOP_FORMAT_JUMP);
            CHECK(inst->error_target < executor->code_size);
        }
        if (is_terminator(inst)) {
            ended = true;
            i++;
            break;
        }
    }
    CHECK(ended);
    // Everything after the terminator must be an appended stub.
    for (; i < executor->code_size; i++) {
        const _PyUOpInstruction *inst = &executor->trace[i];
        uint16_t base_opcode = _PyUop_Uncached[inst->opcode];
        CHECK(
            base_opcode == _DEOPT ||
            base_opcode == _HANDLE_PENDING_AND_DEOPT ||
            base_opcode == _EXIT_TRACE ||
            base_opcode == _ERROR_POP_N ||
            base_opcode == _DYNAMIC_EXIT);
    }
}
1322
1323
#undef CHECK
1324
#endif
1325
1326
/* Makes an executor from a buffer of uops.
 * Account for the buffer having gaps and NOPs by computing a "used"
 * bit vector and only copying the used uops. Here "used" means reachable
 * and not a NOP.
 *
 * Copies the trace into the executor, wires every _EXIT_TRACE /
 * _DYNAMIC_EXIT up to a _PyExitData slot pointing at the appropriate cold
 * executor, and (under _Py_JIT) compiles the result to machine code.
 * Returns NULL on allocation or JIT-compilation failure. */
static _PyExecutorObject *
make_executor_from_uops(_PyThreadStateImpl *tstate, _PyUOpInstruction *buffer, int length, const _PyBloomFilter *dependencies)
{
    int exit_count = count_exits(buffer, length);
    _PyExecutorObject *executor = allocate_executor(exit_count, length);
    if (executor == NULL) {
        return NULL;
    }

    /* Initialize exits */
    int chain_depth = tstate->jit_tracer_state->initial_state.chain_depth;
    _PyExecutorObject *cold = _PyExecutor_GetColdExecutor();
    _PyExecutorObject *cold_dynamic = _PyExecutor_GetColdDynamicExecutor();
    cold->vm_data.chain_depth = chain_depth;
    PyInterpreterState *interp = tstate->base.interp;
    for (int i = 0; i < exit_count; i++) {
        executor->exits[i].index = i;
        executor->exits[i].temperature = initial_temperature_backoff_counter(&interp->opt_config);
    }
    // Walk the buffer backwards, copying instructions into the executor and
    // assigning exit slots from the last one down.
    int next_exit = exit_count-1;
    _PyUOpInstruction *dest = (_PyUOpInstruction *)&executor->trace[length];
    assert(_PyUop_Uncached[buffer[0].opcode] == _START_EXECUTOR);
    buffer[0].operand0 = (uint64_t)executor;
    for (int i = length-1; i >= 0; i--) {
        uint16_t base_opcode = _PyUop_Uncached[buffer[i].opcode];
        dest--;
        *dest = buffer[i];
        if (base_opcode == _EXIT_TRACE || base_opcode == _DYNAMIC_EXIT) {
            _PyExitData *exit = &executor->exits[next_exit];
            exit->target = buffer[i].target;
            // The exit uop's operand0 points at its _PyExitData slot.
            dest->operand0 = (uint64_t)exit;
            exit->executor = base_opcode == _EXIT_TRACE ? cold : cold_dynamic;
            exit->is_dynamic = (char)(base_opcode == _DYNAMIC_EXIT);
            exit->is_control_flow = (char)buffer[i].operand1;
            next_exit--;
        }
    }
    assert(next_exit == -1);
    assert(dest == executor->trace);
    assert(_PyUop_Uncached[dest->opcode] == _START_EXECUTOR);
    // Note: we MUST track it here before any Py_DECREF(executor) or
    // linking of executor. Otherwise, the GC tries to untrack a
    // still untracked object during dealloc.
    _PyObject_GC_TRACK(executor);
    _Py_ExecutorInit(executor, dependencies);
#ifdef Py_DEBUG
    char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
    int lltrace = 0;
    if (python_lltrace != NULL && *python_lltrace >= '0') {
        lltrace = *python_lltrace - '0';  // TODO: Parse an int and all that
    }
    if (lltrace >= 2) {
        printf("Optimized trace (length %d):\n", length);
        for (int i = 0; i < length; i++) {
            printf("%4d OPTIMIZED: ", i);
            _PyUOpPrint(&executor->trace[i]);
            printf("\n");
        }
    }
    sanity_check(executor);
#endif
#ifdef _Py_JIT
    executor->jit_code = NULL;
    executor->jit_size = 0;
    // This is initialized to false so we can prevent the executor
    // from being immediately detected as cold and invalidated.
    executor->vm_data.cold = false;
    if (_PyJIT_Compile(executor, executor->trace, length)) {
        Py_DECREF(executor);
        return NULL;
    }
#endif
    return executor;
}
1405
1406
#ifdef Py_STATS
1407
/* Returns the effective trace length.
1408
 * Ignores NOPs and trailing exit and error handling.*/
1409
int effective_trace_length(_PyUOpInstruction *buffer, int length)
1410
{
1411
    int nop_count = 0;
1412
    for (int i = 0; i < length; i++) {
1413
        int opcode = buffer[i].opcode;
1414
        if (opcode == _NOP) {
1415
            nop_count++;
1416
        }
1417
        if (is_terminator(&buffer[i])) {
1418
            return i+1-nop_count;
1419
        }
1420
    }
1421
    Py_FatalError("No terminating instruction");
1422
    Py_UNREACHABLE();
1423
}
1424
#endif
1425
1426
1427
/* Rewrite the trace to use register-cached uop variants.
 * Tracks the number of cached top-of-stack registers (`depth`), inserting
 * spill/reload uops where the cached depth must change, and replaces each
 * uop with its variant for the current depth. The input `buffer` and
 * `output` may be the same array; in that case the input is first copied
 * UOP_MAX_TRACE_LENGTH entries forward so writes never clobber unread
 * input. Returns the length of the rewritten trace. */
static int
stack_allocate(_PyUOpInstruction *buffer, _PyUOpInstruction *output, int length)
{
    assert(buffer[0].opcode == _START_EXECUTOR);
    /* The input buffer and output buffers will overlap.
       Make sure that we can move instructions to the output
       without overwriting the input. */
    if (buffer == output) {
        // This can only happen if optimizer has not been run
        for (int i = 0; i < length; i++) {
            buffer[i + UOP_MAX_TRACE_LENGTH] = buffer[i];
        }
        buffer += UOP_MAX_TRACE_LENGTH;
    }
    else {
        assert(output + UOP_MAX_TRACE_LENGTH == buffer);
    }
    int depth = 0;
    _PyUOpInstruction *write = output;
    for (int i = 0; i < length; i++) {
        int uop = buffer[i].opcode;
        if (uop == _NOP) {
            continue;
        }
        // Preferred cached depth for this uop; if it differs from the
        // current depth, emit the spill/reload transition uop first.
        int new_depth = _PyUop_Caching[uop].best[depth];
        if (new_depth != depth) {
            write->opcode = _PyUop_SpillsAndReloads[depth][new_depth];
            assert(write->opcode != 0);
            write->format = UOP_FORMAT_TARGET;
            write->oparg = 0;
            write->target = 0;
            write++;
            depth = new_depth;
        }
        *write = buffer[i];
        // Substitute the register-cached variant for the current depth.
        uint16_t new_opcode = _PyUop_Caching[uop].entries[depth].opcode;
        assert(new_opcode != 0);
        write->opcode = new_opcode;
        write++;
        depth = _PyUop_Caching[uop].entries[depth].output;
    }
    return (int)(write - output);
}
1470
1471
/* Turn the tracer's uop buffer into an executable executor.
 * Pipeline: (optionally) run the Tier-2 optimizer, fix up replicated /
 * value-recording uops, allocate stack registers, materialize exit and
 * error stubs, then build (and JIT-compile) the executor.
 * Returns 1 with *exec_ptr set on success, 0 when no trace is worth
 * compiling, and a negative value on error.
 * NOTE(review): `progress_needed` is not consulted in this body —
 * presumably kept for interface compatibility; confirm against callers. */
static int
uop_optimize(
    _PyInterpreterFrame *frame,
    PyThreadState *tstate,
    _PyExecutorObject **exec_ptr,
    bool progress_needed)
{
    _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
    assert(_tstate->jit_tracer_state != NULL);
    _PyUOpInstruction *buffer = _tstate->jit_tracer_state->code_buffer.start;
    OPT_STAT_INC(attempts);
    bool is_noopt = !tstate->interp->opt_config.uops_optimize_enabled;
    int curr_stackentries = _tstate->jit_tracer_state->initial_state.stack_depth;
    int length = uop_buffer_length(&_tstate->jit_tracer_state->code_buffer);
    // Too short to make progress: don't bother creating an executor.
    if (length <= CODE_SIZE_NO_PROGRESS) {
        return 0;
    }
    assert(length > 0);
    assert(length < UOP_MAX_TRACE_LENGTH);
    OPT_STAT_INC(traces_created);

    _PyBloomFilter dependencies;
    _Py_BloomFilter_Init(&dependencies);
    if (!is_noopt) {
        // The optimizer writes into the second half of uop_array.
        _PyUOpInstruction *output = &_tstate->jit_tracer_state->uop_array[UOP_MAX_TRACE_LENGTH];
        length = _Py_uop_analyze_and_optimize(
            _tstate, buffer, length, curr_stackentries,
            output, &dependencies);

        if (length <= 0) {
            return length;
        }
        buffer = output;
    }
    assert(length < UOP_MAX_TRACE_LENGTH);
    assert(length >= 1);
    /* Fix up */
    for (int pc = 0; pc < length; pc++) {
        int opcode = buffer[pc].opcode;
        int oparg = buffer[pc].oparg;
        if (oparg < _PyUop_Replication[opcode].stop && oparg >= _PyUop_Replication[opcode].start) {
            // Replace the uop with its oparg-specialized replica.
            buffer[pc].opcode = opcode + oparg + 1 - _PyUop_Replication[opcode].start;
            assert(strncmp(_PyOpcode_uop_name[buffer[pc].opcode], _PyOpcode_uop_name[opcode], strlen(_PyOpcode_uop_name[opcode])) == 0);
        }
        else if (_PyUop_Flags[opcode] & HAS_RECORDS_VALUE_FLAG) {
            // Drop the recorded value (owned by this uop) and NOP it out;
            // the NOP is removed later by prepare_for_execution.
            Py_XDECREF((PyObject *)(uintptr_t)buffer[pc].operand0);
            buffer[pc].opcode = _NOP;
        }
        else if (is_terminator(&buffer[pc])) {
            break;
        }
        assert(_PyOpcode_uop_name[buffer[pc].opcode]);
    }
    OPT_HIST(effective_trace_length(buffer, length), optimized_trace_length_hist);
    _PyUOpInstruction *output = &_tstate->jit_tracer_state->uop_array[0];
    length = stack_allocate(buffer, output, length);
    buffer = output;
    length = prepare_for_execution(buffer, length);
    assert(length <= UOP_MAX_TRACE_LENGTH);
    _PyExecutorObject *executor = make_executor_from_uops(
        _tstate, buffer, length, &dependencies);
    if (executor == NULL) {
        return -1;
    }
    assert(length <= UOP_MAX_TRACE_LENGTH);

    // Check executor coldness
    // It's okay if this ends up going negative.
    if (--tstate->interp->executor_creation_counter == 0) {
        _Py_set_eval_breaker_bit(tstate, _PY_EVAL_JIT_INVALIDATE_COLD_BIT);
    }

    *exec_ptr = executor;
    return 1;
}
1546
1547
1548
/*****************************************
1549
 *        Executor management
1550
 ****************************************/
1551
1552
/* We use a bloomfilter with k = 6, m = 256
1553
 * The choice of k and the following constants
1554
 * could do with a more rigorous analysis,
1555
 * but here is a simple analysis:
1556
 *
1557
 * We want to keep the false positive rate low.
1558
 * For n = 5 (a trace depends on 5 objects),
1559
 * we expect 30 bits set, giving a false positive
1560
 * rate of (30/256)**6 == 2.5e-6 which is plenty
1561
 * good enough.
1562
 *
1563
 * However with n = 10 we expect 60 bits set (worst case),
1564
 * giving a false positive of (60/256)**6 == 0.0001
1565
 *
1566
 * We choose k = 6, rather than a higher number as
1567
 * it means the false positive rate grows slower for high n.
1568
 *
1569
 * n = 5, k = 6 => fp = 2.6e-6
1570
 * n = 5, k = 8 => fp = 3.5e-7
1571
 * n = 10, k = 6 => fp = 1.6e-4
1572
 * n = 10, k = 8 => fp = 0.9e-4
1573
 * n = 15, k = 6 => fp = 0.18%
1574
 * n = 15, k = 8 => fp = 0.23%
1575
 * n = 20, k = 6 => fp = 1.1%
1576
 * n = 20, k = 8 => fp = 2.3%
1577
 *
1578
 * The above analysis assumes perfect hash functions,
1579
 * but those don't exist, so the real false positive
1580
 * rates may be worse.
1581
 */
1582
1583
#define K 6
1584
1585
#define SEED 20221211
1586
1587
/* TO DO -- Use more modern hash functions with better distribution of bits */
1588
static uint64_t
1589
address_to_hash(void *ptr) {
1590
    assert(ptr != NULL);
1591
    uint64_t uhash = SEED;
1592
    uintptr_t addr = (uintptr_t)ptr;
1593
    for (int i = 0; i < SIZEOF_VOID_P; i++) {
1594
        uhash ^= addr & 255;
1595
        uhash *= (uint64_t)PyHASH_MULTIPLIER;
1596
        addr >>= 8;
1597
    }
1598
    return uhash;
1599
}
1600
1601
void
1602
_Py_BloomFilter_Init(_PyBloomFilter *bloom)
1603
{
1604
    for (int i = 0; i < _Py_BLOOM_FILTER_WORDS; i++) {
1605
        bloom->bits[i] = 0;
1606
    }
1607
}
1608
1609
/* We want K hash functions that each set 1 bit.
1610
 * A hash function that sets 1 bit in M bits can be trivially
1611
 * derived from a log2(M) bit hash function.
1612
 * So we extract 8 (log2(256)) bits at a time from
1613
 * the 64bit hash. */
1614
void
1615
_Py_BloomFilter_Add(_PyBloomFilter *bloom, void *ptr)
1616
{
1617
    uint64_t hash = address_to_hash(ptr);
1618
    assert(K <= 8);
1619
    for (int i = 0; i < K; i++) {
1620
        uint8_t bits = hash & 255;
1621
        bloom->bits[bits >> 5] |= (1 << (bits&31));
1622
        hash >>= 8;
1623
    }
1624
}
1625
1626
static bool
1627
bloom_filter_may_contain(_PyBloomFilter *bloom, _PyBloomFilter *hashes)
1628
{
1629
    for (int i = 0; i < _Py_BLOOM_FILTER_WORDS; i++) {
1630
        if ((bloom->bits[i] & hashes->bits[i]) != hashes->bits[i]) {
1631
            return false;
1632
        }
1633
    }
1634
    return true;
1635
}
1636
1637
static void
1638
link_executor(_PyExecutorObject *executor)
1639
{
1640
    PyInterpreterState *interp = _PyInterpreterState_GET();
1641
    _PyExecutorLinkListNode *links = &executor->vm_data.links;
1642
    _PyExecutorObject *head = interp->executor_list_head;
1643
    if (head == NULL) {
1644
        interp->executor_list_head = executor;
1645
        links->previous = NULL;
1646
        links->next = NULL;
1647
    }
1648
    else {
1649
        assert(head->vm_data.links.previous == NULL);
1650
        links->previous = NULL;
1651
        links->next = head;
1652
        head->vm_data.links.previous = executor;
1653
        interp->executor_list_head = executor;
1654
    }
1655
    /* executor_list_head must be first in list */
1656
    assert(interp->executor_list_head->vm_data.links.previous == NULL);
1657
}
1658
1659
static void
1660
unlink_executor(_PyExecutorObject *executor)
1661
{
1662
    _PyExecutorLinkListNode *links = &executor->vm_data.links;
1663
    _PyExecutorObject *next = links->next;
1664
    _PyExecutorObject *prev = links->previous;
1665
    if (next != NULL) {
1666
        next->vm_data.links.previous = prev;
1667
    }
1668
    if (prev != NULL) {
1669
        prev->vm_data.links.next = next;
1670
    }
1671
    else {
1672
        // prev == NULL implies that executor is the list head
1673
        PyInterpreterState *interp = PyInterpreterState_Get();
1674
        assert(interp->executor_list_head == executor);
1675
        interp->executor_list_head = next;
1676
    }
1677
}
1678
1679
/* This must be called by optimizers before using the executor */
1680
void
1681
_Py_ExecutorInit(_PyExecutorObject *executor, const _PyBloomFilter *dependency_set)
1682
{
1683
    executor->vm_data.valid = true;
1684
    executor->vm_data.pending_deletion = 0;
1685
    executor->vm_data.code = NULL;
1686
    for (int i = 0; i < _Py_BLOOM_FILTER_WORDS; i++) {
1687
        executor->vm_data.bloom.bits[i] = dependency_set->bits[i];
1688
    }
1689
    link_executor(executor);
1690
}
1691
1692
/* Build a shared, immortal "cold" executor whose trace is the single
 * uop `opcode` (one of the _COLD_*EXIT uops).  Allocation or JIT
 * compilation failure is fatal: the VM cannot run without it. */
static _PyExecutorObject *
make_cold_executor(uint16_t opcode)
{
    _PyExecutorObject *cold = allocate_executor(0, 1);
    if (cold == NULL) {
        Py_FatalError("Cannot allocate core JIT code");
    }
    ((_PyUOpInstruction *)cold->trace)->opcode = opcode;
    // This is initialized to false so we can prevent the executor
    // from being immediately detected as cold and invalidated.
    cold->vm_data.cold = false;
#ifdef _Py_JIT
    cold->jit_code = NULL;
    cold->jit_size = 0;
    if (_PyJIT_Compile(cold, cold->trace, 1)) {
        Py_DECREF(cold);
        Py_FatalError("Cannot allocate core JIT code");
    }
#endif
    /* Immortal: shared by all exits for the lifetime of the interpreter. */
    _Py_SetImmortal((PyObject *)cold);
    return cold;
}
1714
1715
_PyExecutorObject *
1716
_PyExecutor_GetColdExecutor(void)
1717
{
1718
    PyInterpreterState *interp = _PyInterpreterState_GET();
1719
    if (interp->cold_executor == NULL) {
1720
        return interp->cold_executor = make_cold_executor(_COLD_EXIT_r00);;
1721
    }
1722
    return interp->cold_executor;
1723
}
1724
1725
_PyExecutorObject *
1726
_PyExecutor_GetColdDynamicExecutor(void)
1727
{
1728
    PyInterpreterState *interp = _PyInterpreterState_GET();
1729
    if (interp->cold_dynamic_executor == NULL) {
1730
        interp->cold_dynamic_executor = make_cold_executor(_COLD_DYNAMIC_EXIT_r00);
1731
    }
1732
    return interp->cold_dynamic_executor;
1733
}
1734
1735
void
1736
_PyExecutor_ClearExit(_PyExitData *exit)
1737
{
1738
    if (exit == NULL) {
1739
        return;
1740
    }
1741
    _PyExecutorObject *old = exit->executor;
1742
    if (exit->is_dynamic) {
1743
        exit->executor = _PyExecutor_GetColdDynamicExecutor();
1744
    }
1745
    else {
1746
        exit->executor = _PyExecutor_GetColdExecutor();
1747
    }
1748
    Py_DECREF(old);
1749
}
1750
1751
/* Detaches the executor from the code object (if any) that
 * holds a reference to it */
void
_Py_ExecutorDetach(_PyExecutorObject *executor)
{
    PyCodeObject *code = executor->vm_data.code;
    if (code == NULL) {
        /* Not attached to any code object; nothing to undo. */
        return;
    }
    _Py_CODEUNIT *instruction = &_PyCode_CODE(code)[executor->vm_data.index];
    assert(instruction->op.code == ENTER_EXECUTOR);
    int index = instruction->op.arg;
    assert(code->co_executors->executors[index] == executor);
    /* Restore the original opcode/oparg that ENTER_EXECUTOR replaced. */
    instruction->op.code = executor->vm_data.opcode;
    instruction->op.arg = executor->vm_data.oparg;
    executor->vm_data.code = NULL;
    /* Drop the code object's slot and its reference to the executor. */
    code->co_executors->executors[index] = NULL;
    Py_DECREF(executor);
}
1770
1771
/* Executors can be invalidated at any time,
   even with a stop-the-world lock held.
   Consequently it must not run arbitrary code,
   including Py_DECREF with a non-executor. */
static void
executor_invalidate(PyObject *op)
{
    _PyExecutorObject *executor = _PyExecutorObject_CAST(op);
    if (!executor->vm_data.valid) {
        /* Already invalidated; this function is idempotent. */
        return;
    }
    executor->vm_data.valid = 0;
    /* Unlink from the interpreter's executor list, redirect side exits
     * to the shared cold executors, detach from the owning code object
     * (restoring the original instruction), and stop GC tracking. */
    unlink_executor(executor);
    executor_clear_exits(executor);
    _Py_ExecutorDetach(executor);
    _PyObject_GC_UNTRACK(op);
}
1788
1789
/* tp_clear slot: clearing an executor is the same as invalidating it. */
static int
executor_clear(PyObject *op)
{
    executor_invalidate(op);
    return 0;
}
1795
1796
/* Record that `executor` must be invalidated whenever `obj` changes,
 * by adding `obj` to the executor's dependency bloom filter. */
void
_Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj)
{
    assert(executor->vm_data.valid);
    _Py_BloomFilter_Add(&executor->vm_data.bloom, obj);
}
1802
1803
/* Invalidate all executors that depend on `obj`
 * May cause other executors to be invalidated as well
 */
void
_Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation)
{
    /* Build a one-element filter for `obj` to test against each
     * executor's dependency filter (false positives are acceptable). */
    _PyBloomFilter obj_filter;
    _Py_BloomFilter_Init(&obj_filter);
    _Py_BloomFilter_Add(&obj_filter, obj);
    /* Walk the list of executors */
    /* TO DO -- Use a tree to avoid traversing as many objects */
    PyObject *invalidate = PyList_New(0);
    if (invalidate == NULL) {
        goto error;
    }
    /* Clearing an executor can clear others, so we need to make a list of
     * executors to invalidate first */
    for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) {
        assert(exec->vm_data.valid);
        _PyExecutorObject *next = exec->vm_data.links.next;
        /* PyList_Append returns non-zero (-1) on failure. */
        if (bloom_filter_may_contain(&exec->vm_data.bloom, &obj_filter) &&
            PyList_Append(invalidate, (PyObject *)exec))
        {
            goto error;
        }
        exec = next;
    }
    /* Second pass: the list holds a reference to each executor, so it is
     * now safe to invalidate them even if that clears other executors. */
    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) {
        PyObject *exec = PyList_GET_ITEM(invalidate, i);
        executor_invalidate(exec);
        if (is_invalidation) {
            OPT_STAT_INC(executors_invalidated);
        }
    }
    Py_DECREF(invalidate);
    return;
error:
    PyErr_Clear();
    Py_XDECREF(invalidate);
    // If we're truly out of memory, wiping out everything is a fine fallback:
    _Py_Executors_InvalidateAll(interp, is_invalidation);
}
1845
1846
/* Invalidate all executors */
void
_Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation)
{
    /* Each iteration removes the current head from the list (either via
     * _PyCode_Clear_Executors or executor_invalidate), so the loop
     * terminates when the list is empty. */
    while (interp->executor_list_head) {
        _PyExecutorObject *executor = interp->executor_list_head;
        assert(executor->vm_data.valid);
        if (executor->vm_data.code) {
            // Clear the entire code object so its co_executors array be freed:
            _PyCode_Clear_Executors(executor->vm_data.code);
        }
        else {
            executor_invalidate((PyObject *)executor);
        }
        if (is_invalidation) {
            OPT_STAT_INC(executors_invalidated);
        }
    }
}
1865
1866
/* Mark-and-sweep over two invocations: an executor still flagged `cold`
 * from the previous call was not executed since then and is invalidated
 * now; every other executor is flagged `cold` for the next call. */
void
_Py_Executors_InvalidateCold(PyInterpreterState *interp)
{
    /* Walk the list of executors */
    /* TO DO -- Use a tree to avoid traversing as many objects */
    PyObject *invalidate = PyList_New(0);
    if (invalidate == NULL) {
        goto error;
    }

    /* Clearing an executor can deallocate others, so we need to make a list of
     * executors to invalidate first */
    for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) {
        assert(exec->vm_data.valid);
        _PyExecutorObject *next = exec->vm_data.links.next;

        /* When the executor is already cold and the append succeeds, the
         * `else` is skipped — fine, since `cold` is already true. */
        if (exec->vm_data.cold && PyList_Append(invalidate, (PyObject *)exec) < 0) {
            goto error;
        }
        else {
            exec->vm_data.cold = true;
        }

        exec = next;
    }
    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) {
        PyObject *exec = PyList_GET_ITEM(invalidate, i);
        executor_invalidate(exec);
    }
    Py_DECREF(invalidate);
    return;
error:
    PyErr_Clear();
    Py_XDECREF(invalidate);
    // If we're truly out of memory, wiping out everything is a fine fallback
    _Py_Executors_InvalidateAll(interp, 0);
}
1903
1904
#include "record_functions.c.h"
1905
1906
static int
1907
escape_angles(const char *input, Py_ssize_t size, char *buffer) {
1908
    int written = 0;
1909
    for (Py_ssize_t i = 0; i < size; i++) {
1910
        char c = input[i];
1911
        if (c == '<' || c == '>') {
1912
            buffer[written++] = '&';
1913
            buffer[written++] = c == '>' ? 'g' : 'l';
1914
            buffer[written++] = 't';
1915
            buffer[written++] = ';';
1916
        }
1917
        else {
1918
            buffer[written++] = c;
1919
        }
1920
    }
1921
    return written;
1922
}
1923
1924
static void
1925
write_str(PyObject *str, FILE *out)
1926
{
1927
    // Encode the Unicode object to the specified encoding
1928
    PyObject *encoded_obj = PyUnicode_AsEncodedString(str, "utf8", "strict");
1929
    if (encoded_obj == NULL) {
1930
        PyErr_Clear();
1931
        return;
1932
    }
1933
    const char *encoded_str = PyBytes_AsString(encoded_obj);
1934
    Py_ssize_t encoded_size = PyBytes_Size(encoded_obj);
1935
    char buffer[120];
1936
    bool truncated = false;
1937
    if (encoded_size > 24) {
1938
        encoded_size = 24;
1939
        truncated = true;
1940
    }
1941
    int size = escape_angles(encoded_str, encoded_size, buffer);
1942
    fwrite(buffer, 1, size, out);
1943
    if (truncated) {
1944
        fwrite("...", 1, 3, out);
1945
    }
1946
    Py_DECREF(encoded_obj);
1947
}
1948
1949
/* Scan `code` for the ENTER_EXECUTOR instruction that dispatches to
 * `executor` and return its source line number, or -1 if the executor
 * is not attached to this code object. */
static int
find_line_number(PyCodeObject *code, _PyExecutorObject *executor)
{
    int code_len = (int)Py_SIZE(code);
    for (int i = 0; i < code_len; i++) {
        _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i];
        int opcode = instr->op.code;
        if (opcode == ENTER_EXECUTOR) {
            _PyExecutorObject *exec = code->co_executors->executors[instr->op.arg];
            if (exec == executor) {
                /* i is a code-unit index; byte addresses are 2 per unit. */
                return PyCode_Addr2Line(code, i*2);
            }
        }
        /* Skip this instruction's inline cache entries; use the base
         * code unit since `instr` may be instrumented or specialized. */
        i += _PyOpcode_Caches[_Py_GetBaseCodeUnit(code, i).op.code];
    }
    return -1;
}
1966
1967
#define RED "#ff0000"
1968
#define WHITE "#ffffff"
1969
#define BLUE "#0000ff"
1970
#define BLACK "#000000"
1971
#define LOOP "#00c000"
1972
1973
#ifdef Py_STATS
1974
1975
/* Graphviz `greys9` color-scheme entries, indexed by hotness bucket:
 * index 0 ("9", darkest grey) is coldest, rising to index 9 (white)
 * for the hottest uops.  See get_background_color(). */
static const char *COLORS[10] = {
    "9",
    "8",
    "7",
    "6",
    "5",
    "4",
    "3",
    "2",
    "1",
    WHITE,
};
1987
const char *
1988
get_background_color(_PyUOpInstruction const *inst, uint64_t max_hotness)
1989
{
1990
    uint64_t hotness = inst->execution_count;
1991
    int index = (hotness * 10)/max_hotness;
1992
    if (index > 9) {
1993
        index = 9;
1994
    }
1995
    if (index < 0) {
1996
        index = 0;
1997
    }
1998
    return COLORS[index];
1999
}
2000
2001
const char *
2002
get_foreground_color(_PyUOpInstruction const *inst, uint64_t max_hotness)
2003
{
2004
    if(_PyUop_Uncached[inst->opcode] == _DEOPT) {
2005
        return RED;
2006
    }
2007
    uint64_t hotness = inst->execution_count;
2008
    int index = (hotness * 10)/max_hotness;
2009
    if (index > 3) {
2010
        return BLACK;
2011
    }
2012
    return WHITE;
2013
}
2014
#endif
2015
2016
static void
write_row_for_uop(_PyExecutorObject *executor, uint32_t i, FILE *out)
{
    /* Write row for uop.
        * The `port` is a marker so that outgoing edges can
        * be placed correctly. If a row is marked `port=17`,
        * then the outgoing edge is `{EXEC_NAME}:17 -> {TARGET}`
        * https://graphviz.readthedocs.io/en/stable/manual.html#node-ports-compass
        */
    _PyUOpInstruction const *inst = &executor->trace[i];
    const char *opname = _PyOpcode_uop_name[inst->opcode];
#ifdef Py_STATS
    /* With stats, color rows by hotness relative to the first uop and
     * include the execution count in the label. */
    const char *bg_color = get_background_color(inst, executor->trace[0].execution_count);
    const char *color = get_foreground_color(inst, executor->trace[0].execution_count);
    fprintf(out, "        <tr><td port=\"i%d\" border=\"1\" color=\"%s\" bgcolor=\"%s\" ><font color=\"%s\"> %s &nbsp;--&nbsp; %" PRIu64 "</font></td></tr>\n",
        i, color, bg_color, color, opname, inst->execution_count);
#else
    /* Without stats, only distinguish deopting uops (red) and show operand0. */
    const char *color = (_PyUop_Uncached[inst->opcode] == _DEOPT) ? RED : BLACK;
    fprintf(out, "        <tr><td port=\"i%d\" border=\"1\" color=\"%s\" >%s op0=%" PRIu64 "</td></tr>\n", i, color, opname, inst->operand0);
#endif
}
2037
2038
static bool
2039
is_stop(_PyUOpInstruction const *inst)
2040
{
2041
    uint16_t base_opcode = _PyUop_Uncached[inst->opcode];
2042
    return (base_opcode == _EXIT_TRACE || base_opcode == _DEOPT || base_opcode == _JUMP_TO_TOP);
2043
}
2044
2045
2046
/* Writes the node and outgoing edges for a single tracelet in graphviz format.
 * Each tracelet is presented as a table of the uops it contains.
 * If Py_STATS is enabled, execution counts are included.
 *
 * https://graphviz.readthedocs.io/en/stable/manual.html
 * https://graphviz.org/gallery/
 */
static void
executor_to_gv(_PyExecutorObject *executor, FILE *out)
{
    PyCodeObject *code = executor->vm_data.code;
    fprintf(out, "executor_%p [\n", executor);
    fprintf(out, "    shape = none\n");

    /* Write the HTML table for the uops */
    fprintf(out, "    label = <<table border=\"0\" cellspacing=\"0\">\n");
    fprintf(out, "        <tr><td port=\"start\" border=\"1\" ><b>Executor</b></td></tr>\n");
    if (code == NULL) {
        fprintf(out, "        <tr><td border=\"1\" >No code object</td></tr>\n");
    }
    else {
        /* Header row: qualified name and line of the attached code object. */
        fprintf(out, "        <tr><td  border=\"1\" >");
        write_str(code->co_qualname, out);
        int line = find_line_number(code, executor);
        fprintf(out, ": %d</td></tr>\n", line);
    }
    /* One row per uop, stopping at the first trace-terminating uop. */
    for (uint32_t i = 0; i < executor->code_size; i++) {
        write_row_for_uop(executor, i, out);
        if (is_stop(&executor->trace[i])) {
            break;
        }
    }
    fprintf(out, "    </table>>\n");
    fprintf(out, "]\n\n");

    /* Write all the outgoing edges */
    _PyExecutorObject *cold = _PyExecutor_GetColdExecutor();
    _PyExecutorObject *cold_dynamic = _PyExecutor_GetColdDynamicExecutor();
    for (uint32_t i = 0; i < executor->code_size; i++) {
        _PyUOpInstruction const *inst = &executor->trace[i];
        uint16_t base_opcode = _PyUop_Uncached[inst->opcode];
        uint16_t flags = _PyUop_Flags[base_opcode];
        _PyExitData *exit = NULL;
        if (base_opcode == _JUMP_TO_TOP) {
            /* Back-edge within the same executor (loop), drawn in green. */
            fprintf(out, "executor_%p:i%d -> executor_%p:i%d [color = \"" LOOP "\"]\n", executor, i, executor, inst->jump_target);
            break;
        }
        if (base_opcode == _EXIT_TRACE) {
            exit = (_PyExitData *)inst->operand0;
        }
        else if (flags & HAS_EXIT_FLAG) {
            /* Uop with a side exit: follow the jump target to the exit uop. */
            assert(inst->format == UOP_FORMAT_JUMP);
            _PyUOpInstruction const *exit_inst = &executor->trace[inst->jump_target];
            uint16_t base_exit_opcode = _PyUop_Uncached[exit_inst->opcode];
            (void)base_exit_opcode;
            assert(base_exit_opcode == _EXIT_TRACE || base_exit_opcode == _DYNAMIC_EXIT);
            exit = (_PyExitData *)exit_inst->operand0;
        }
        if (exit != NULL) {
            if (exit->executor == cold || exit->executor == cold_dynamic) {
#ifdef Py_STATS
                /* Only mark as have cold exit if it has actually exited */
                uint64_t diff = inst->execution_count - executor->trace[i+1].execution_count;
                if (diff) {
                    fprintf(out, "cold_%p%d [ label = \"%"  PRIu64  "\" shape = ellipse color=\"" BLUE "\" ]\n", executor, i, diff);
                    fprintf(out, "executor_%p:i%d -> cold_%p%d\n", executor, i, executor, i);
                }
#endif
            }
            else {
                /* Edge to the entry row (`start` port) of another executor. */
                fprintf(out, "executor_%p:i%d -> executor_%p:start\n", executor, i, exit->executor);
            }
        }
        if (is_stop(inst)) {
            break;
        }
    }
}
2124
2125
/* Write the graph of all the live tracelets in graphviz format. */
2126
int
2127
_PyDumpExecutors(FILE *out)
2128
{
2129
    fprintf(out, "digraph ideal {\n\n");
2130
    fprintf(out, "    rankdir = \"LR\"\n\n");
2131
    fprintf(out, "    node [colorscheme=greys9]\n");
2132
    PyInterpreterState *interp = PyInterpreterState_Get();
2133
    for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) {
2134
        executor_to_gv(exec, out);
2135
        exec = exec->vm_data.links.next;
2136
    }
2137
    fprintf(out, "}\n\n");
2138
    return 0;
2139
}
2140
2141
#else
2142
2143
/* Stub used when the tier-2 optimizer is not built: executor dumping
 * is unavailable, so report NotImplementedError. */
int
_PyDumpExecutors(FILE *out)
{
    PyErr_SetString(PyExc_NotImplementedError, "No JIT available");
    return -1;
}
2149
2150
/* Stub used when the tier-2 optimizer is not built: no executors are
 * ever created, so freeing one is impossible. */
void
_PyExecutor_Free(struct _PyExecutorObject *self)
{
    /* This should never be called */
    Py_UNREACHABLE();
}
2156
2157
#endif /* _Py_TIER2 */