Coverage Report

Created: 2026-03-23 06:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Python/optimizer.c
Line
Count
Source
1
#include "Python.h"
2
3
#ifdef _Py_TIER2
4
5
#include "opcode.h"
6
#include "pycore_interp.h"
7
#include "pycore_backoff.h"
8
#include "pycore_bitutils.h"        // _Py_popcount32()
9
#include "pycore_ceval.h"       // _Py_set_eval_breaker_bit
10
#include "pycore_code.h"            // _Py_GetBaseCodeUnit
11
#include "pycore_interpframe.h"
12
#include "pycore_object.h"          // _PyObject_GC_UNTRACK()
13
#include "pycore_opcode_metadata.h" // _PyOpcode_OpName[]
14
#include "pycore_opcode_utils.h"  // MAX_REAL_OPCODE
15
#include "pycore_optimizer.h"     // _Py_uop_analyze_and_optimize()
16
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
17
#include "pycore_tuple.h"         // _PyTuple_FromArraySteal
18
#include "pycore_unicodeobject.h" // _PyUnicode_FromASCII
19
#include "pycore_uop_ids.h"
20
#include "pycore_jit.h"
21
#include <stdbool.h>
22
#include <stdint.h>
23
#include <stddef.h>
24
25
#define NEED_OPCODE_METADATA
26
#include "pycore_uop_metadata.h" // Uop tables
27
#undef NEED_OPCODE_METADATA
28
29
#define MAX_EXECUTORS_SIZE 256
30
31
// Trace too short, no progress:
32
// _START_EXECUTOR
33
// _MAKE_WARM
34
// _CHECK_VALIDITY
35
// _SET_IP
36
// is 4-5 instructions.
37
#define CODE_SIZE_NO_PROGRESS 5
38
// We start with _START_EXECUTOR, _MAKE_WARM
39
#define CODE_SIZE_EMPTY 2
40
41
#define _PyExecutorObject_CAST(op)  ((_PyExecutorObject *)(op))
42
43
#ifndef Py_GIL_DISABLED
44
static bool
45
has_space_for_executor(PyCodeObject *code, _Py_CODEUNIT *instr)
46
{
47
    if (code == (PyCodeObject *)&_Py_InitCleanup) {
48
        return false;
49
    }
50
    if (instr->op.code == ENTER_EXECUTOR) {
51
        return true;
52
    }
53
    if (code->co_executors == NULL) {
54
        return true;
55
    }
56
    return code->co_executors->size < MAX_EXECUTORS_SIZE;
57
}
58
59
static int32_t
60
get_index_for_executor(PyCodeObject *code, _Py_CODEUNIT *instr)
61
{
62
    if (instr->op.code == ENTER_EXECUTOR) {
63
        return instr->op.arg;
64
    }
65
    _PyExecutorArray *old = code->co_executors;
66
    int size = 0;
67
    int capacity = 0;
68
    if (old != NULL) {
69
        size = old->size;
70
        capacity = old->capacity;
71
        assert(size < MAX_EXECUTORS_SIZE);
72
    }
73
    assert(size <= capacity);
74
    if (size == capacity) {
75
        /* Array is full. Grow array */
76
        int new_capacity = capacity ? capacity * 2 : 4;
77
        _PyExecutorArray *new = PyMem_Realloc(
78
            old,
79
            offsetof(_PyExecutorArray, executors) +
80
            new_capacity * sizeof(_PyExecutorObject *));
81
        if (new == NULL) {
82
            return -1;
83
        }
84
        new->capacity = new_capacity;
85
        new->size = size;
86
        code->co_executors = new;
87
    }
88
    assert(size < code->co_executors->capacity);
89
    return size;
90
}
91
92
static void
93
insert_executor(PyCodeObject *code, _Py_CODEUNIT *instr, int index, _PyExecutorObject *executor)
94
{
95
    Py_INCREF(executor);
96
    if (instr->op.code == ENTER_EXECUTOR) {
97
        assert(index == instr->op.arg);
98
        _Py_ExecutorDetach(code->co_executors->executors[index]);
99
    }
100
    else {
101
        assert(code->co_executors->size == index);
102
        assert(code->co_executors->capacity > index);
103
        code->co_executors->size++;
104
    }
105
    executor->vm_data.opcode = instr->op.code;
106
    executor->vm_data.oparg = instr->op.arg;
107
    executor->vm_data.code = code;
108
    executor->vm_data.index = (int)(instr - _PyCode_CODE(code));
109
    code->co_executors->executors[index] = executor;
110
    assert(index < MAX_EXECUTORS_SIZE);
111
    instr->op.code = ENTER_EXECUTOR;
112
    instr->op.arg = index;
113
}
114
#endif // Py_GIL_DISABLED
115
116
static _PyExecutorObject *
117
make_executor_from_uops(_PyThreadStateImpl *tstate, _PyUOpInstruction *buffer, int length, const _PyBloomFilter *dependencies);
118
119
static int
120
uop_optimize(_PyInterpreterFrame *frame, PyThreadState *tstate,
121
             _PyExecutorObject **exec_ptr,
122
             bool progress_needed);
123
124
/* Returns 1 if optimized, 0 if not optimized, and -1 for an error.
125
 * If optimized, *executor_ptr contains a new reference to the executor
126
 */
127
// gh-137573: inlining this function causes stack overflows
128
Py_NO_INLINE int
129
_PyOptimizer_Optimize(
130
    _PyInterpreterFrame *frame, PyThreadState *tstate)
131
{
132
    _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
133
    PyInterpreterState *interp = _PyInterpreterState_GET();
134
    if (!interp->jit) {
135
        // gh-140936: It is possible that interp->jit will become false during
136
        // interpreter finalization. However, the specialized JUMP_BACKWARD_JIT
137
        // instruction may still be present. In this case, we should
138
        // return immediately without optimization.
139
        return 0;
140
    }
141
    _PyExecutorObject *prev_executor = _tstate->jit_tracer_state->initial_state.executor;
142
    if (prev_executor != NULL && !prev_executor->vm_data.valid) {
143
        // gh-143604: If we are a side exit executor and the original executor is no
144
        // longer valid, don't compile to prevent a reference leak.
145
        return 0;
146
    }
147
    assert(!interp->compiling);
148
    assert(_tstate->jit_tracer_state->initial_state.stack_depth >= 0);
149
#ifndef Py_GIL_DISABLED
150
    assert(_tstate->jit_tracer_state->initial_state.func != NULL);
151
    interp->compiling = true;
152
    // The first executor in a chain and the MAX_CHAIN_DEPTH'th executor *must*
153
    // make progress in order to avoid infinite loops or excessively-long
154
    // side-exit chains. We can only insert the executor into the bytecode if
155
    // this is true, since a deopt won't infinitely re-enter the executor:
156
    int chain_depth = _tstate->jit_tracer_state->initial_state.chain_depth;
157
    chain_depth %= MAX_CHAIN_DEPTH;
158
    bool progress_needed = chain_depth == 0;
159
    PyCodeObject *code = (PyCodeObject *)_tstate->jit_tracer_state->initial_state.code;
160
    _Py_CODEUNIT *start = _tstate->jit_tracer_state->initial_state.start_instr;
161
    if (progress_needed && !has_space_for_executor(code, start)) {
162
        interp->compiling = false;
163
        return 0;
164
    }
165
    _PyExecutorObject *executor;
166
    int err = uop_optimize(frame, tstate, &executor, progress_needed);
167
    if (err <= 0) {
168
        interp->compiling = false;
169
        return err;
170
    }
171
    assert(executor != NULL);
172
    if (progress_needed) {
173
        int index = get_index_for_executor(code, start);
174
        if (index < 0) {
175
            /* Out of memory. Don't raise and assume that the
176
             * error will show up elsewhere.
177
             *
178
             * If an optimizer has already produced an executor,
179
             * it might get confused by the executor disappearing,
180
             * but there is not much we can do about that here. */
181
            Py_DECREF(executor);
182
            interp->compiling = false;
183
            return 0;
184
        }
185
        insert_executor(code, start, index, executor);
186
    }
187
    executor->vm_data.chain_depth = chain_depth;
188
    assert(executor->vm_data.valid);
189
    _PyExitData *exit = _tstate->jit_tracer_state->initial_state.exit;
190
    if (exit != NULL && !progress_needed) {
191
        exit->executor = executor;
192
    }
193
    else {
194
        // An executor inserted into the code object now has a strong reference
195
        // to it from the code object. Thus, we don't need this reference anymore.
196
        Py_DECREF(executor);
197
    }
198
    interp->compiling = false;
199
    return 1;
200
#else
201
    return 0;
202
#endif
203
}
204
205
static _PyExecutorObject *
206
get_executor_lock_held(PyCodeObject *code, int offset)
207
{
208
    int code_len = (int)Py_SIZE(code);
209
    for (int i = 0 ; i < code_len;) {
210
        if (_PyCode_CODE(code)[i].op.code == ENTER_EXECUTOR && i*2 == offset) {
211
            int oparg = _PyCode_CODE(code)[i].op.arg;
212
            _PyExecutorObject *res = code->co_executors->executors[oparg];
213
            Py_INCREF(res);
214
            return res;
215
        }
216
        i += _PyInstruction_GetLength(code, i);
217
    }
218
    PyErr_SetString(PyExc_ValueError, "no executor at given byte offset");
219
    return NULL;
220
}
221
222
_PyExecutorObject *
223
_Py_GetExecutor(PyCodeObject *code, int offset)
224
{
225
    _PyExecutorObject *executor;
226
    Py_BEGIN_CRITICAL_SECTION(code);
227
    executor = get_executor_lock_held(code, offset);
228
    Py_END_CRITICAL_SECTION();
229
    return executor;
230
}
231
232
static PyObject *
233
is_valid(PyObject *self, PyObject *Py_UNUSED(ignored))
234
{
235
    return PyBool_FromLong(((_PyExecutorObject *)self)->vm_data.valid);
236
}
237
238
static PyObject *
239
get_opcode(PyObject *self, PyObject *Py_UNUSED(ignored))
240
{
241
    return PyLong_FromUnsignedLong(((_PyExecutorObject *)self)->vm_data.opcode);
242
}
243
244
static PyObject *
245
get_oparg(PyObject *self, PyObject *Py_UNUSED(ignored))
246
{
247
    return PyLong_FromUnsignedLong(((_PyExecutorObject *)self)->vm_data.oparg);
248
}
249
250
///////////////////// Experimental UOp Optimizer /////////////////////
251
252
static int executor_clear(PyObject *executor);
253
254
void
255
_PyExecutor_Free(_PyExecutorObject *self)
256
{
257
#ifdef _Py_JIT
258
    _PyJIT_Free(self);
259
#endif
260
    PyObject_GC_Del(self);
261
}
262
263
static void executor_invalidate(PyObject *op);
264
265
static void
266
executor_clear_exits(_PyExecutorObject *executor)
267
{
268
    _PyExecutorObject *cold = _PyExecutor_GetColdExecutor();
269
    _PyExecutorObject *cold_dynamic = _PyExecutor_GetColdDynamicExecutor();
270
    for (uint32_t i = 0; i < executor->exit_count; i++) {
271
        _PyExitData *exit = &executor->exits[i];
272
        exit->temperature = initial_unreachable_backoff_counter();
273
        _PyExecutorObject *old = executor->exits[i].executor;
274
        exit->executor = exit->is_dynamic ? cold_dynamic : cold;
275
        Py_DECREF(old);
276
    }
277
}
278
279
280
void
281
_Py_ClearExecutorDeletionList(PyInterpreterState *interp)
282
{
283
    if (interp->executor_deletion_list_head == NULL) {
284
        return;
285
    }
286
    _PyRuntimeState *runtime = &_PyRuntime;
287
    HEAD_LOCK(runtime);
288
    PyThreadState* ts = PyInterpreterState_ThreadHead(interp);
289
    while (ts) {
290
        _PyExecutorObject *current = (_PyExecutorObject *)ts->current_executor;
291
        Py_XINCREF(current);
292
        ts = ts->next;
293
    }
294
    HEAD_UNLOCK(runtime);
295
    _PyExecutorObject *keep_list = NULL;
296
    do {
297
        _PyExecutorObject *exec = interp->executor_deletion_list_head;
298
        interp->executor_deletion_list_head = exec->vm_data.links.next;
299
        if (Py_REFCNT(exec) == 0) {
300
            _PyExecutor_Free(exec);
301
        } else {
302
            exec->vm_data.links.next = keep_list;
303
            keep_list = exec;
304
        }
305
    } while (interp->executor_deletion_list_head != NULL);
306
    interp->executor_deletion_list_head = keep_list;
307
    HEAD_LOCK(runtime);
308
    ts = PyInterpreterState_ThreadHead(interp);
309
    while (ts) {
310
        _PyExecutorObject *current = (_PyExecutorObject *)ts->current_executor;
311
        if (current != NULL) {
312
            Py_DECREF((PyObject *)current);
313
        }
314
        ts = ts->next;
315
    }
316
    HEAD_UNLOCK(runtime);
317
}
318
319
static void
320
add_to_pending_deletion_list(_PyExecutorObject *self)
321
{
322
    if (self->vm_data.pending_deletion) {
323
        return;
324
    }
325
    self->vm_data.pending_deletion = 1;
326
    PyInterpreterState *interp = PyInterpreterState_Get();
327
    self->vm_data.links.previous = NULL;
328
    self->vm_data.links.next = interp->executor_deletion_list_head;
329
    interp->executor_deletion_list_head = self;
330
}
331
332
static void
333
uop_dealloc(PyObject *op) {
334
    _PyExecutorObject *self = _PyExecutorObject_CAST(op);
335
    executor_invalidate(op);
336
    assert(self->vm_data.code == NULL);
337
    add_to_pending_deletion_list(self);
338
}
339
340
const char *
341
_PyUOpName(int index)
342
{
343
    if (index < 0 || index > MAX_UOP_REGS_ID) {
344
        return NULL;
345
    }
346
    return _PyOpcode_uop_name[index];
347
}
348
349
#ifdef Py_DEBUG
350
void
351
_PyUOpPrint(const _PyUOpInstruction *uop)
352
{
353
    const char *name = _PyUOpName(uop->opcode);
354
    if (name == NULL) {
355
        printf("<uop %d>", uop->opcode);
356
    }
357
    else {
358
        printf("%s", name);
359
    }
360
    switch(uop->format) {
361
        case UOP_FORMAT_TARGET:
362
            printf(" (%d, target=%d, operand0=%#" PRIx64 ", operand1=%#" PRIx64,
363
                uop->oparg,
364
                uop->target,
365
                (uint64_t)uop->operand0,
366
                (uint64_t)uop->operand1);
367
            break;
368
        case UOP_FORMAT_JUMP:
369
            printf(" (%d, jump_target=%d, operand0=%#" PRIx64 ", operand1=%#" PRIx64,
370
                uop->oparg,
371
                uop->jump_target,
372
                (uint64_t)uop->operand0,
373
                (uint64_t)uop->operand1);
374
            break;
375
        default:
376
            printf(" (%d, Unknown format)", uop->oparg);
377
    }
378
    if (_PyUop_Flags[_PyUop_Uncached[uop->opcode]] & HAS_ERROR_FLAG) {
379
        printf(", error_target=%d", uop->error_target);
380
    }
381
382
    printf(")");
383
}
384
#endif
385
386
static Py_ssize_t
387
uop_len(PyObject *op)
388
{
389
    _PyExecutorObject *self = _PyExecutorObject_CAST(op);
390
    return self->code_size;
391
}
392
393
static PyObject *
394
uop_item(PyObject *op, Py_ssize_t index)
395
{
396
    _PyExecutorObject *self = _PyExecutorObject_CAST(op);
397
    Py_ssize_t len = uop_len(op);
398
    if (index < 0 || index >= len) {
399
        PyErr_SetNone(PyExc_IndexError);
400
        return NULL;
401
    }
402
    int opcode = self->trace[index].opcode;
403
    int base_opcode = _PyUop_Uncached[opcode];
404
    const char *name = _PyUOpName(base_opcode);
405
    if (name == NULL) {
406
        name = "<nil>";
407
    }
408
    PyObject *oname = _PyUnicode_FromASCII(name, strlen(name));
409
    if (oname == NULL) {
410
        return NULL;
411
    }
412
    PyObject *oparg = PyLong_FromUnsignedLong(self->trace[index].oparg);
413
    if (oparg == NULL) {
414
        Py_DECREF(oname);
415
        return NULL;
416
    }
417
    PyObject *target = PyLong_FromUnsignedLong(self->trace[index].target);
418
    if (target == NULL) {
419
        Py_DECREF(oparg);
420
        Py_DECREF(oname);
421
        return NULL;
422
    }
423
    PyObject *operand = PyLong_FromUnsignedLongLong(self->trace[index].operand0);
424
    if (operand == NULL) {
425
        Py_DECREF(target);
426
        Py_DECREF(oparg);
427
        Py_DECREF(oname);
428
        return NULL;
429
    }
430
    PyObject *args[4] = { oname, oparg, target, operand };
431
    return _PyTuple_FromArraySteal(args, 4);
432
}
433
434
PySequenceMethods uop_as_sequence = {
435
    .sq_length = uop_len,
436
    .sq_item = uop_item,
437
};
438
439
static int
440
executor_traverse(PyObject *o, visitproc visit, void *arg)
441
{
442
    _PyExecutorObject *executor = _PyExecutorObject_CAST(o);
443
    for (uint32_t i = 0; i < executor->exit_count; i++) {
444
        Py_VISIT(executor->exits[i].executor);
445
    }
446
    return 0;
447
}
448
449
static PyObject *
450
get_jit_code(PyObject *self, PyObject *Py_UNUSED(ignored))
451
{
452
#ifndef _Py_JIT
453
    PyErr_SetString(PyExc_RuntimeError, "JIT support not enabled.");
454
    return NULL;
455
#else
456
    _PyExecutorObject *executor = _PyExecutorObject_CAST(self);
457
    if (executor->jit_code == NULL || executor->jit_size == 0) {
458
        Py_RETURN_NONE;
459
    }
460
    return PyBytes_FromStringAndSize(executor->jit_code, executor->jit_size);
461
#endif
462
}
463
464
static PyMethodDef uop_executor_methods[] = {
465
    { "is_valid", is_valid, METH_NOARGS, NULL },
466
    { "get_jit_code", get_jit_code, METH_NOARGS, NULL},
467
    { "get_opcode", get_opcode, METH_NOARGS, NULL },
468
    { "get_oparg", get_oparg, METH_NOARGS, NULL },
469
    { NULL, NULL },
470
};
471
472
static int
473
executor_is_gc(PyObject *o)
474
{
475
    return !_Py_IsImmortal(o);
476
}
477
478
PyTypeObject _PyUOpExecutor_Type = {
479
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
480
    .tp_name = "uop_executor",
481
    .tp_basicsize = offsetof(_PyExecutorObject, exits),
482
    .tp_itemsize = 1,
483
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC,
484
    .tp_dealloc = uop_dealloc,
485
    .tp_as_sequence = &uop_as_sequence,
486
    .tp_methods = uop_executor_methods,
487
    .tp_traverse = executor_traverse,
488
    .tp_clear = executor_clear,
489
    .tp_is_gc = executor_is_gc,
490
};
491
492
/* TO DO -- Generate these tables */
493
static const uint16_t
494
_PyUOp_Replacements[MAX_UOP_ID + 1] = {
495
    [_ITER_JUMP_RANGE] = _GUARD_NOT_EXHAUSTED_RANGE,
496
    [_ITER_JUMP_LIST] = _GUARD_NOT_EXHAUSTED_LIST,
497
    [_ITER_JUMP_TUPLE] = _GUARD_NOT_EXHAUSTED_TUPLE,
498
    [_FOR_ITER] = _FOR_ITER_TIER_TWO,
499
    [_ITER_NEXT_LIST] = _ITER_NEXT_LIST_TIER_TWO,
500
    [_CHECK_PERIODIC_AT_END] = _TIER2_RESUME_CHECK,
501
};
502
503
static const uint8_t
504
is_for_iter_test[MAX_UOP_ID + 1] = {
505
    [_GUARD_NOT_EXHAUSTED_RANGE] = 1,
506
    [_GUARD_NOT_EXHAUSTED_LIST] = 1,
507
    [_GUARD_NOT_EXHAUSTED_TUPLE] = 1,
508
    [_FOR_ITER_TIER_TWO] = 1,
509
};
510
511
static const uint16_t
512
BRANCH_TO_GUARD[4][2] = {
513
    [POP_JUMP_IF_FALSE - POP_JUMP_IF_FALSE][0] = _GUARD_IS_TRUE_POP,
514
    [POP_JUMP_IF_FALSE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_FALSE_POP,
515
    [POP_JUMP_IF_TRUE - POP_JUMP_IF_FALSE][0] = _GUARD_IS_FALSE_POP,
516
    [POP_JUMP_IF_TRUE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_TRUE_POP,
517
    [POP_JUMP_IF_NONE - POP_JUMP_IF_FALSE][0] = _GUARD_IS_NOT_NONE_POP,
518
    [POP_JUMP_IF_NONE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_NONE_POP,
519
    [POP_JUMP_IF_NOT_NONE - POP_JUMP_IF_FALSE][0] = _GUARD_IS_NONE_POP,
520
    [POP_JUMP_IF_NOT_NONE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_NOT_NONE_POP,
521
};
522
523
static const uint16_t
524
guard_ip_uop[MAX_UOP_ID + 1] = {
525
    [_PUSH_FRAME] = _GUARD_IP__PUSH_FRAME,
526
    [_RETURN_GENERATOR] = _GUARD_IP_RETURN_GENERATOR,
527
    [_RETURN_VALUE] = _GUARD_IP_RETURN_VALUE,
528
    [_YIELD_VALUE] = _GUARD_IP_YIELD_VALUE,
529
};
530
531
static const uint16_t
532
guard_code_version_uop[MAX_UOP_ID + 1] = {
533
    [_PUSH_FRAME] = _GUARD_CODE_VERSION__PUSH_FRAME,
534
    [_RETURN_GENERATOR] = _GUARD_CODE_VERSION_RETURN_GENERATOR,
535
    [_RETURN_VALUE] = _GUARD_CODE_VERSION_RETURN_VALUE,
536
    [_YIELD_VALUE] = _GUARD_CODE_VERSION_YIELD_VALUE,
537
};
538
539
static const uint16_t
540
dynamic_exit_uop[MAX_UOP_ID + 1] = {
541
    [_GUARD_IP__PUSH_FRAME] = 1,
542
    [_GUARD_IP_RETURN_GENERATOR] = 1,
543
    [_GUARD_IP_RETURN_VALUE] = 1,
544
    [_GUARD_IP_YIELD_VALUE] = 1,
545
    [_GUARD_CODE_VERSION__PUSH_FRAME] = 1,
546
    [_GUARD_CODE_VERSION_RETURN_GENERATOR] = 1,
547
    [_GUARD_CODE_VERSION_RETURN_VALUE] = 1,
548
    [_GUARD_CODE_VERSION_YIELD_VALUE] = 1,
549
};
550
551
552
#define CONFIDENCE_RANGE 1000
553
#define CONFIDENCE_CUTOFF 333
554
555
#ifdef Py_DEBUG
556
#define DPRINTF(level, ...) \
557
    if (lltrace >= (level)) { printf(__VA_ARGS__); }
558
#else
559
#define DPRINTF(level, ...)
560
#endif
561
562
563
static inline void
564
add_to_trace(
565
    _PyJitUopBuffer *trace,
566
    uint16_t opcode,
567
    uint16_t oparg,
568
    uint64_t operand,
569
    uint32_t target)
570
{
571
    _PyUOpInstruction *inst = trace->next;
572
    inst->opcode = opcode;
573
    inst->format = UOP_FORMAT_TARGET;
574
    inst->target = target;
575
    inst->oparg = oparg;
576
    inst->operand0 = operand;
577
#ifdef Py_STATS
578
    inst->execution_count = 0;
579
#endif
580
    trace->next++;
581
}
582
583
584
#ifdef Py_DEBUG
585
#define ADD_TO_TRACE(OPCODE, OPARG, OPERAND, TARGET) \
586
    add_to_trace(trace, (OPCODE), (OPARG), (OPERAND), (TARGET)); \
587
    if (lltrace >= 2) { \
588
        printf("%4d ADD_TO_TRACE: ", uop_buffer_length(trace)); \
589
        _PyUOpPrint(uop_buffer_last(trace)); \
590
        printf("\n"); \
591
    }
592
#else
593
#define ADD_TO_TRACE(OPCODE, OPARG, OPERAND, TARGET) \
594
    add_to_trace(trace, (OPCODE), (OPARG), (OPERAND), (TARGET))
595
#endif
596
597
#define INSTR_IP(INSTR, CODE) \
598
    ((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive)))
599
600
601
static int
602
is_terminator(const _PyUOpInstruction *uop)
603
{
604
    int opcode = _PyUop_Uncached[uop->opcode];
605
    return (
606
        opcode == _EXIT_TRACE ||
607
        opcode == _DEOPT ||
608
        opcode == _JUMP_TO_TOP ||
609
        opcode == _DYNAMIC_EXIT
610
    );
611
}
612
613
/* Returns 1 on success (added to trace), 0 on trace end.
614
 */
615
// gh-142543: inlining this function causes stack overflows
616
Py_NO_INLINE int
617
_PyJit_translate_single_bytecode_to_trace(
618
    PyThreadState *tstate,
619
    _PyInterpreterFrame *frame,
620
    _Py_CODEUNIT *next_instr,
621
    int stop_tracing_opcode)
622
{
623
624
#ifdef Py_DEBUG
625
    char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
626
    int lltrace = 0;
627
    if (python_lltrace != NULL && *python_lltrace >= '0') {
628
        lltrace = *python_lltrace - '0';  // TODO: Parse an int and all that
629
    }
630
#endif
631
    _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
632
    _PyJitTracerState *tracer = _tstate->jit_tracer_state;
633
    PyCodeObject *old_code = tracer->prev_state.instr_code;
634
    bool progress_needed = (tracer->initial_state.chain_depth % MAX_CHAIN_DEPTH) == 0;
635
    _PyJitUopBuffer *trace = &tracer->code_buffer;
636
637
    _Py_CODEUNIT *this_instr =  tracer->prev_state.instr;
638
    _Py_CODEUNIT *target_instr = this_instr;
639
    uint32_t target = 0;
640
641
    target = Py_IsNone((PyObject *)old_code)
642
        ? (uint32_t)(target_instr - _Py_INTERPRETER_TRAMPOLINE_INSTRUCTIONS_PTR)
643
        : INSTR_IP(target_instr, old_code);
644
645
    // Rewind EXTENDED_ARG so that we see the whole thing.
646
    // We must point to the first EXTENDED_ARG when deopting.
647
    int oparg = tracer->prev_state.instr_oparg;
648
    int opcode = this_instr->op.code;
649
    int rewind_oparg = oparg;
650
    while (rewind_oparg > 255) {
651
        rewind_oparg >>= 8;
652
        target--;
653
    }
654
655
    if (opcode == ENTER_EXECUTOR) {
656
        _PyExecutorObject *executor = old_code->co_executors->executors[oparg & 255];
657
        opcode = executor->vm_data.opcode;
658
        oparg = (oparg & ~255) | executor->vm_data.oparg;
659
    }
660
661
    if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]] > 0) {
662
        uint16_t backoff = (this_instr + 1)->counter.value_and_backoff;
663
        // adaptive_counter_cooldown is a fresh specialization.
664
        // trigger_backoff_counter is what we set during tracing.
665
        // All tracing backoffs should be freshly specialized or untouched.
666
        // If not, that indicates a deopt during tracing, and
667
        // thus the "actual" instruction executed is not the one that is
668
        // in the instruction stream, but rather the deopt.
669
        // It's important we check for this, as some specializations might make
670
        // no progress (they can immediately deopt after specializing).
671
        // We do this to improve performance, as otherwise a compiled trace
672
        // will just deopt immediately.
673
        if (backoff != adaptive_counter_cooldown().value_and_backoff &&
674
            backoff != trigger_backoff_counter().value_and_backoff) {
675
            OPT_STAT_INC(trace_immediately_deopts);
676
            opcode = _PyOpcode_Deopt[opcode];
677
        }
678
    }
679
680
    // Strange control-flow
681
    bool has_dynamic_jump_taken = OPCODE_HAS_UNPREDICTABLE_JUMP(opcode) &&
682
        (next_instr != this_instr + 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]]);
683
684
    /* Special case the first instruction,
685
    * so that we can guarantee forward progress */
686
    if (progress_needed && uop_buffer_length(&tracer->code_buffer) < CODE_SIZE_NO_PROGRESS) {
687
        if (OPCODE_HAS_EXIT(opcode) || OPCODE_HAS_DEOPT(opcode)) {
688
            opcode = _PyOpcode_Deopt[opcode];
689
        }
690
        assert(!OPCODE_HAS_EXIT(opcode));
691
        assert(!OPCODE_HAS_DEOPT(opcode));
692
    }
693
694
    bool needs_guard_ip = OPCODE_HAS_NEEDS_GUARD_IP(opcode);
695
    if (has_dynamic_jump_taken && !needs_guard_ip) {
696
        DPRINTF(2, "Unsupported: dynamic jump taken %s\n", _PyOpcode_OpName[opcode]);
697
        goto unsupported;
698
    }
699
700
    int is_sys_tracing = (tstate->c_tracefunc != NULL) || (tstate->c_profilefunc != NULL);
701
    if (is_sys_tracing) {
702
        goto done;
703
    }
704
705
    if (stop_tracing_opcode == _DEOPT) {
706
        // gh-143183: It's important we rewind to the last known proper target.
707
        // The current target might be garbage as stop tracing usually indicates
708
        // we are in something that we can't trace.
709
        DPRINTF(2, "Told to stop tracing\n");
710
        goto unsupported;
711
    }
712
    else if (stop_tracing_opcode != 0) {
713
        assert(stop_tracing_opcode == _EXIT_TRACE);
714
        ADD_TO_TRACE(stop_tracing_opcode, 0, 0, target);
715
        goto done;
716
    }
717
718
    DPRINTF(2, "%p %d: %s(%d) %d\n", old_code, target, _PyOpcode_OpName[opcode], oparg, needs_guard_ip);
719
720
#ifdef Py_DEBUG
721
    if (oparg > 255) {
722
        assert(_Py_GetBaseCodeUnit(old_code, target).op.code == EXTENDED_ARG);
723
    }
724
#endif
725
726
    // This happens when a recursive call happens that we can't trace. Such as Python -> C -> Python calls
727
    // If we haven't guarded the IP, then it's untraceable.
728
    if (frame != tracer->prev_state.instr_frame && !needs_guard_ip) {
729
        DPRINTF(2, "Unsupported: unguardable jump taken\n");
730
        goto unsupported;
731
    }
732
733
    if (oparg > 0xFFFF) {
734
        DPRINTF(2, "Unsupported: oparg too large\n");
735
        unsupported:
736
        {
737
            // Rewind to previous instruction and replace with _EXIT_TRACE.
738
            _PyUOpInstruction *curr = uop_buffer_last(trace);
739
            while (curr->opcode != _SET_IP && uop_buffer_length(trace) > 2) {
740
                trace->next--;
741
                curr = uop_buffer_last(trace);
742
            }
743
            assert(curr->opcode == _SET_IP || uop_buffer_length(trace) == 2);
744
            if (curr->opcode == _SET_IP) {
745
                int32_t old_target = (int32_t)uop_get_target(curr);
746
                curr->opcode = _DEOPT;
747
                curr->format = UOP_FORMAT_TARGET;
748
                curr->target = old_target;
749
            }
750
            goto done;
751
        }
752
    }
753
754
    if (opcode == NOP) {
755
        return 1;
756
    }
757
758
    if (opcode == JUMP_FORWARD) {
759
        return 1;
760
    }
761
762
    if (opcode == EXTENDED_ARG) {
763
        return 1;
764
    }
765
766
    // One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT
767
    trace->end -= 2;
768
769
    const struct opcode_macro_expansion *expansion = &_PyOpcode_macro_expansion[opcode];
770
771
    assert(opcode != ENTER_EXECUTOR && opcode != EXTENDED_ARG);
772
    assert(!_PyErr_Occurred(tstate));
773
774
775
    if (OPCODE_HAS_EXIT(opcode)) {
776
        // Make space for side exit
777
        trace->end--;
778
    }
779
    if (OPCODE_HAS_ERROR(opcode)) {
780
        // Make space for error stub
781
        trace->end--;
782
    }
783
    if (OPCODE_HAS_DEOPT(opcode)) {
784
        // Make space for side exit
785
        trace->end--;
786
    }
787
788
    // _GUARD_IP leads to an exit.
789
    trace->end -= needs_guard_ip;
790
791
    int space_needed = expansion->nuops + needs_guard_ip + 2 + (!OPCODE_HAS_NO_SAVE_IP(opcode));
792
    if (uop_buffer_remaining_space(trace) < space_needed) {
793
        DPRINTF(2, "No room for expansions and guards (need %d, got %d)\n",
794
                space_needed, uop_buffer_remaining_space(trace));
795
        OPT_STAT_INC(trace_too_long);
796
        goto done;
797
    }
798
799
    ADD_TO_TRACE(_CHECK_VALIDITY, 0, 0, target);
800
801
    if (!OPCODE_HAS_NO_SAVE_IP(opcode)) {
802
        ADD_TO_TRACE(_SET_IP, 0, (uintptr_t)target_instr, target);
803
    }
804
805
    switch (opcode) {
806
        case POP_JUMP_IF_NONE:
807
        case POP_JUMP_IF_NOT_NONE:
808
        case POP_JUMP_IF_FALSE:
809
        case POP_JUMP_IF_TRUE:
810
        {
811
            _Py_CODEUNIT *computed_next_instr_without_modifiers = target_instr + 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]];
812
            _Py_CODEUNIT *computed_next_instr = computed_next_instr_without_modifiers + (computed_next_instr_without_modifiers->op.code == NOT_TAKEN);
813
            _Py_CODEUNIT *computed_jump_instr = computed_next_instr_without_modifiers + oparg;
814
            assert(next_instr == computed_next_instr || next_instr == computed_jump_instr);
815
            int jump_happened = target_instr[1].cache & 1;
816
            assert(jump_happened ? (next_instr == computed_jump_instr) : (next_instr == computed_next_instr));
817
            uint32_t uopcode = BRANCH_TO_GUARD[opcode - POP_JUMP_IF_FALSE][jump_happened];
818
            ADD_TO_TRACE(uopcode, 0, 0, INSTR_IP(jump_happened ? computed_next_instr : computed_jump_instr, old_code));
819
            break;
820
        }
821
        case JUMP_BACKWARD_JIT:
822
            // This is possible as the JIT might have re-activated after it was disabled
823
        case JUMP_BACKWARD_NO_JIT:
824
        case JUMP_BACKWARD:
825
            ADD_TO_TRACE(_CHECK_PERIODIC, 0, 0, target);
826
            _Py_FALLTHROUGH;
827
        case JUMP_BACKWARD_NO_INTERRUPT:
828
        {
829
            if ((next_instr != tracer->initial_state.close_loop_instr) &&
830
                (next_instr != tracer->initial_state.start_instr) &&
831
                uop_buffer_length(&tracer->code_buffer) > CODE_SIZE_NO_PROGRESS &&
832
                // For side exits, we don't want to terminate them early.
833
                tracer->initial_state.exit == NULL &&
834
                // These are coroutines, and we want to unroll those usually.
835
                opcode != JUMP_BACKWARD_NO_INTERRUPT) {
836
                // We encountered a JUMP_BACKWARD but not to the top of our own loop.
837
                // We don't want to continue tracing as we might get stuck in the
838
                // inner loop. Instead, end the trace where the executor of the
839
                // inner loop might start and let the traces rejoin.
840
                OPT_STAT_INC(inner_loop);
841
                ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
842
                uop_buffer_last(trace)->operand1 = true; // is_control_flow
843
                DPRINTF(2, "JUMP_BACKWARD not to top ends trace %p %p %p\n", next_instr,
844
                    tracer->initial_state.close_loop_instr, tracer->initial_state.start_instr);
845
                goto done;
846
            }
847
            break;
848
        }
849
850
        case RESUME:
851
        case RESUME_CHECK:
852
        case RESUME_CHECK_JIT:
853
            /* Use a special tier 2 version of RESUME_CHECK to allow traces to
854
             *  start with RESUME_CHECK */
855
            ADD_TO_TRACE(_TIER2_RESUME_CHECK, 0, 0, target);
856
            break;
857
        default:
858
        {
859
            const struct opcode_macro_expansion *expansion = &_PyOpcode_macro_expansion[opcode];
860
            // Reserve space for nuops (+ _SET_IP + _EXIT_TRACE)
861
            int nuops = expansion->nuops;
862
            if (nuops == 0) {
863
                DPRINTF(2, "Unsupported opcode %s\n", _PyOpcode_OpName[opcode]);
864
                goto unsupported;
865
            }
866
            assert(nuops > 0);
867
            uint32_t orig_oparg = oparg;  // For OPARG_TOP/BOTTOM
868
            uint32_t orig_target = target;
869
            for (int i = 0; i < nuops; i++) {
870
                oparg = orig_oparg;
871
                target = orig_target;
872
                uint32_t uop = expansion->uops[i].uop;
873
                uint64_t operand = 0;
874
                // Add one to account for the actual opcode/oparg pair:
875
                int offset = expansion->uops[i].offset + 1;
876
                switch (expansion->uops[i].size) {
877
                    case OPARG_SIMPLE:
878
                        assert(opcode != _JUMP_BACKWARD_NO_INTERRUPT && opcode != JUMP_BACKWARD);
879
                        break;
880
                    case OPARG_CACHE_1:
881
                        operand = read_u16(&this_instr[offset].cache);
882
                        break;
883
                    case OPARG_CACHE_2:
884
                        operand = read_u32(&this_instr[offset].cache);
885
                        break;
886
                    case OPARG_CACHE_4:
887
                        operand = read_u64(&this_instr[offset].cache);
888
                        break;
889
                    case OPARG_TOP:  // First half of super-instr
890
                        assert(orig_oparg <= 255);
891
                        oparg = orig_oparg >> 4;
892
                        break;
893
                    case OPARG_BOTTOM:  // Second half of super-instr
894
                        assert(orig_oparg <= 255);
895
                        oparg = orig_oparg & 0xF;
896
                        break;
897
                    case OPARG_SAVE_RETURN_OFFSET:  // op=_SAVE_RETURN_OFFSET; oparg=return_offset
898
                        oparg = offset;
899
                        assert(uop == _SAVE_RETURN_OFFSET);
900
                        break;
901
                    case OPARG_REPLACED:
902
                        uop = _PyUOp_Replacements[uop];
903
                        assert(uop != 0);
904
905
                        uint32_t next_inst = target + 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]];
906
                        if (uop == _TIER2_RESUME_CHECK) {
907
                            target = next_inst;
908
                        }
909
                        else {
910
                            int extended_arg = orig_oparg > 255;
911
                            uint32_t jump_target = next_inst + orig_oparg + extended_arg;
912
                            assert(_Py_GetBaseCodeUnit(old_code, jump_target).op.code == END_FOR);
913
                            assert(_Py_GetBaseCodeUnit(old_code, jump_target+1).op.code == POP_ITER);
914
                            if (is_for_iter_test[uop]) {
915
                                target = jump_target + 1;
916
                            }
917
                        }
918
                        break;
919
                    case OPERAND1_1:
920
                        assert(uop_buffer_last(trace)->opcode == uop);
921
                        operand = read_u16(&this_instr[offset].cache);
922
                        uop_buffer_last(trace)->operand1 = operand;
923
                        continue;
924
                    case OPERAND1_2:
925
                        assert(uop_buffer_last(trace)->opcode == uop);
926
                        operand = read_u32(&this_instr[offset].cache);
927
                        uop_buffer_last(trace)->operand1 = operand;
928
                        continue;
929
                    case OPERAND1_4:
930
                        assert(uop_buffer_last(trace)->opcode == uop);
931
                        operand = read_u64(&this_instr[offset].cache);
932
                        uop_buffer_last(trace)->operand1 = operand;
933
                        continue;
934
                    default:
935
                        fprintf(stderr,
936
                                "opcode=%d, oparg=%d; nuops=%d, i=%d; size=%d, offset=%d\n",
937
                                opcode, oparg, nuops, i,
938
                                expansion->uops[i].size,
939
                                expansion->uops[i].offset);
940
                        Py_FatalError("garbled expansion");
941
                }
942
                if (uop == _BINARY_OP_INPLACE_ADD_UNICODE) {
943
                    assert(i + 1 == nuops);
944
                    _Py_CODEUNIT *next = target_instr + 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]];
945
                    assert(next->op.code == STORE_FAST);
946
                    operand = next->op.arg;
947
                }
948
                else if (_PyUop_Flags[uop] & HAS_RECORDS_VALUE_FLAG) {
949
                    PyObject *recorded_value = tracer->prev_state.recorded_value;
950
                    tracer->prev_state.recorded_value = NULL;
951
                    operand = (uintptr_t)recorded_value;
952
                }
953
                // All other instructions
954
                ADD_TO_TRACE(uop, oparg, operand, target);
955
            }
956
            break;
957
        }  // End default
958
959
    }  // End switch (opcode)
960
961
    if (needs_guard_ip) {
962
        int last_opcode = uop_buffer_last(trace)->opcode;
963
        uint16_t guard_ip = guard_ip_uop[last_opcode];
964
        if (guard_ip == 0) {
965
            DPRINTF(1, "Unknown uop needing guard ip %s\n", _PyOpcode_uop_name[last_opcode]);
966
            Py_UNREACHABLE();
967
        }
968
        PyObject *code = PyStackRef_AsPyObjectBorrow(frame->f_executable);
969
        Py_INCREF(code);
970
        ADD_TO_TRACE(_RECORD_CODE, 0, (uintptr_t)code, 0);
971
        ADD_TO_TRACE(guard_ip, 0, (uintptr_t)next_instr, 0);
972
        if (PyCode_Check(code)) {
973
            /* Record stack depth, in operand1 */
974
            int stack_depth = (int)(frame->stackpointer - _PyFrame_Stackbase(frame));
975
            uop_buffer_last(trace)->operand1 = stack_depth;
976
            ADD_TO_TRACE(guard_code_version_uop[last_opcode], 0, ((PyCodeObject *)code)->co_version, 0);
977
        }
978
    }
979
    // Loop back to the start
980
    int is_first_instr = tracer->initial_state.close_loop_instr == next_instr ||
981
        tracer->initial_state.start_instr == next_instr;
982
    if (is_first_instr && uop_buffer_length(trace) > CODE_SIZE_NO_PROGRESS) {
983
        if (needs_guard_ip) {
984
            ADD_TO_TRACE(_SET_IP, 0, (uintptr_t)next_instr, 0);
985
        }
986
        ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0);
987
        goto done;
988
    }
989
    DPRINTF(2, "Trace continuing\n");
990
    return 1;
991
done:
992
    DPRINTF(2, "Trace done\n");
993
    if (!is_terminator(uop_buffer_last(trace))) {
994
        ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
995
        uop_buffer_last(trace)->operand1 = true; // is_control_flow
996
    }
997
    return 0;
998
}
999
1000
// Returns 0 for do not enter tracing, 1 on enter tracing.
// gh-142543: inlining this function causes stack overflows
Py_NO_INLINE int
_PyJit_TryInitializeTracing(
    PyThreadState *tstate, _PyInterpreterFrame *frame, _Py_CODEUNIT *curr_instr,
    _Py_CODEUNIT *start_instr, _Py_CODEUNIT *close_loop_instr, _PyStackRef *stack_pointer, int chain_depth,
    _PyExitData *exit, int oparg, _PyExecutorObject *current_executor)
{
    /* Set up the per-thread tracer so the interpreter starts recording a
     * tier-2 trace at `start_instr`.  Allocates the tracer state lazily on
     * first use.  Refuses (returns 0) when: allocation fails, a trace is
     * already being recorded, the oparg is too large to encode, or the
     * frame is not executing a plain Python function. */
    _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
    if (_tstate->jit_tracer_state == NULL) {
        _tstate->jit_tracer_state = (_PyJitTracerState *)_PyObject_VirtualAlloc(sizeof(_PyJitTracerState));
        if (_tstate->jit_tracer_state == NULL) {
            // Don't error, just go to next instruction.
            return 0;
        }
        _tstate->jit_tracer_state->is_tracing = false;
    }
    _PyJitTracerState *tracer = _tstate->jit_tracer_state;
    // A recursive trace.
    if (tracer->is_tracing) {
        return 0;
    }
    // Opargs above 16 bits cannot be represented in a uop (same limit is
    // enforced while recording).
    if (oparg > 0xFFFF) {
        return 0;
    }
    PyObject *func = PyStackRef_AsPyObjectBorrow(frame->f_funcobj);
    if (func == NULL || !PyFunction_Check(func)) {
        return 0;
    }
    PyCodeObject *code = _PyFrame_GetCode(frame);
#ifdef Py_DEBUG
    char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
    int lltrace = 0;
    if (python_lltrace != NULL && *python_lltrace >= '0') {
        lltrace = *python_lltrace - '0';  // TODO: Parse an int and all that
    }
    DPRINTF(2,
        "Tracing %s (%s:%d) at byte offset %d at chain depth %d\n",
        PyUnicode_AsUTF8(code->co_qualname),
        PyUnicode_AsUTF8(code->co_filename),
        code->co_firstlineno,
        2 * INSTR_IP(close_loop_instr, code),
        chain_depth);
#endif
    /* Set up tracing buffer*/
    _PyJitUopBuffer *trace = &tracer->code_buffer;
    uop_buffer_init(trace, &tracer->uop_array[0], UOP_MAX_TRACE_LENGTH);
    // Every trace begins with _START_EXECUTOR and _MAKE_WARM
    // (see CODE_SIZE_EMPTY).
    ADD_TO_TRACE(_START_EXECUTOR, 0, (uintptr_t)start_instr, INSTR_IP(start_instr, code));
    ADD_TO_TRACE(_MAKE_WARM, 0, 0, 0);

    // Take strong references to everything the trace may point at; they
    // are released in _PyJit_FinalizeTracing.
    tracer->initial_state.start_instr = start_instr;
    tracer->initial_state.close_loop_instr = close_loop_instr;
    tracer->initial_state.code = (PyCodeObject *)Py_NewRef(code);
    tracer->initial_state.func = (PyFunctionObject *)Py_NewRef(func);
    tracer->initial_state.executor = (_PyExecutorObject *)Py_XNewRef(current_executor);
    tracer->initial_state.exit = exit;
    tracer->initial_state.stack_depth = (int)(stack_pointer - _PyFrame_Stackbase(frame));
    tracer->initial_state.chain_depth = chain_depth;
    tracer->prev_state.instr_code = (PyCodeObject *)Py_NewRef(_PyFrame_GetCode(frame));
    tracer->prev_state.instr = curr_instr;
    tracer->prev_state.instr_frame = frame;
    tracer->prev_state.instr_oparg = oparg;
    tracer->prev_state.instr_stacklevel = tracer->initial_state.stack_depth;
    tracer->prev_state.recorded_value = NULL;
    // If the starting opcode records a runtime value, capture it now so the
    // first traced instruction sees it in prev_state.recorded_value.
    uint8_t record_func_index = _PyOpcode_RecordFunctionIndices[curr_instr->op.code];
    if (record_func_index) {
        _Py_RecordFuncPtr record_func = _PyOpcode_RecordFunctions[record_func_index];
        record_func(frame, stack_pointer, oparg, &tracer->prev_state.recorded_value);
    }
    assert(curr_instr->op.code == JUMP_BACKWARD_JIT || curr_instr->op.code == RESUME_CHECK_JIT || (exit != NULL));
    tracer->initial_state.jump_backward_instr = curr_instr;

    tracer->is_tracing = true;
    return 1;
}
1075
1076
Py_NO_INLINE void
_PyJit_FinalizeTracing(PyThreadState *tstate, int err)
{
    /* Tear down the tracer after a trace attempt.
     * err <= 0: the trace was abandoned; restart the relevant backoff
     * counter so we retry later.  err > 0: an executor was produced; reset
     * the counter to its initial value.  In all cases, release every
     * reference taken while recording and reset the uop buffer. */
    _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
    _PyJitTracerState *tracer = _tstate->jit_tracer_state;
    // Deal with backoffs
    assert(tracer != NULL);
    _PyExitData *exit = tracer->initial_state.exit;
    if (exit == NULL) {
        // The trace started from bytecode (not a side exit): the backoff
        // counter lives in the instruction's inline cache at [1].
        // We hold a strong reference to the code object, so the instruction won't be freed.
        if (err <= 0) {
            _Py_BackoffCounter counter = tracer->initial_state.jump_backward_instr[1].counter;
            tracer->initial_state.jump_backward_instr[1].counter = restart_backoff_counter(counter);
        }
        else {
            // JUMP_BACKWARD_JIT and RESUME_CHECK_JIT use different initial
            // backoff schedules.
            if (tracer->initial_state.jump_backward_instr[0].op.code == JUMP_BACKWARD_JIT) {
                tracer->initial_state.jump_backward_instr[1].counter = initial_jump_backoff_counter(&tstate->interp->opt_config);
            }
            else {
                tracer->initial_state.jump_backward_instr[1].counter = initial_resume_backoff_counter(&tstate->interp->opt_config);
            }
        }
    }
    else if (tracer->initial_state.executor->vm_data.valid) {
        // Likewise, we hold a strong reference to the executor containing this exit, so the exit is guaranteed
        // to be valid to access.
        if (err <= 0) {
            exit->temperature = restart_backoff_counter(exit->temperature);
        }
        else {
            exit->temperature = initial_temperature_backoff_counter(&tstate->interp->opt_config);
        }
    }
    // Clear all recorded values
    // (uops flagged HAS_RECORDS_VALUE_FLAG hold a strong reference in
    // operand0 — see the HAS_RECORDS_VALUE_FLAG branch while recording).
    _PyJitUopBuffer *buffer = &tracer->code_buffer;
    for (_PyUOpInstruction *inst = buffer->start; inst < buffer->next; inst++) {
        if (_PyUop_Flags[inst->opcode] & HAS_RECORDS_VALUE_FLAG) {
            Py_XDECREF((PyObject *)(uintptr_t)inst->operand0);
        }
    }
    // Drop the strong references taken by _PyJit_TryInitializeTracing.
    Py_CLEAR(tracer->initial_state.code);
    Py_CLEAR(tracer->initial_state.func);
    Py_CLEAR(tracer->initial_state.executor);
    Py_CLEAR(tracer->prev_state.instr_code);
    Py_CLEAR(tracer->prev_state.recorded_value);
    // Reset the buffer so the next trace attempt starts clean.
    uop_buffer_init(buffer, &tracer->uop_array[0], UOP_MAX_TRACE_LENGTH);
    tracer->is_tracing = false;
}
1124
1125
bool
1126
_PyJit_EnterExecutorShouldStopTracing(int og_opcode)
1127
{
1128
    // Continue tracing (skip over the executor). If it's a RESUME
1129
    // trace to form longer, more optimizeable traces.
1130
    // We want to trace over RESUME traces. Otherwise, functions with lots of RESUME
1131
    // end up with many fragmented traces which perform badly.
1132
    // See for example, the richards benchmark in pyperformance.
1133
    // For consideration: We may want to consider tracing over side traces
1134
    // inserted into bytecode as well in the future.
1135
    return og_opcode == RESUME_CHECK_JIT;
1136
}
1137
1138
void
1139
_PyJit_TracerFree(_PyThreadStateImpl *_tstate)
1140
{
1141
    if (_tstate->jit_tracer_state != NULL) {
1142
        _PyObject_VirtualFree(_tstate->jit_tracer_state, sizeof(_PyJitTracerState));
1143
        _tstate->jit_tracer_state = NULL;
1144
    }
1145
}
1146
1147
#undef RESERVE
1148
#undef INSTR_IP
1149
#undef ADD_TO_TRACE
1150
#undef DPRINTF
1151
1152
#define UNSET_BIT(array, bit) (array[(bit)>>5] &= ~(1<<((bit)&31)))
1153
#define SET_BIT(array, bit) (array[(bit)>>5] |= (1<<((bit)&31)))
1154
#define BIT_IS_SET(array, bit) (array[(bit)>>5] & (1<<((bit)&31)))
1155
1156
/* Count the number of unused uops and exits
1157
*/
1158
static int
1159
count_exits(_PyUOpInstruction *buffer, int length)
1160
{
1161
    int exit_count = 0;
1162
    for (int i = 0; i < length; i++) {
1163
        uint16_t base_opcode = _PyUop_Uncached[buffer[i].opcode];
1164
        if (base_opcode == _EXIT_TRACE || base_opcode == _DYNAMIC_EXIT) {
1165
            exit_count++;
1166
        }
1167
    }
1168
    return exit_count;
1169
}
1170
1171
/* The number of cached registers at any exit (`EXIT_IF` or `DEOPT_IF`)
1172
 * This is the number of cached at entries at start, unless the uop is
1173
 * marked as `exit_depth_is_output` in which case it is the number of
1174
 * cached entries at the end */
1175
static int
1176
get_cached_entries_for_side_exit(_PyUOpInstruction *inst)
1177
{
1178
    // Maybe add another generated table for this?
1179
    int base_opcode = _PyUop_Uncached[inst->opcode];
1180
    assert(base_opcode != 0);
1181
    for (int i = 0; i <= MAX_CACHED_REGISTER; i++) {
1182
        const _PyUopTOSentry *entry = &_PyUop_Caching[base_opcode].entries[i];
1183
        if (entry->opcode == inst->opcode) {
1184
            return entry->exit;
1185
        }
1186
    }
1187
    Py_UNREACHABLE();
1188
}
1189
1190
/* Initialize `inst` as an exit/deopt/error stub of kind `opcode`
 * (must be a cached-register variant) targeting bytecode offset `target`. */
static void make_exit(_PyUOpInstruction *inst, int opcode, int target, bool is_control_flow)
{
    assert(opcode > MAX_UOP_ID && opcode <= MAX_UOP_REGS_ID);
    inst->format = UOP_FORMAT_TARGET;
    inst->opcode = opcode;
    inst->target = target;
    inst->oparg = 0;
    inst->operand0 = 0;
    inst->operand1 = is_control_flow;
#ifdef Py_STATS
    inst->execution_count = 0;
#endif
}
1203
1204
/* Convert implicit exits, errors and deopts
 * into explicit ones. */
static int
prepare_for_execution(_PyUOpInstruction *buffer, int length)
{
    /* Two passes over the buffer:
     *  1. Compact it by removing _NOPs.
     *  2. For every uop that can exit/deopt/error, append an exit or error
     *     stub after the trace and point the uop at it via UOP_FORMAT_JUMP.
     *     Consecutive uops that share the same target (and stub kind)
     *     share one stub.
     * Returns the new total length (trace plus stubs). */
    int32_t current_jump = -1;
    int32_t current_jump_target = -1;
    int32_t current_error = -1;
    int32_t current_error_target = -1;
    int32_t current_popped = -1;
    int32_t current_exit_op = -1;
    /* Leaving in NOPs slows down the interpreter and messes up the stats */
    _PyUOpInstruction *copy_to = &buffer[0];
    for (int i = 0; i < length; i++) {
        _PyUOpInstruction *inst = &buffer[i];
        if (inst->opcode != _NOP) {
            if (copy_to != inst) {
                *copy_to = *inst;
            }
            copy_to++;
        }
    }
    length = (int)(copy_to - buffer);
    int next_spare = length;  // index where the next stub is appended
    for (int i = 0; i < length; i++) {
        _PyUOpInstruction *inst = &buffer[i];
        int base_opcode = _PyUop_Uncached[inst->opcode];
        assert(inst->opcode != _NOP);
        int32_t target = (int32_t)uop_get_target(inst);
        uint16_t exit_flags = _PyUop_Flags[base_opcode] & (HAS_EXIT_FLAG | HAS_DEOPT_FLAG | HAS_PERIODIC_FLAG);
        if (exit_flags) {
            // Pick the base stub kind for this uop's side exit.
            uint16_t base_exit_op = _EXIT_TRACE;
            if (exit_flags & HAS_DEOPT_FLAG) {
                base_exit_op = _DEOPT;
            }
            else if (exit_flags & HAS_PERIODIC_FLAG) {
                base_exit_op = _HANDLE_PENDING_AND_DEOPT;
            }
            int32_t jump_target = target;
            if (dynamic_exit_uop[base_opcode]) {
                base_exit_op = _DYNAMIC_EXIT;
            }
            // Select the stub variant matching the number of cached
            // registers live at this uop's exit point.
            int exit_depth = get_cached_entries_for_side_exit(inst);
            assert(_PyUop_Caching[base_exit_op].entries[exit_depth].opcode > 0);
            int16_t exit_op = _PyUop_Caching[base_exit_op].entries[exit_depth].opcode;
            bool is_control_flow = (base_opcode == _GUARD_IS_FALSE_POP || base_opcode == _GUARD_IS_TRUE_POP || is_for_iter_test[base_opcode]);
            // Reuse the previous stub only if both target and kind match.
            if (jump_target != current_jump_target || current_exit_op != exit_op) {
                make_exit(&buffer[next_spare], exit_op, jump_target, is_control_flow);
                current_exit_op = exit_op;
                current_jump_target = jump_target;
                current_jump = next_spare;
                next_spare++;
            }
            buffer[i].jump_target = current_jump;
            buffer[i].format = UOP_FORMAT_JUMP;
        }
        if (_PyUop_Flags[base_opcode] & HAS_ERROR_FLAG) {
            int popped = (_PyUop_Flags[base_opcode] & HAS_ERROR_NO_POP_FLAG) ?
                0 : _PyUop_num_popped(base_opcode, inst->oparg);
            // Error stubs are shared only when both the target and the
            // number of stack values to pop agree.
            if (target != current_error_target || popped != current_popped) {
                current_popped = popped;
                current_error = next_spare;
                current_error_target = target;
                make_exit(&buffer[next_spare], _ERROR_POP_N_r00, 0, false);
                buffer[next_spare].operand0 = target;
                next_spare++;
            }
            buffer[i].error_target = current_error;
            if (buffer[i].format == UOP_FORMAT_TARGET) {
                buffer[i].format = UOP_FORMAT_JUMP;
                buffer[i].jump_target = 0;
            }
        }
        if (base_opcode == _JUMP_TO_TOP) {
            // The back-edge lands just after _START_EXECUTOR, at index 1.
            assert(_PyUop_Uncached[buffer[0].opcode] == _START_EXECUTOR);
            buffer[i].format = UOP_FORMAT_JUMP;
            buffer[i].jump_target = 1;
        }
    }
    return next_spare;
}
1285
1286
/* Executor side exits */
1287
1288
static _PyExecutorObject *
1289
allocate_executor(int exit_count, int length)
1290
{
1291
    int size = exit_count*sizeof(_PyExitData) + length*sizeof(_PyUOpInstruction);
1292
    _PyExecutorObject *res = PyObject_GC_NewVar(_PyExecutorObject, &_PyUOpExecutor_Type, size);
1293
    if (res == NULL) {
1294
        return NULL;
1295
    }
1296
    res->trace = (_PyUOpInstruction *)(res->exits + exit_count);
1297
    res->code_size = length;
1298
    res->exit_count = exit_count;
1299
    return res;
1300
}
1301
1302
#ifdef Py_DEBUG
1303
1304
#define CHECK(PRED) \
1305
if (!(PRED)) { \
1306
    printf(#PRED " at %d\n", i); \
1307
    assert(0); \
1308
}
1309
1310
static int
1311
target_unused(int opcode)
1312
{
1313
    return (_PyUop_Flags[opcode] & (HAS_ERROR_FLAG | HAS_EXIT_FLAG | HAS_DEOPT_FLAG)) == 0;
1314
}
1315
1316
static void
sanity_check(_PyExecutorObject *executor)
{
    /* Debug-only invariant checks on a finished executor:
     *  - every exit target fits in its bit-field,
     *  - the trace starts with a valid entry uop,
     *  - every uop is a cached-register variant with a valid base opcode,
     *  - jump and error targets stay within the code,
     *  - the trace has a terminator, and everything after it is a stub. */
    for (uint32_t i = 0; i < executor->exit_count; i++) {
        _PyExitData *exit = &executor->exits[i];
        CHECK(exit->target < (1 << 25));
    }
    bool ended = false;
    uint32_t i = 0;
    CHECK(_PyUop_Uncached[executor->trace[0].opcode] == _START_EXECUTOR ||
        _PyUop_Uncached[executor->trace[0].opcode] == _COLD_EXIT ||
        _PyUop_Uncached[executor->trace[0].opcode] == _COLD_DYNAMIC_EXIT);
    for (; i < executor->code_size; i++) {
        const _PyUOpInstruction *inst = &executor->trace[i];
        uint16_t opcode = inst->opcode;
        uint16_t base_opcode = _PyUop_Uncached[opcode];
        // Executors contain only cached-register variants (> MAX_UOP_ID).
        CHECK(opcode > MAX_UOP_ID);
        CHECK(opcode <= MAX_UOP_REGS_ID);
        CHECK(base_opcode <= MAX_UOP_ID);
        CHECK(base_opcode != 0);
        switch(inst->format) {
            case UOP_FORMAT_TARGET:
                // TARGET format is only legal when nothing reads the target.
                CHECK(target_unused(base_opcode));
                break;
            case UOP_FORMAT_JUMP:
                CHECK(inst->jump_target < executor->code_size);
                break;
        }
        if (_PyUop_Flags[base_opcode] & HAS_ERROR_FLAG) {
            CHECK(inst->format == UOP_FORMAT_JUMP);
            CHECK(inst->error_target < executor->code_size);
        }
        if (is_terminator(inst)) {
            ended = true;
            i++;
            break;
        }
    }
    CHECK(ended);
    // Everything after the terminator must be an exit/error stub.
    for (; i < executor->code_size; i++) {
        const _PyUOpInstruction *inst = &executor->trace[i];
        uint16_t base_opcode = _PyUop_Uncached[inst->opcode];
        CHECK(
            base_opcode == _DEOPT ||
            base_opcode == _HANDLE_PENDING_AND_DEOPT ||
            base_opcode == _EXIT_TRACE ||
            base_opcode == _ERROR_POP_N ||
            base_opcode == _DYNAMIC_EXIT);
    }
}
1366
1367
#undef CHECK
1368
#endif
1369
1370
/* Makes an executor from a buffer of uops.
 * Account for the buffer having gaps and NOPs by computing a "used"
 * bit vector and only copying the used uops. Here "used" means reachable
 * and not a NOP.
 */
static _PyExecutorObject *
make_executor_from_uops(_PyThreadStateImpl *tstate, _PyUOpInstruction *buffer, int length, const _PyBloomFilter *dependencies)
{
    int exit_count = count_exits(buffer, length);
    _PyExecutorObject *executor = allocate_executor(exit_count, length);
    if (executor == NULL) {
        return NULL;
    }

    /* Initialize exits */
    int chain_depth = tstate->jit_tracer_state->initial_state.chain_depth;
    _PyExecutorObject *cold = _PyExecutor_GetColdExecutor();
    _PyExecutorObject *cold_dynamic = _PyExecutor_GetColdDynamicExecutor();
    cold->vm_data.chain_depth = chain_depth;
    PyInterpreterState *interp = tstate->base.interp;
    for (int i = 0; i < exit_count; i++) {
        executor->exits[i].index = i;
        executor->exits[i].temperature = initial_temperature_backoff_counter(&interp->opt_config);
    }
    // Copy the trace back-to-front, wiring each exit uop to its
    // _PyExitData slot.  Exits are numbered in forward order, hence the
    // countdown from exit_count-1.
    int next_exit = exit_count-1;
    _PyUOpInstruction *dest = (_PyUOpInstruction *)&executor->trace[length];
    assert(_PyUop_Uncached[buffer[0].opcode] == _START_EXECUTOR);
    // _START_EXECUTOR carries a pointer back to its own executor.
    buffer[0].operand0 = (uint64_t)executor;
    for (int i = length-1; i >= 0; i--) {
        uint16_t base_opcode = _PyUop_Uncached[buffer[i].opcode];
        dest--;
        *dest = buffer[i];
        if (base_opcode == _EXIT_TRACE || base_opcode == _DYNAMIC_EXIT) {
            _PyExitData *exit = &executor->exits[next_exit];
            exit->target = buffer[i].target;
            dest->operand0 = (uint64_t)exit;
            // Until the exit warms up, it dispatches to a shared "cold"
            // executor of the appropriate kind.
            exit->executor = base_opcode == _EXIT_TRACE ? cold : cold_dynamic;
            exit->is_dynamic = (char)(base_opcode == _DYNAMIC_EXIT);
            exit->is_control_flow = (char)buffer[i].operand1;
            next_exit--;
        }
    }
    assert(next_exit == -1);
    assert(dest == executor->trace);
    assert(_PyUop_Uncached[dest->opcode] == _START_EXECUTOR);
    // Note: we MUST track it here before any Py_DECREF(executor) or
    // linking of executor. Otherwise, the GC tries to untrack a
    // still untracked object during dealloc.
    _PyObject_GC_TRACK(executor);
    if (_Py_ExecutorInit(executor, dependencies) < 0) {
        Py_DECREF(executor);
        return NULL;
    }
#ifdef Py_DEBUG
    char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
    int lltrace = 0;
    if (python_lltrace != NULL && *python_lltrace >= '0') {
        lltrace = *python_lltrace - '0';  // TODO: Parse an int and all that
    }
    if (lltrace >= 2) {
        printf("Optimized trace (length %d):\n", length);
        for (int i = 0; i < length; i++) {
            printf("%4d OPTIMIZED: ", i);
            _PyUOpPrint(&executor->trace[i]);
            printf("\n");
        }
    }
    sanity_check(executor);
#endif
#ifdef _Py_JIT
    executor->jit_code = NULL;
    executor->jit_size = 0;
    // This is initialized to false so we can prevent the executor
    // from being immediately detected as cold and invalidated.
    executor->vm_data.cold = false;
    if (_PyJIT_Compile(executor, executor->trace, length)) {
        Py_DECREF(executor);
        return NULL;
    }
#endif
    return executor;
}
1452
1453
#ifdef Py_STATS
1454
/* Returns the effective trace length.
1455
 * Ignores NOPs and trailing exit and error handling.*/
1456
int effective_trace_length(_PyUOpInstruction *buffer, int length)
1457
{
1458
    int nop_count = 0;
1459
    for (int i = 0; i < length; i++) {
1460
        int opcode = buffer[i].opcode;
1461
        if (opcode == _NOP) {
1462
            nop_count++;
1463
        }
1464
        if (is_terminator(&buffer[i])) {
1465
            return i+1-nop_count;
1466
        }
1467
    }
1468
    Py_FatalError("No terminating instruction");
1469
    Py_UNREACHABLE();
1470
}
1471
#endif
1472
1473
1474
static int
stack_allocate(_PyUOpInstruction *buffer, _PyUOpInstruction *output, int length)
{
    /* Rewrite the trace in `buffer` into `output`, replacing each uop with
     * its stack-cache-depth-specialized variant and inserting explicit
     * spill/reload transition uops whenever the cached depth changes.
     * _NOPs are dropped.  Returns the length of the rewritten trace.
     *
     * NOTE(review): assumes the _PyUop_Caching / _PyUop_SpillsAndReloads
     * tables are total for every reachable (uop, depth) pair — enforced
     * only by the asserts below; confirm against the table generator. */
    assert(buffer[0].opcode == _START_EXECUTOR);
    /* The input buffer and output buffers will overlap.
       Make sure that we can move instructions to the output
       without overwriting the input. */
    if (buffer == output) {
        // This can only happen if optimizer has not been run
        for (int i = 0; i < length; i++) {
            buffer[i + UOP_MAX_TRACE_LENGTH] = buffer[i];
        }
        buffer += UOP_MAX_TRACE_LENGTH;
    }
    else {
        assert(output + UOP_MAX_TRACE_LENGTH == buffer);
    }
    int depth = 0;  // current stack-cache depth (number of cached entries)
    _PyUOpInstruction *write = output;
    for (int i = 0; i < length; i++) {
        int uop = buffer[i].opcode;
        if (uop == _NOP) {
            continue;
        }
        /* Preferred depth at which to execute this uop; if it differs
         * from the current depth, emit a spill/reload transition uop. */
        int new_depth = _PyUop_Caching[uop].best[depth];
        if (new_depth != depth) {
            write->opcode = _PyUop_SpillsAndReloads[depth][new_depth];
            assert(write->opcode != 0);
            write->format = UOP_FORMAT_TARGET;
            write->oparg = 0;
            write->target = 0;
            write++;
            depth = new_depth;
        }
        /* Copy the instruction, then substitute the depth-specialized
         * opcode and advance to that variant's output depth. */
        *write = buffer[i];
        uint16_t new_opcode = _PyUop_Caching[uop].entries[depth].opcode;
        assert(new_opcode != 0);
        write->opcode = new_opcode;
        write++;
        depth = _PyUop_Caching[uop].entries[depth].output;
    }
    return (int)(write - output);
}
1517
1518
static int
uop_optimize(
    _PyInterpreterFrame *frame,
    PyThreadState *tstate,
    _PyExecutorObject **exec_ptr,
    bool progress_needed)
{
    /* Turn the trace recorded in the thread's tracer code buffer into an
     * executor: optionally run the tier-2 optimizer, expand replicated
     * uops, strip value-recording uops, run stack allocation and the
     * final execution fix-ups, then build the executor object.
     *
     * Returns 1 with *exec_ptr set on success, 0 when the trace is too
     * short to be worth compiling, and a value <= 0 propagated from the
     * optimizer (or -1 from executor creation) on failure. */
    _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
    assert(_tstate->jit_tracer_state != NULL);
    _PyUOpInstruction *buffer = _tstate->jit_tracer_state->code_buffer.start;
    OPT_STAT_INC(attempts);
    bool is_noopt = !tstate->interp->opt_config.uops_optimize_enabled;
    int curr_stackentries = _tstate->jit_tracer_state->initial_state.stack_depth;
    int length = uop_buffer_length(&_tstate->jit_tracer_state->code_buffer);
    if (length <= CODE_SIZE_NO_PROGRESS) {
        // Trace makes no forward progress; not worth an executor.
        return 0;
    }
    assert(length > 0);
    assert(length < UOP_MAX_TRACE_LENGTH);
    OPT_STAT_INC(traces_created);

    _PyBloomFilter dependencies;
    _Py_BloomFilter_Init(&dependencies);
    if (!is_noopt) {
        /* Optimizer writes into the second half of uop_array; the later
         * stack_allocate() pass moves it back to the first half. */
        _PyUOpInstruction *output = &_tstate->jit_tracer_state->uop_array[UOP_MAX_TRACE_LENGTH];
        length = _Py_uop_analyze_and_optimize(
            _tstate, buffer, length, curr_stackentries,
            output, &dependencies);

        if (length <= 0) {
            return length;
        }
        buffer = output;
    }
    assert(length < UOP_MAX_TRACE_LENGTH);
    assert(length >= 1);
    /* Fix up */
    for (int pc = 0; pc < length; pc++) {
        int opcode = buffer[pc].opcode;
        int oparg = buffer[pc].oparg;
        if (oparg < _PyUop_Replication[opcode].stop && oparg >= _PyUop_Replication[opcode].start) {
            /* Replace with the oparg-specialized replica of this uop;
             * replicas are laid out consecutively after the base opcode. */
            buffer[pc].opcode = opcode + oparg + 1 - _PyUop_Replication[opcode].start;
            assert(strncmp(_PyOpcode_uop_name[buffer[pc].opcode], _PyOpcode_uop_name[opcode], strlen(_PyOpcode_uop_name[opcode])) == 0);
        }
        else if (_PyUop_Flags[opcode] & HAS_RECORDS_VALUE_FLAG) {
            /* Recording uops hold a reference in operand0; release it and
             * drop the instruction. */
            Py_XDECREF((PyObject *)(uintptr_t)buffer[pc].operand0);
            buffer[pc].opcode = _NOP;
        }
        else if (is_terminator(&buffer[pc])) {
            break;
        }
        assert(_PyOpcode_uop_name[buffer[pc].opcode]);
    }
    // We've cleaned up the references in the buffer, so discard the code buffer
    // to avoid doing it again during tracer cleanup
    _PyJitUopBuffer *code_buffer = &_tstate->jit_tracer_state->code_buffer;
    code_buffer->next = code_buffer->start;

    OPT_HIST(effective_trace_length(buffer, length), optimized_trace_length_hist);
    _PyUOpInstruction *output = &_tstate->jit_tracer_state->uop_array[0];
    length = stack_allocate(buffer, output, length);
    buffer = output;
    length = prepare_for_execution(buffer, length);
    assert(length <= UOP_MAX_TRACE_LENGTH);
    _PyExecutorObject *executor = make_executor_from_uops(
        _tstate, buffer, length, &dependencies);
    if (executor == NULL) {
        return -1;
    }
    assert(length <= UOP_MAX_TRACE_LENGTH);

    // Check executor coldness
    // It's okay if this ends up going negative.
    if (--tstate->interp->executor_creation_counter == 0) {
        /* Periodically request a sweep of cold executors. */
        _Py_set_eval_breaker_bit(tstate, _PY_EVAL_JIT_INVALIDATE_COLD_BIT);
    }

    *exec_ptr = executor;
    return 1;
}
1598
1599
1600
/*****************************************
1601
 *        Executor management
1602
 ****************************************/
1603
1604
static int
link_executor(_PyExecutorObject *executor, const _PyBloomFilter *bloom)
{
    /* Register `executor` and its dependency bloom filter in the
     * interpreter's two parallel executor arrays, growing both on demand.
     * Returns 0 on success, -1 on allocation failure (no exception set).
     * The slot index is stored on the executor so unlink_executor() can
     * swap-remove it in O(1). */
    PyInterpreterState *interp = _PyInterpreterState_GET();
    if (interp->executor_count == interp->executor_capacity) {
        // Geometric growth, starting at 64 slots.
        size_t new_cap = interp->executor_capacity ? interp->executor_capacity * 2 : 64;
        _PyBloomFilter *new_blooms = PyMem_Realloc(
            interp->executor_blooms, new_cap * sizeof(_PyBloomFilter));
        if (new_blooms == NULL) {
            return -1;
        }
        _PyExecutorObject **new_ptrs = PyMem_Realloc(
            interp->executor_ptrs, new_cap * sizeof(_PyExecutorObject *));
        if (new_ptrs == NULL) {
            /* Revert blooms realloc — the old pointer may have been freed by
             * a successful realloc, but new_blooms is the valid pointer. */
            interp->executor_blooms = new_blooms;
            return -1;
        }
        /* Both reallocs succeeded; commit the new arrays and capacity. */
        interp->executor_blooms = new_blooms;
        interp->executor_ptrs = new_ptrs;
        interp->executor_capacity = new_cap;
    }
    size_t idx = interp->executor_count++;
    interp->executor_blooms[idx] = *bloom;
    interp->executor_ptrs[idx] = executor;
    executor->vm_data.bloom_array_idx = (int32_t)idx;
    return 0;
}
1633
1634
static void
1635
unlink_executor(_PyExecutorObject *executor)
1636
{
1637
    PyInterpreterState *interp = PyInterpreterState_Get();
1638
    int32_t idx = executor->vm_data.bloom_array_idx;
1639
    assert(idx >= 0 && (size_t)idx < interp->executor_count);
1640
    size_t last = --interp->executor_count;
1641
    if ((size_t)idx != last) {
1642
        /* Swap-remove: move the last element into the vacated slot */
1643
        interp->executor_blooms[idx] = interp->executor_blooms[last];
1644
        interp->executor_ptrs[idx] = interp->executor_ptrs[last];
1645
        interp->executor_ptrs[idx]->vm_data.bloom_array_idx = idx;
1646
    }
1647
    executor->vm_data.bloom_array_idx = -1;
1648
}
1649
1650
/* This must be called by optimizers before using the executor */
1651
int
1652
_Py_ExecutorInit(_PyExecutorObject *executor, const _PyBloomFilter *dependency_set)
1653
{
1654
    executor->vm_data.valid = true;
1655
    executor->vm_data.pending_deletion = 0;
1656
    executor->vm_data.code = NULL;
1657
    if (link_executor(executor, dependency_set) < 0) {
1658
        return -1;
1659
    }
1660
    return 0;
1661
}
1662
1663
static _PyExecutorObject *
make_cold_executor(uint16_t opcode)
{
    /* Build the shared single-uop "cold" executor whose only instruction
     * is `opcode` (_COLD_EXIT_r00 or _COLD_DYNAMIC_EXIT_r00).  The result
     * is made immortal, so it is never freed.  Allocation or JIT
     * compilation failure is fatal: the JIT cannot run without it. */
    _PyExecutorObject *cold = allocate_executor(0, 1);
    if (cold == NULL) {
        Py_FatalError("Cannot allocate core JIT code");
    }
    ((_PyUOpInstruction *)cold->trace)->opcode = opcode;
    // This is initialized to false so we can prevent the executor
    // from being immediately detected as cold and invalidated.
    cold->vm_data.cold = false;
#ifdef _Py_JIT
    cold->jit_code = NULL;
    cold->jit_size = 0;
    if (_PyJIT_Compile(cold, cold->trace, 1)) {
        Py_DECREF(cold);
        Py_FatalError("Cannot allocate core JIT code");
    }
#endif
    _Py_SetImmortal((PyObject *)cold);
    return cold;
}
1685
1686
_PyExecutorObject *
1687
_PyExecutor_GetColdExecutor(void)
1688
{
1689
    PyInterpreterState *interp = _PyInterpreterState_GET();
1690
    if (interp->cold_executor == NULL) {
1691
        return interp->cold_executor = make_cold_executor(_COLD_EXIT_r00);;
1692
    }
1693
    return interp->cold_executor;
1694
}
1695
1696
_PyExecutorObject *
1697
_PyExecutor_GetColdDynamicExecutor(void)
1698
{
1699
    PyInterpreterState *interp = _PyInterpreterState_GET();
1700
    if (interp->cold_dynamic_executor == NULL) {
1701
        interp->cold_dynamic_executor = make_cold_executor(_COLD_DYNAMIC_EXIT_r00);
1702
    }
1703
    return interp->cold_dynamic_executor;
1704
}
1705
1706
void
1707
_PyExecutor_ClearExit(_PyExitData *exit)
1708
{
1709
    if (exit == NULL) {
1710
        return;
1711
    }
1712
    _PyExecutorObject *old = exit->executor;
1713
    if (exit->is_dynamic) {
1714
        exit->executor = _PyExecutor_GetColdDynamicExecutor();
1715
    }
1716
    else {
1717
        exit->executor = _PyExecutor_GetColdExecutor();
1718
    }
1719
    Py_DECREF(old);
1720
}
1721
1722
/* Detaches the executor from the code object (if any) that
 * holds a reference to it */
void
_Py_ExecutorDetach(_PyExecutorObject *executor)
{
    PyCodeObject *code = executor->vm_data.code;
    if (code == NULL) {
        // Not attached to any code object; nothing to do.
        return;
    }
    _Py_CODEUNIT *instruction = &_PyCode_CODE(code)[executor->vm_data.index];
    assert(instruction->op.code == ENTER_EXECUTOR);
    int index = instruction->op.arg;
    assert(code->co_executors->executors[index] == executor);
    /* Restore the original (deoptimized) instruction that
     * ENTER_EXECUTOR replaced. */
    instruction->op.code = _PyOpcode_Deopt[executor->vm_data.opcode];
    instruction->op.arg = executor->vm_data.oparg;
    executor->vm_data.code = NULL;
    code->co_executors->executors[index] = NULL;
    // Drop the reference the code object's executor table held.
    Py_DECREF(executor);
}
1741
1742
/* Executors can be invalidated at any time,
   even with a stop-the-world lock held.
   Consequently it must not run arbitrary code,
   including Py_DECREF with a non-executor. */
static void
executor_invalidate(PyObject *op)
{
    _PyExecutorObject *executor = _PyExecutorObject_CAST(op);
    if (!executor->vm_data.valid) {
        // Already invalidated; idempotent.
        return;
    }
    executor->vm_data.valid = 0;
    // Remove from the interpreter's bloom/pointer arrays.
    unlink_executor(executor);
    // Redirect all side exits back to the shared cold executors.
    executor_clear_exits(executor);
    // Restore the original bytecode in the owning code object (if any).
    _Py_ExecutorDetach(executor);
    _PyObject_GC_UNTRACK(op);
}
1759
1760
static int
1761
executor_clear(PyObject *op)
1762
{
1763
    executor_invalidate(op);
1764
    return 0;
1765
}
1766
1767
void
1768
_Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj)
1769
{
1770
    assert(executor->vm_data.valid);
1771
    PyInterpreterState *interp = _PyInterpreterState_GET();
1772
    int32_t idx = executor->vm_data.bloom_array_idx;
1773
    assert(idx >= 0 && (size_t)idx < interp->executor_count);
1774
    _Py_BloomFilter_Add(&interp->executor_blooms[idx], obj);
1775
}
1776
1777
/* Invalidate all executors that depend on `obj`
 * May cause other executors to be invalidated as well.
 * Uses contiguous bloom filter array for cache-friendly scanning.
 */
void
_Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation)
{
    _PyBloomFilter obj_filter;
    _Py_BloomFilter_Init(&obj_filter);
    _Py_BloomFilter_Add(&obj_filter, obj);
    /* Scan contiguous bloom filter array */
    PyObject *invalidate = PyList_New(0);
    if (invalidate == NULL) {
        goto error;
    }
    /* Clearing an executor can clear others, so we need to make a list of
     * executors to invalidate first */
    for (size_t i = 0; i < interp->executor_count; i++) {
        assert(interp->executor_ptrs[i]->vm_data.valid);
        /* Bloom filters can give false positives but never false
         * negatives, so at worst this over-invalidates. */
        if (bloom_filter_may_contain(&interp->executor_blooms[i], &obj_filter) &&
            PyList_Append(invalidate, (PyObject *)interp->executor_ptrs[i]))
        {
            goto error;
        }
    }
    /* Second phase: invalidate everything collected above. */
    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) {
        PyObject *exec = PyList_GET_ITEM(invalidate, i);
        executor_invalidate(exec);
        if (is_invalidation) {
            OPT_STAT_INC(executors_invalidated);
        }
    }
    Py_DECREF(invalidate);
    return;
error:
    PyErr_Clear();
    Py_XDECREF(invalidate);
    // If we're truly out of memory, wiping out everything is a fine fallback:
    _Py_Executors_InvalidateAll(interp, is_invalidation);
}
1817
1818
/* Invalidate all executors */
void
_Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation)
{
    /* Invalidation unlinks entries from the executor array via
     * swap-remove, so repeatedly take the last element until drained. */
    while (interp->executor_count > 0) {
        /* Invalidate from the end to avoid repeated swap-remove shifts */
        _PyExecutorObject *executor = interp->executor_ptrs[interp->executor_count - 1];
        assert(executor->vm_data.valid);
        if (executor->vm_data.code) {
            // Clear the entire code object so its co_executors array be freed:
            _PyCode_Clear_Executors(executor->vm_data.code);
        }
        else {
            executor_invalidate((PyObject *)executor);
        }
        if (is_invalidation) {
            OPT_STAT_INC(executors_invalidated);
        }
    }
}
1838
1839
void
_Py_Executors_InvalidateCold(PyInterpreterState *interp)
{
    /* Sweep for idle executors: invalidate every executor whose `cold`
     * flag is still set from the previous sweep, and mark the rest cold
     * for the next one.  NOTE(review): this presumes something clears
     * `vm_data.cold` when an executor runs — confirm against the
     * executor entry path. */
    /* Scan contiguous executor array */
    PyObject *invalidate = PyList_New(0);
    if (invalidate == NULL) {
        goto error;
    }

    /* Clearing an executor can deallocate others, so we need to make a list of
     * executors to invalidate first */
    for (size_t i = 0; i < interp->executor_count; i++) {
        _PyExecutorObject *exec = interp->executor_ptrs[i];
        assert(exec->vm_data.valid);

        if (exec->vm_data.cold && PyList_Append(invalidate, (PyObject *)exec) < 0) {
            goto error;
        }
        else {
            /* Not yet collected: mark cold for the next sweep. */
            exec->vm_data.cold = true;
        }
    }
    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) {
        PyObject *exec = PyList_GET_ITEM(invalidate, i);
        executor_invalidate(exec);
    }
    Py_DECREF(invalidate);
    return;
error:
    PyErr_Clear();
    Py_XDECREF(invalidate);
    // If we're truly out of memory, wiping out everything is a fine fallback
    _Py_Executors_InvalidateAll(interp, 0);
}
1873
1874
#include "record_functions.c.h"
1875
1876
static int
1877
escape_angles(const char *input, Py_ssize_t size, char *buffer) {
1878
    int written = 0;
1879
    for (Py_ssize_t i = 0; i < size; i++) {
1880
        char c = input[i];
1881
        if (c == '<' || c == '>') {
1882
            buffer[written++] = '&';
1883
            buffer[written++] = c == '>' ? 'g' : 'l';
1884
            buffer[written++] = 't';
1885
            buffer[written++] = ';';
1886
        }
1887
        else {
1888
            buffer[written++] = c;
1889
        }
1890
    }
1891
    return written;
1892
}
1893
1894
static void
write_str(PyObject *str, FILE *out)
{
    /* Write `str` to `out` as UTF-8 with angle brackets HTML-escaped,
     * truncating to 24 bytes (followed by "...").  Encoding errors are
     * swallowed and the label is simply omitted. */
    // Encode the Unicode object to the specified encoding
    PyObject *encoded_obj = PyUnicode_AsEncodedString(str, "utf8", "strict");
    if (encoded_obj == NULL) {
        PyErr_Clear();
        return;
    }
    const char *encoded_str = PyBytes_AsString(encoded_obj);
    Py_ssize_t encoded_size = PyBytes_Size(encoded_obj);
    // Worst case escaping expands 4x: 24 * 4 = 96 bytes, fits in 120.
    char buffer[120];
    bool truncated = false;
    if (encoded_size > 24) {
        encoded_size = 24;
        truncated = true;
    }
    int size = escape_angles(encoded_str, encoded_size, buffer);
    fwrite(buffer, 1, size, out);
    if (truncated) {
        fwrite("...", 1, 3, out);
    }
    Py_DECREF(encoded_obj);
}
1918
1919
static int
find_line_number(PyCodeObject *code, _PyExecutorObject *executor)
{
    /* Scan `code` for the ENTER_EXECUTOR instruction that dispatches to
     * `executor` and return its source line number, or -1 if not found. */
    int code_len = (int)Py_SIZE(code);
    for (int i = 0; i < code_len; i++) {
        _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i];
        int opcode = instr->op.code;
        if (opcode == ENTER_EXECUTOR) {
            _PyExecutorObject *exec = code->co_executors->executors[instr->op.arg];
            if (exec == executor) {
                // Addr2Line takes byte addresses; each code unit is 2 bytes.
                return PyCode_Addr2Line(code, i*2);
            }
        }
        // Skip the inline cache entries of the underlying instruction.
        i += _PyOpcode_Caches[_Py_GetBaseCodeUnit(code, i).op.code];
    }
    return -1;
}
1936
1937
#define RED "#ff0000"
1938
#define WHITE "#ffffff"
1939
#define BLUE "#0000ff"
1940
#define BLACK "#000000"
1941
#define LOOP "#00c000"
1942
1943
#ifdef Py_STATS
1944
1945
static const char *COLORS[10] = {
1946
    "9",
1947
    "8",
1948
    "7",
1949
    "6",
1950
    "5",
1951
    "4",
1952
    "3",
1953
    "2",
1954
    "1",
1955
    WHITE,
1956
};
1957
const char *
1958
get_background_color(_PyUOpInstruction const *inst, uint64_t max_hotness)
1959
{
1960
    uint64_t hotness = inst->execution_count;
1961
    int index = (hotness * 10)/max_hotness;
1962
    if (index > 9) {
1963
        index = 9;
1964
    }
1965
    if (index < 0) {
1966
        index = 0;
1967
    }
1968
    return COLORS[index];
1969
}
1970
1971
const char *
1972
get_foreground_color(_PyUOpInstruction const *inst, uint64_t max_hotness)
1973
{
1974
    if(_PyUop_Uncached[inst->opcode] == _DEOPT) {
1975
        return RED;
1976
    }
1977
    uint64_t hotness = inst->execution_count;
1978
    int index = (hotness * 10)/max_hotness;
1979
    if (index > 3) {
1980
        return BLACK;
1981
    }
1982
    return WHITE;
1983
}
1984
#endif
1985
1986
static void
write_row_for_uop(_PyExecutorObject *executor, uint32_t i, FILE *out)
{
    /* Write row for uop.
        * The `port` is a marker so that outgoing edges can
        * be placed correctly. If a row is marked `port=17`,
        * then the outgoing edge is `{EXEC_NAME}:17 -> {TARGET}`
        * https://graphviz.readthedocs.io/en/stable/manual.html#node-ports-compass
        */
    _PyUOpInstruction const *inst = &executor->trace[i];
    const char *opname = _PyOpcode_uop_name[inst->opcode];
#ifdef Py_STATS
    /* With stats, color the cell by hotness relative to the trace entry
     * point (trace[0]) and show the execution count. */
    const char *bg_color = get_background_color(inst, executor->trace[0].execution_count);
    const char *color = get_foreground_color(inst, executor->trace[0].execution_count);
    fprintf(out, "        <tr><td port=\"i%d\" border=\"1\" color=\"%s\" bgcolor=\"%s\" ><font color=\"%s\"> %s &nbsp;--&nbsp; %" PRIu64 "</font></td></tr>\n",
        i, color, bg_color, color, opname, inst->execution_count);
#else
    /* Without stats, just flag deopts in red and show operand0. */
    const char *color = (_PyUop_Uncached[inst->opcode] == _DEOPT) ? RED : BLACK;
    fprintf(out, "        <tr><td port=\"i%d\" border=\"1\" color=\"%s\" >%s op0=%" PRIu64 "</td></tr>\n", i, color, opname, inst->operand0);
#endif
}
2007
2008
static bool
2009
is_stop(_PyUOpInstruction const *inst)
2010
{
2011
    uint16_t base_opcode = _PyUop_Uncached[inst->opcode];
2012
    return (base_opcode == _EXIT_TRACE || base_opcode == _DEOPT || base_opcode == _JUMP_TO_TOP);
2013
}
2014
2015
2016
/* Writes the node and outgoing edges for a single tracelet in graphviz format.
 * Each tracelet is presented as a table of the uops it contains.
 * If Py_STATS is enabled, execution counts are included.
 *
 * https://graphviz.readthedocs.io/en/stable/manual.html
 * https://graphviz.org/gallery/
 */
static void
executor_to_gv(_PyExecutorObject *executor, FILE *out)
{
    PyCodeObject *code = executor->vm_data.code;
    fprintf(out, "executor_%p [\n", executor);
    fprintf(out, "    shape = none\n");

    /* Write the HTML table for the uops */
    fprintf(out, "    label = <<table border=\"0\" cellspacing=\"0\">\n");
    fprintf(out, "        <tr><td port=\"start\" border=\"1\" ><b>Executor</b></td></tr>\n");
    if (code == NULL) {
        fprintf(out, "        <tr><td border=\"1\" >No code object</td></tr>\n");
    }
    else {
        // Header row: qualified name and source line of the entry point.
        fprintf(out, "        <tr><td  border=\"1\" >");
        write_str(code->co_qualname, out);
        int line = find_line_number(code, executor);
        fprintf(out, ": %d</td></tr>\n", line);
    }
    // One table row per uop, stopping at the first trace-ending uop.
    for (uint32_t i = 0; i < executor->code_size; i++) {
        write_row_for_uop(executor, i, out);
        if (is_stop(&executor->trace[i])) {
            break;
        }
    }
    fprintf(out, "    </table>>\n");
    fprintf(out, "]\n\n");

    /* Write all the outgoing edges */
    _PyExecutorObject *cold = _PyExecutor_GetColdExecutor();
    _PyExecutorObject *cold_dynamic = _PyExecutor_GetColdDynamicExecutor();
    for (uint32_t i = 0; i < executor->code_size; i++) {
        _PyUOpInstruction const *inst = &executor->trace[i];
        uint16_t base_opcode = _PyUop_Uncached[inst->opcode];
        uint16_t flags = _PyUop_Flags[base_opcode];
        _PyExitData *exit = NULL;
        if (base_opcode == _JUMP_TO_TOP) {
            // Loop back-edge within the same executor.
            fprintf(out, "executor_%p:i%d -> executor_%p:i%d [color = \"" LOOP "\"]\n", executor, i, executor, inst->jump_target);
            break;
        }
        if (base_opcode == _EXIT_TRACE) {
            exit = (_PyExitData *)inst->operand0;
        }
        else if (flags & HAS_EXIT_FLAG) {
            /* Guard uop: follow its jump target to the exit instruction
             * that owns the _PyExitData. */
            assert(inst->format == UOP_FORMAT_JUMP);
            _PyUOpInstruction const *exit_inst = &executor->trace[inst->jump_target];
            uint16_t base_exit_opcode = _PyUop_Uncached[exit_inst->opcode];
            (void)base_exit_opcode;
            assert(base_exit_opcode == _EXIT_TRACE || base_exit_opcode == _DYNAMIC_EXIT);
            exit = (_PyExitData *)exit_inst->operand0;
        }
        if (exit != NULL) {
            if (exit->executor == cold || exit->executor == cold_dynamic) {
#ifdef Py_STATS
                /* Only mark as have cold exit if it has actually exited */
                uint64_t diff = inst->execution_count - executor->trace[i+1].execution_count;
                if (diff) {
                    fprintf(out, "cold_%p%d [ label = \"%"  PRIu64  "\" shape = ellipse color=\"" BLUE "\" ]\n", executor, i, diff);
                    fprintf(out, "executor_%p:i%d -> cold_%p%d\n", executor, i, executor, i);
                }
#endif
            }
            else {
                // Edge to the executor this exit has been linked to.
                fprintf(out, "executor_%p:i%d -> executor_%p:start\n", executor, i, exit->executor);
            }
        }
        if (is_stop(inst)) {
            break;
        }
    }
}
2094
2095
/* Write the graph of all the live tracelets in graphviz format. */
2096
int
2097
_PyDumpExecutors(FILE *out)
2098
{
2099
    fprintf(out, "digraph ideal {\n\n");
2100
    fprintf(out, "    rankdir = \"LR\"\n\n");
2101
    fprintf(out, "    node [colorscheme=greys9]\n");
2102
    PyInterpreterState *interp = PyInterpreterState_Get();
2103
    for (size_t i = 0; i < interp->executor_count; i++) {
2104
        executor_to_gv(interp->executor_ptrs[i], out);
2105
    }
2106
    fprintf(out, "}\n\n");
2107
    return 0;
2108
}
2109
2110
#else
2111
2112
int
_PyDumpExecutors(FILE *out)
{
    /* Tier-2 support compiled out: graph dumping is unavailable.
     * Raises NotImplementedError and returns -1. */
    PyErr_SetString(PyExc_NotImplementedError, "No JIT available");
    return -1;
}
2118
2119
void
_PyExecutor_Free(struct _PyExecutorObject *self)
{
    /* Stub for builds without tier-2: no executors are ever created,
     * so freeing one is impossible. */
    /* This should never be called */
    Py_UNREACHABLE();
}
2125
2126
#endif /* _Py_TIER2 */