Coverage Report

Created: 2025-11-24 06:11

/src/cpython/Python/optimizer.c
Line    Count    Source
1
#include "Python.h"
2
3
#ifdef _Py_TIER2
4
5
#include "opcode.h"
6
#include "pycore_interp.h"
7
#include "pycore_backoff.h"
8
#include "pycore_bitutils.h"        // _Py_popcount32()
9
#include "pycore_ceval.h"       // _Py_set_eval_breaker_bit
10
#include "pycore_code.h"            // _Py_GetBaseCodeUnit
11
#include "pycore_function.h"        // _PyFunction_LookupByVersion()
12
#include "pycore_interpframe.h"
13
#include "pycore_object.h"          // _PyObject_GC_UNTRACK()
14
#include "pycore_opcode_metadata.h" // _PyOpcode_OpName[]
15
#include "pycore_opcode_utils.h"  // MAX_REAL_OPCODE
16
#include "pycore_optimizer.h"     // _Py_uop_analyze_and_optimize()
17
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
18
#include "pycore_tuple.h"         // _PyTuple_FromArraySteal
19
#include "pycore_unicodeobject.h" // _PyUnicode_FromASCII
20
#include "pycore_uop_ids.h"
21
#include "pycore_jit.h"
22
#include <stdbool.h>
23
#include <stdint.h>
24
#include <stddef.h>
25
26
#define NEED_OPCODE_METADATA
27
#include "pycore_uop_metadata.h" // Uop tables
28
#undef NEED_OPCODE_METADATA
29
30
#define MAX_EXECUTORS_SIZE 256
31
32
// Trace too short, no progress:
33
// _START_EXECUTOR
34
// _MAKE_WARM
35
// _CHECK_VALIDITY
36
// _SET_IP
37
// is 4-5 instructions.
38
#define CODE_SIZE_NO_PROGRESS 5
39
// We start with _START_EXECUTOR, _MAKE_WARM
40
#define CODE_SIZE_EMPTY 2
41
42
#define _PyExecutorObject_CAST(op)  ((_PyExecutorObject *)(op))
43
44
static bool
45
has_space_for_executor(PyCodeObject *code, _Py_CODEUNIT *instr)
46
{
47
    if (code == (PyCodeObject *)&_Py_InitCleanup) {
48
        return false;
49
    }
50
    if (instr->op.code == ENTER_EXECUTOR) {
51
        return true;
52
    }
53
    if (code->co_executors == NULL) {
54
        return true;
55
    }
56
    return code->co_executors->size < MAX_EXECUTORS_SIZE;
57
}
58
59
static int32_t
60
get_index_for_executor(PyCodeObject *code, _Py_CODEUNIT *instr)
61
{
62
    if (instr->op.code == ENTER_EXECUTOR) {
63
        return instr->op.arg;
64
    }
65
    _PyExecutorArray *old = code->co_executors;
66
    int size = 0;
67
    int capacity = 0;
68
    if (old != NULL) {
69
        size = old->size;
70
        capacity = old->capacity;
71
        assert(size < MAX_EXECUTORS_SIZE);
72
    }
73
    assert(size <= capacity);
74
    if (size == capacity) {
75
        /* Array is full. Grow array */
76
        int new_capacity = capacity ? capacity * 2 : 4;
77
        _PyExecutorArray *new = PyMem_Realloc(
78
            old,
79
            offsetof(_PyExecutorArray, executors) +
80
            new_capacity * sizeof(_PyExecutorObject *));
81
        if (new == NULL) {
82
            return -1;
83
        }
84
        new->capacity = new_capacity;
85
        new->size = size;
86
        code->co_executors = new;
87
    }
88
    assert(size < code->co_executors->capacity);
89
    return size;
90
}
91
92
static void
93
insert_executor(PyCodeObject *code, _Py_CODEUNIT *instr, int index, _PyExecutorObject *executor)
94
{
95
    Py_INCREF(executor);
96
    if (instr->op.code == ENTER_EXECUTOR) {
97
        assert(index == instr->op.arg);
98
        _Py_ExecutorDetach(code->co_executors->executors[index]);
99
    }
100
    else {
101
        assert(code->co_executors->size == index);
102
        assert(code->co_executors->capacity > index);
103
        code->co_executors->size++;
104
    }
105
    executor->vm_data.opcode = instr->op.code;
106
    executor->vm_data.oparg = instr->op.arg;
107
    executor->vm_data.code = code;
108
    executor->vm_data.index = (int)(instr - _PyCode_CODE(code));
109
    code->co_executors->executors[index] = executor;
110
    assert(index < MAX_EXECUTORS_SIZE);
111
    instr->op.code = ENTER_EXECUTOR;
112
    instr->op.arg = index;
113
}
114
115
static _PyExecutorObject *
116
make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFilter *dependencies, int chain_depth);
117
118
static int
119
uop_optimize(_PyInterpreterFrame *frame, PyThreadState *tstate,
120
             _PyExecutorObject **exec_ptr,
121
             bool progress_needed);
122
123
/* Returns 1 if optimized, 0 if not optimized, and -1 for an error.
124
 * If optimized, *executor_ptr contains a new reference to the executor
125
 */
126
// gh-137573: inlining this function causes stack overflows
127
Py_NO_INLINE int
128
_PyOptimizer_Optimize(
129
    _PyInterpreterFrame *frame, PyThreadState *tstate)
130
{
131
    _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
132
    int chain_depth = _tstate->jit_tracer_state.initial_state.chain_depth;
133
    PyInterpreterState *interp = _PyInterpreterState_GET();
134
    if (!interp->jit) {
135
        // gh-140936: It is possible that interp->jit will become false during
136
        // interpreter finalization. However, the specialized JUMP_BACKWARD_JIT
137
        // instruction may still be present. In this case, we should
138
        // return immediately without optimization.
139
        return 0;
140
    }
141
    assert(!interp->compiling);
142
    assert(_tstate->jit_tracer_state.initial_state.stack_depth >= 0);
143
#ifndef Py_GIL_DISABLED
144
    assert(_tstate->jit_tracer_state.initial_state.func != NULL);
145
    interp->compiling = true;
146
    // The first executor in a chain and the MAX_CHAIN_DEPTH'th executor *must*
147
    // make progress in order to avoid infinite loops or excessively-long
148
    // side-exit chains. We can only insert the executor into the bytecode if
149
    // this is true, since a deopt won't infinitely re-enter the executor:
150
    chain_depth %= MAX_CHAIN_DEPTH;
151
    bool progress_needed = chain_depth == 0;
152
    PyCodeObject *code = (PyCodeObject *)_tstate->jit_tracer_state.initial_state.code;
153
    _Py_CODEUNIT *start = _tstate->jit_tracer_state.initial_state.start_instr;
154
    if (progress_needed && !has_space_for_executor(code, start)) {
155
        interp->compiling = false;
156
        return 0;
157
    }
158
    // One of our dependencies while tracing was invalidated. Not worth compiling.
159
    if (!_tstate->jit_tracer_state.prev_state.dependencies_still_valid) {
160
        interp->compiling = false;
161
        return 0;
162
    }
163
    _PyExecutorObject *executor;
164
    int err = uop_optimize(frame, tstate, &executor, progress_needed);
165
    if (err <= 0) {
166
        interp->compiling = false;
167
        return err;
168
    }
169
    assert(executor != NULL);
170
    if (progress_needed) {
171
        int index = get_index_for_executor(code, start);
172
        if (index < 0) {
173
            /* Out of memory. Don't raise and assume that the
174
             * error will show up elsewhere.
175
             *
176
             * If an optimizer has already produced an executor,
177
             * it might get confused by the executor disappearing,
178
             * but there is not much we can do about that here. */
179
            Py_DECREF(executor);
180
            interp->compiling = false;
181
            return 0;
182
        }
183
        insert_executor(code, start, index, executor);
184
    }
185
    else {
186
        executor->vm_data.code = NULL;
187
    }
188
    _PyExitData *exit = _tstate->jit_tracer_state.initial_state.exit;
189
    if (exit != NULL) {
190
        exit->executor = executor;
191
    }
192
    executor->vm_data.chain_depth = chain_depth;
193
    assert(executor->vm_data.valid);
194
    interp->compiling = false;
195
    return 1;
196
#else
197
    return 0;
198
#endif
199
}
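/* Illustrative sketch (editor's addition, not part of optimizer.c): how a
 * hypothetical caller might consume the tri-state return value documented
 * above (-1 error, 0 not optimized, 1 optimized with the executor installed). */
static int
try_optimize_example(_PyInterpreterFrame *frame, PyThreadState *tstate)
{
    int err = _PyOptimizer_Optimize(frame, tstate);
    if (err < 0) {
        return -1;   /* an exception has been set; propagate the error */
    }
    if (err == 0) {
        return 0;    /* not optimized; keep executing tier 1 bytecode */
    }
    return 1;        /* optimized; ENTER_EXECUTOR or a side exit now refers to it */
}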
200
201
static _PyExecutorObject *
202
get_executor_lock_held(PyCodeObject *code, int offset)
203
{
204
    int code_len = (int)Py_SIZE(code);
205
    for (int i = 0 ; i < code_len;) {
206
        if (_PyCode_CODE(code)[i].op.code == ENTER_EXECUTOR && i*2 == offset) {
207
            int oparg = _PyCode_CODE(code)[i].op.arg;
208
            _PyExecutorObject *res = code->co_executors->executors[oparg];
209
            Py_INCREF(res);
210
            return res;
211
        }
212
        i += _PyInstruction_GetLength(code, i);
213
    }
214
    PyErr_SetString(PyExc_ValueError, "no executor at given byte offset");
215
    return NULL;
216
}
217
218
_PyExecutorObject *
219
_Py_GetExecutor(PyCodeObject *code, int offset)
220
{
221
    _PyExecutorObject *executor;
222
    Py_BEGIN_CRITICAL_SECTION(code);
223
    executor = get_executor_lock_held(code, offset);
224
    Py_END_CRITICAL_SECTION();
225
    return executor;
226
}
227
228
static PyObject *
229
is_valid(PyObject *self, PyObject *Py_UNUSED(ignored))
230
{
231
    return PyBool_FromLong(((_PyExecutorObject *)self)->vm_data.valid);
232
}
233
234
static PyObject *
235
get_opcode(PyObject *self, PyObject *Py_UNUSED(ignored))
236
{
237
    return PyLong_FromUnsignedLong(((_PyExecutorObject *)self)->vm_data.opcode);
238
}
239
240
static PyObject *
241
get_oparg(PyObject *self, PyObject *Py_UNUSED(ignored))
242
{
243
    return PyLong_FromUnsignedLong(((_PyExecutorObject *)self)->vm_data.oparg);
244
}
245
246
///////////////////// Experimental UOp Optimizer /////////////////////
247
248
static int executor_clear(PyObject *executor);
249
static void unlink_executor(_PyExecutorObject *executor);
250
251
252
void
253
_PyExecutor_Free(_PyExecutorObject *self)
254
{
255
#ifdef _Py_JIT
256
    _PyJIT_Free(self);
257
#endif
258
    PyObject_GC_Del(self);
259
}
260
261
void
262
_Py_ClearExecutorDeletionList(PyInterpreterState *interp)
263
{
264
    _PyRuntimeState *runtime = &_PyRuntime;
265
    HEAD_LOCK(runtime);
266
    PyThreadState* ts = PyInterpreterState_ThreadHead(interp);
267
    HEAD_UNLOCK(runtime);
268
    while (ts) {
269
        _PyExecutorObject *current = (_PyExecutorObject *)ts->current_executor;
270
        if (current != NULL) {
271
            /* Anything in this list will be unlinked, so we can reuse the
272
             * linked field as a reachability marker. */
273
            current->vm_data.linked = 1;
274
        }
275
        HEAD_LOCK(runtime);
276
        ts = PyThreadState_Next(ts);
277
        HEAD_UNLOCK(runtime);
278
    }
279
    _PyExecutorObject **prev_to_next_ptr = &interp->executor_deletion_list_head;
280
    _PyExecutorObject *exec = *prev_to_next_ptr;
281
    while (exec != NULL) {
282
        if (exec->vm_data.linked) {
283
            // This executor is currently executing
284
            exec->vm_data.linked = 0;
285
            prev_to_next_ptr = &exec->vm_data.links.next;
286
        }
287
        else {
288
            *prev_to_next_ptr = exec->vm_data.links.next;
289
            _PyExecutor_Free(exec);
290
        }
291
        exec = *prev_to_next_ptr;
292
    }
293
    interp->executor_deletion_list_remaining_capacity = EXECUTOR_DELETE_LIST_MAX;
294
}
295
296
static void
297
add_to_pending_deletion_list(_PyExecutorObject *self)
298
{
299
    PyInterpreterState *interp = PyInterpreterState_Get();
300
    self->vm_data.links.next = interp->executor_deletion_list_head;
301
    interp->executor_deletion_list_head = self;
302
    if (interp->executor_deletion_list_remaining_capacity > 0) {
303
        interp->executor_deletion_list_remaining_capacity--;
304
    }
305
    else {
306
        _Py_ClearExecutorDeletionList(interp);
307
    }
308
}
309
310
static void
311
uop_dealloc(PyObject *op) {
312
    _PyExecutorObject *self = _PyExecutorObject_CAST(op);
313
    _PyObject_GC_UNTRACK(self);
314
    assert(self->vm_data.code == NULL);
315
    unlink_executor(self);
316
    // Once unlinked it becomes impossible to invalidate an executor, so do it here.
317
    self->vm_data.valid = 0;
318
    add_to_pending_deletion_list(self);
319
}
320
321
const char *
322
_PyUOpName(int index)
323
{
324
    if (index < 0 || index > MAX_UOP_ID) {
325
        return NULL;
326
    }
327
    return _PyOpcode_uop_name[index];
328
}
329
330
#ifdef Py_DEBUG
331
void
332
_PyUOpPrint(const _PyUOpInstruction *uop)
333
{
334
    const char *name = _PyUOpName(uop->opcode);
335
    if (name == NULL) {
336
        printf("<uop %d>", uop->opcode);
337
    }
338
    else {
339
        printf("%s", name);
340
    }
341
    switch(uop->format) {
342
        case UOP_FORMAT_TARGET:
343
            printf(" (%d, target=%d, operand0=%#" PRIx64 ", operand1=%#" PRIx64,
344
                uop->oparg,
345
                uop->target,
346
                (uint64_t)uop->operand0,
347
                (uint64_t)uop->operand1);
348
            break;
349
        case UOP_FORMAT_JUMP:
350
            printf(" (%d, jump_target=%d, operand0=%#" PRIx64 ", operand1=%#" PRIx64,
351
                uop->oparg,
352
                uop->jump_target,
353
                (uint64_t)uop->operand0,
354
                (uint64_t)uop->operand1);
355
            break;
356
        default:
357
            printf(" (%d, Unknown format)", uop->oparg);
358
    }
359
    if (_PyUop_Flags[uop->opcode] & HAS_ERROR_FLAG) {
360
        printf(", error_target=%d", uop->error_target);
361
    }
362
363
    printf(")");
364
}
365
#endif
366
367
static Py_ssize_t
368
uop_len(PyObject *op)
369
{
370
    _PyExecutorObject *self = _PyExecutorObject_CAST(op);
371
    return self->code_size;
372
}
373
374
static PyObject *
375
uop_item(PyObject *op, Py_ssize_t index)
376
{
377
    _PyExecutorObject *self = _PyExecutorObject_CAST(op);
378
    Py_ssize_t len = uop_len(op);
379
    if (index < 0 || index >= len) {
380
        PyErr_SetNone(PyExc_IndexError);
381
        return NULL;
382
    }
383
    const char *name = _PyUOpName(self->trace[index].opcode);
384
    if (name == NULL) {
385
        name = "<nil>";
386
    }
387
    PyObject *oname = _PyUnicode_FromASCII(name, strlen(name));
388
    if (oname == NULL) {
389
        return NULL;
390
    }
391
    PyObject *oparg = PyLong_FromUnsignedLong(self->trace[index].oparg);
392
    if (oparg == NULL) {
393
        Py_DECREF(oname);
394
        return NULL;
395
    }
396
    PyObject *target = PyLong_FromUnsignedLong(self->trace[index].target);
397
    if (target == NULL) {
398
        Py_DECREF(oparg);
399
        Py_DECREF(oname);
400
        return NULL;
401
    }
402
    PyObject *operand = PyLong_FromUnsignedLongLong(self->trace[index].operand0);
403
    if (operand == NULL) {
404
        Py_DECREF(target);
405
        Py_DECREF(oparg);
406
        Py_DECREF(oname);
407
        return NULL;
408
    }
409
    PyObject *args[4] = { oname, oparg, target, operand };
410
    return _PyTuple_FromArraySteal(args, 4);
411
}
412
413
PySequenceMethods uop_as_sequence = {
414
    .sq_length = uop_len,
415
    .sq_item = uop_item,
416
};
417
418
static int
419
executor_traverse(PyObject *o, visitproc visit, void *arg)
420
{
421
    _PyExecutorObject *executor = _PyExecutorObject_CAST(o);
422
    for (uint32_t i = 0; i < executor->exit_count; i++) {
423
        Py_VISIT(executor->exits[i].executor);
424
    }
425
    return 0;
426
}
427
428
static PyObject *
429
get_jit_code(PyObject *self, PyObject *Py_UNUSED(ignored))
430
{
431
#ifndef _Py_JIT
432
    PyErr_SetString(PyExc_RuntimeError, "JIT support not enabled.");
433
    return NULL;
434
#else
435
    _PyExecutorObject *executor = _PyExecutorObject_CAST(self);
436
    if (executor->jit_code == NULL || executor->jit_size == 0) {
437
        Py_RETURN_NONE;
438
    }
439
    return PyBytes_FromStringAndSize(executor->jit_code, executor->jit_size);
440
#endif
441
}
442
443
static PyMethodDef uop_executor_methods[] = {
444
    { "is_valid", is_valid, METH_NOARGS, NULL },
445
    { "get_jit_code", get_jit_code, METH_NOARGS, NULL},
446
    { "get_opcode", get_opcode, METH_NOARGS, NULL },
447
    { "get_oparg", get_oparg, METH_NOARGS, NULL },
448
    { NULL, NULL },
449
};
450
451
static int
452
executor_is_gc(PyObject *o)
453
{
454
    return !_Py_IsImmortal(o);
455
}
456
457
PyTypeObject _PyUOpExecutor_Type = {
458
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
459
    .tp_name = "uop_executor",
460
    .tp_basicsize = offsetof(_PyExecutorObject, exits),
461
    .tp_itemsize = 1,
462
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC,
463
    .tp_dealloc = uop_dealloc,
464
    .tp_as_sequence = &uop_as_sequence,
465
    .tp_methods = uop_executor_methods,
466
    .tp_traverse = executor_traverse,
467
    .tp_clear = executor_clear,
468
    .tp_is_gc = executor_is_gc,
469
};
470
471
/* TO DO -- Generate these tables */
472
static const uint16_t
473
_PyUOp_Replacements[MAX_UOP_ID + 1] = {
474
    [_ITER_JUMP_RANGE] = _GUARD_NOT_EXHAUSTED_RANGE,
475
    [_ITER_JUMP_LIST] = _GUARD_NOT_EXHAUSTED_LIST,
476
    [_ITER_JUMP_TUPLE] = _GUARD_NOT_EXHAUSTED_TUPLE,
477
    [_FOR_ITER] = _FOR_ITER_TIER_TWO,
478
    [_ITER_NEXT_LIST] = _ITER_NEXT_LIST_TIER_TWO,
479
    [_CHECK_PERIODIC_AT_END] = _TIER2_RESUME_CHECK,
480
};
481
482
static const uint8_t
483
is_for_iter_test[MAX_UOP_ID + 1] = {
484
    [_GUARD_NOT_EXHAUSTED_RANGE] = 1,
485
    [_GUARD_NOT_EXHAUSTED_LIST] = 1,
486
    [_GUARD_NOT_EXHAUSTED_TUPLE] = 1,
487
    [_FOR_ITER_TIER_TWO] = 1,
488
};
489
490
static const uint16_t
491
BRANCH_TO_GUARD[4][2] = {
492
    [POP_JUMP_IF_FALSE - POP_JUMP_IF_FALSE][0] = _GUARD_IS_TRUE_POP,
493
    [POP_JUMP_IF_FALSE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_FALSE_POP,
494
    [POP_JUMP_IF_TRUE - POP_JUMP_IF_FALSE][0] = _GUARD_IS_FALSE_POP,
495
    [POP_JUMP_IF_TRUE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_TRUE_POP,
496
    [POP_JUMP_IF_NONE - POP_JUMP_IF_FALSE][0] = _GUARD_IS_NOT_NONE_POP,
497
    [POP_JUMP_IF_NONE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_NONE_POP,
498
    [POP_JUMP_IF_NOT_NONE - POP_JUMP_IF_FALSE][0] = _GUARD_IS_NONE_POP,
499
    [POP_JUMP_IF_NOT_NONE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_NOT_NONE_POP,
500
};
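/* Illustrative sketch (editor's addition, not part of optimizer.c): the table
 * above is indexed by (branch opcode - POP_JUMP_IF_FALSE, jump taken),
 * mirroring its use in the tracer further down. For example, a
 * POP_JUMP_IF_TRUE that was taken while tracing selects _GUARD_IS_TRUE_POP,
 * so a later run that pops a false value exits the trace instead of silently
 * following the recorded jump. */
static uint16_t
branch_guard_example(int opcode, int jump_happened)
{
    return BRANCH_TO_GUARD[opcode - POP_JUMP_IF_FALSE][jump_happened];
}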
501
502
static const uint16_t
503
guard_ip_uop[MAX_UOP_ID + 1] = {
504
    [_PUSH_FRAME] = _GUARD_IP__PUSH_FRAME,
505
    [_RETURN_GENERATOR] = _GUARD_IP_RETURN_GENERATOR,
506
    [_RETURN_VALUE] = _GUARD_IP_RETURN_VALUE,
507
    [_YIELD_VALUE] = _GUARD_IP_YIELD_VALUE,
508
};
509
510
511
#define CONFIDENCE_RANGE 1000
512
#define CONFIDENCE_CUTOFF 333
513
514
#ifdef Py_DEBUG
515
#define DPRINTF(level, ...) \
516
    if (lltrace >= (level)) { printf(__VA_ARGS__); }
517
#else
518
#define DPRINTF(level, ...)
519
#endif
520
521
522
static inline int
523
add_to_trace(
524
    _PyUOpInstruction *trace,
525
    int trace_length,
526
    uint16_t opcode,
527
    uint16_t oparg,
528
    uint64_t operand,
529
    uint32_t target)
530
{
531
    trace[trace_length].opcode = opcode;
532
    trace[trace_length].format = UOP_FORMAT_TARGET;
533
    trace[trace_length].target = target;
534
    trace[trace_length].oparg = oparg;
535
    trace[trace_length].operand0 = operand;
536
#ifdef Py_STATS
537
    trace[trace_length].execution_count = 0;
538
#endif
539
    return trace_length + 1;
540
}
541
542
#ifdef Py_DEBUG
543
#define ADD_TO_TRACE(OPCODE, OPARG, OPERAND, TARGET) \
544
    assert(trace_length < max_length); \
545
    trace_length = add_to_trace(trace, trace_length, (OPCODE), (OPARG), (OPERAND), (TARGET)); \
546
    if (lltrace >= 2) { \
547
        printf("%4d ADD_TO_TRACE: ", trace_length); \
548
        _PyUOpPrint(&trace[trace_length-1]); \
549
        printf("\n"); \
550
    }
551
#else
552
#define ADD_TO_TRACE(OPCODE, OPARG, OPERAND, TARGET) \
553
    assert(trace_length < max_length); \
554
    trace_length = add_to_trace(trace, trace_length, (OPCODE), (OPARG), (OPERAND), (TARGET));
555
#endif
556
557
#define INSTR_IP(INSTR, CODE) \
558
    ((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive)))
559
560
// Reserve space for n uops
561
#define RESERVE_RAW(n, opname) \
562
    if (trace_length + (n) > max_length) { \
563
        DPRINTF(2, "No room for %s (need %d, got %d)\n", \
564
                (opname), (n), max_length - trace_length); \
565
        OPT_STAT_INC(trace_too_long); \
566
        goto full; \
567
    }
568
569
570
/* Returns 1 on success (added to trace), 0 on trace end.
571
 */
572
int
573
_PyJit_translate_single_bytecode_to_trace(
574
    PyThreadState *tstate,
575
    _PyInterpreterFrame *frame,
576
    _Py_CODEUNIT *next_instr,
577
    int stop_tracing_opcode)
578
{
579
580
#ifdef Py_DEBUG
581
    char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
582
    int lltrace = 0;
583
    if (python_lltrace != NULL && *python_lltrace >= '0') {
584
        lltrace = *python_lltrace - '0';  // TODO: Parse an int and all that
585
    }
586
#endif
587
    _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
588
    PyCodeObject *old_code = _tstate->jit_tracer_state.prev_state.instr_code;
589
    bool progress_needed = (_tstate->jit_tracer_state.initial_state.chain_depth % MAX_CHAIN_DEPTH) == 0;
590
    _PyBloomFilter *dependencies = &_tstate->jit_tracer_state.prev_state.dependencies;
591
    int trace_length = _tstate->jit_tracer_state.prev_state.code_curr_size;
592
    _PyUOpInstruction *trace = _tstate->jit_tracer_state.code_buffer;
593
    int max_length = _tstate->jit_tracer_state.prev_state.code_max_size;
594
595
    _Py_CODEUNIT *this_instr =  _tstate->jit_tracer_state.prev_state.instr;
596
    _Py_CODEUNIT *target_instr = this_instr;
597
    uint32_t target = 0;
598
599
    target = Py_IsNone((PyObject *)old_code)
600
        ? (int)(target_instr - _Py_INTERPRETER_TRAMPOLINE_INSTRUCTIONS_PTR)
601
        : INSTR_IP(target_instr, old_code);
602
603
    // Rewind EXTENDED_ARG so that we see the whole thing.
604
    // We must point to the first EXTENDED_ARG when deopting.
605
    int oparg = _tstate->jit_tracer_state.prev_state.instr_oparg;
606
    int opcode = this_instr->op.code;
607
    int rewind_oparg = oparg;
608
    while (rewind_oparg > 255) {
609
        rewind_oparg >>= 8;
610
        target--;
611
    }
612
613
    int old_stack_level = _tstate->jit_tracer_state.prev_state.instr_stacklevel;
614
615
    // Strange control-flow
616
    bool has_dynamic_jump_taken = OPCODE_HAS_UNPREDICTABLE_JUMP(opcode) &&
617
        (next_instr != this_instr + 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]]);
618
619
    /* Special case the first instruction,
620
    * so that we can guarantee forward progress */
621
    if (progress_needed && _tstate->jit_tracer_state.prev_state.code_curr_size < CODE_SIZE_NO_PROGRESS) {
622
        if (OPCODE_HAS_EXIT(opcode) || OPCODE_HAS_DEOPT(opcode)) {
623
            opcode = _PyOpcode_Deopt[opcode];
624
        }
625
        assert(!OPCODE_HAS_EXIT(opcode));
626
        assert(!OPCODE_HAS_DEOPT(opcode));
627
    }
628
629
    bool needs_guard_ip = OPCODE_HAS_NEEDS_GUARD_IP(opcode);
630
    if (has_dynamic_jump_taken && !needs_guard_ip) {
631
        DPRINTF(2, "Unsupported: dynamic jump taken %s\n", _PyOpcode_OpName[opcode]);
632
        goto unsupported;
633
    }
634
635
    int is_sys_tracing = (tstate->c_tracefunc != NULL) || (tstate->c_profilefunc != NULL);
636
    if (is_sys_tracing) {
637
        goto full;
638
    }
639
640
    if (stop_tracing_opcode != 0) {
641
        ADD_TO_TRACE(stop_tracing_opcode, 0, 0, target);
642
        goto done;
643
    }
644
645
    DPRINTF(2, "%p %d: %s(%d) %d %d\n", old_code, target, _PyOpcode_OpName[opcode], oparg, needs_guard_ip, old_stack_level);
646
647
#ifdef Py_DEBUG
648
    if (oparg > 255) {
649
        assert(_Py_GetBaseCodeUnit(old_code, target).op.code == EXTENDED_ARG);
650
    }
651
#endif
652
653
    // Skip over super instructions.
654
    if (_tstate->jit_tracer_state.prev_state.instr_is_super) {
655
        _tstate->jit_tracer_state.prev_state.instr_is_super = false;
656
        return 1;
657
    }
658
659
    if (opcode == ENTER_EXECUTOR) {
660
        goto full;
661
    }
662
663
    if (!_tstate->jit_tracer_state.prev_state.dependencies_still_valid) {
664
        goto done;
665
    }
666
667
    // This happens when there is a recursive call that we can't trace, such as Python -> C -> Python calls.
668
    // If we haven't guarded the IP, then it's untraceable.
669
    if (frame != _tstate->jit_tracer_state.prev_state.instr_frame && !needs_guard_ip) {
670
        DPRINTF(2, "Unsupported: unguardable jump taken\n");
671
        goto unsupported;
672
    }
673
674
    if (oparg > 0xFFFF) {
675
        DPRINTF(2, "Unsupported: oparg too large\n");
676
        goto unsupported;
677
    }
678
679
    // TODO (gh-140277): The constituent uops use one extra stack slot. So we need to check for headroom.
680
    if (opcode == BINARY_OP_SUBSCR_GETITEM && old_stack_level + 1 > old_code->co_stacksize) {
681
        unsupported:
682
        {
683
            // Rewind to previous instruction and replace with _EXIT_TRACE.
684
            _PyUOpInstruction *curr = &trace[trace_length-1];
685
            while (curr->opcode != _SET_IP && trace_length > 2) {
686
                trace_length--;
687
                curr = &trace[trace_length-1];
688
            }
689
            assert(curr->opcode == _SET_IP || trace_length == 2);
690
            if (curr->opcode == _SET_IP) {
691
                int32_t old_target = (int32_t)uop_get_target(curr);
692
                curr++;
693
                trace_length++;
694
                curr->opcode = _EXIT_TRACE;
695
                curr->format = UOP_FORMAT_TARGET;
696
                curr->target = old_target;
697
            }
698
            goto done;
699
        }
700
    }
701
702
    if (opcode == NOP) {
703
        return 1;
704
    }
705
706
    if (opcode == JUMP_FORWARD) {
707
        return 1;
708
    }
709
710
    if (opcode == EXTENDED_ARG) {
711
        return 1;
712
    }
713
714
    // One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT
715
    max_length -= 2;
716
717
    const struct opcode_macro_expansion *expansion = &_PyOpcode_macro_expansion[opcode];
718
719
    assert(opcode != ENTER_EXECUTOR && opcode != EXTENDED_ARG);
720
    assert(!_PyErr_Occurred(tstate));
721
722
723
    if (OPCODE_HAS_EXIT(opcode)) {
724
        // Make space for side exit and final _EXIT_TRACE:
725
        max_length--;
726
    }
727
    if (OPCODE_HAS_ERROR(opcode)) {
728
        // Make space for error stub and final _EXIT_TRACE:
729
        max_length--;
730
    }
731
732
    // _GUARD_IP leads to an exit.
733
    max_length -= needs_guard_ip;
734
735
    RESERVE_RAW(expansion->nuops + needs_guard_ip + 2 + (!OPCODE_HAS_NO_SAVE_IP(opcode)), "uop and various checks");
736
737
    ADD_TO_TRACE(_CHECK_VALIDITY, 0, 0, target);
738
739
    if (!OPCODE_HAS_NO_SAVE_IP(opcode)) {
740
        ADD_TO_TRACE(_SET_IP, 0, (uintptr_t)target_instr, target);
741
    }
742
743
    // Can be NULL for the entry frame.
744
    if (old_code != NULL) {
745
        _Py_BloomFilter_Add(dependencies, old_code);
746
    }
747
748
    switch (opcode) {
749
        case POP_JUMP_IF_NONE:
750
        case POP_JUMP_IF_NOT_NONE:
751
        case POP_JUMP_IF_FALSE:
752
        case POP_JUMP_IF_TRUE:
753
        {
754
            _Py_CODEUNIT *computed_next_instr_without_modifiers = target_instr + 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]];
755
            _Py_CODEUNIT *computed_next_instr = computed_next_instr_without_modifiers + (computed_next_instr_without_modifiers->op.code == NOT_TAKEN);
756
            _Py_CODEUNIT *computed_jump_instr = computed_next_instr_without_modifiers + oparg;
757
            assert(next_instr == computed_next_instr || next_instr == computed_jump_instr);
758
            int jump_happened = computed_jump_instr == next_instr;
759
            assert(jump_happened == (target_instr[1].cache & 1));
760
            uint32_t uopcode = BRANCH_TO_GUARD[opcode - POP_JUMP_IF_FALSE][jump_happened];
761
            ADD_TO_TRACE(uopcode, 0, 0, INSTR_IP(jump_happened ? computed_next_instr : computed_jump_instr, old_code));
762
            break;
763
        }
764
        case JUMP_BACKWARD_JIT:
765
            // This is possible as the JIT might have re-activated after it was disabled
766
        case JUMP_BACKWARD_NO_JIT:
767
        case JUMP_BACKWARD:
768
            ADD_TO_TRACE(_CHECK_PERIODIC, 0, 0, target);
769
            _Py_FALLTHROUGH;
770
        case JUMP_BACKWARD_NO_INTERRUPT:
771
        {
772
            if ((next_instr != _tstate->jit_tracer_state.initial_state.close_loop_instr) &&
773
                (next_instr != _tstate->jit_tracer_state.initial_state.start_instr) &&
774
                _tstate->jit_tracer_state.prev_state.code_curr_size > CODE_SIZE_NO_PROGRESS &&
775
                // For side exits, we don't want to terminate them early.
776
                _tstate->jit_tracer_state.initial_state.exit == NULL &&
777
                // These are coroutines, and we want to unroll those usually.
778
                opcode != JUMP_BACKWARD_NO_INTERRUPT) {
779
                // We encountered a JUMP_BACKWARD but not to the top of our own loop.
780
                // We don't want to continue tracing as we might get stuck in the
781
                // inner loop. Instead, end the trace where the executor of the
782
                // inner loop might start and let the traces rejoin.
783
                OPT_STAT_INC(inner_loop);
784
                ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
785
                trace[trace_length-1].operand1 = true; // is_control_flow
786
                DPRINTF(2, "JUMP_BACKWARD not to top ends trace %p %p %p\n", next_instr,
787
                    _tstate->jit_tracer_state.initial_state.close_loop_instr, _tstate->jit_tracer_state.initial_state.start_instr);
788
                goto done;
789
            }
790
            break;
791
        }
792
793
        case RESUME:
794
        case RESUME_CHECK:
795
            /* Use a special tier 2 version of RESUME_CHECK to allow traces to
796
             *  start with RESUME_CHECK */
797
            ADD_TO_TRACE(_TIER2_RESUME_CHECK, 0, 0, target);
798
            break;
799
        default:
800
        {
801
            const struct opcode_macro_expansion *expansion = &_PyOpcode_macro_expansion[opcode];
802
            // Reserve space for nuops (+ _SET_IP + _EXIT_TRACE)
803
            int nuops = expansion->nuops;
804
            if (nuops == 0) {
805
                DPRINTF(2, "Unsupported opcode %s\n", _PyOpcode_OpName[opcode]);
806
                goto unsupported;
807
            }
808
            assert(nuops > 0);
809
            uint32_t orig_oparg = oparg;  // For OPARG_TOP/BOTTOM
810
            uint32_t orig_target = target;
811
            for (int i = 0; i < nuops; i++) {
812
                oparg = orig_oparg;
813
                target = orig_target;
814
                uint32_t uop = expansion->uops[i].uop;
815
                uint64_t operand = 0;
816
                // Add one to account for the actual opcode/oparg pair:
817
                int offset = expansion->uops[i].offset + 1;
818
                switch (expansion->uops[i].size) {
819
                    case OPARG_SIMPLE:
820
                        assert(opcode != _JUMP_BACKWARD_NO_INTERRUPT && opcode != JUMP_BACKWARD);
821
                        break;
822
                    case OPARG_CACHE_1:
823
                        operand = read_u16(&this_instr[offset].cache);
824
                        break;
825
                    case OPARG_CACHE_2:
826
                        operand = read_u32(&this_instr[offset].cache);
827
                        break;
828
                    case OPARG_CACHE_4:
829
                        operand = read_u64(&this_instr[offset].cache);
830
                        break;
831
                    case OPARG_TOP:  // First half of super-instr
832
                        assert(orig_oparg <= 255);
833
                        oparg = orig_oparg >> 4;
834
                        break;
835
                    case OPARG_BOTTOM:  // Second half of super-instr
836
                        assert(orig_oparg <= 255);
837
                        oparg = orig_oparg & 0xF;
838
                        break;
839
                    case OPARG_SAVE_RETURN_OFFSET:  // op=_SAVE_RETURN_OFFSET; oparg=return_offset
840
                        oparg = offset;
841
                        assert(uop == _SAVE_RETURN_OFFSET);
842
                        break;
843
                    case OPARG_REPLACED:
844
                        uop = _PyUOp_Replacements[uop];
845
                        assert(uop != 0);
846
847
                        uint32_t next_inst = target + 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]];
848
                        if (uop == _TIER2_RESUME_CHECK) {
849
                            target = next_inst;
850
                        }
851
                        else {
852
                            int extended_arg = orig_oparg > 255;
853
                            uint32_t jump_target = next_inst + orig_oparg + extended_arg;
854
                            assert(_Py_GetBaseCodeUnit(old_code, jump_target).op.code == END_FOR);
855
                            assert(_Py_GetBaseCodeUnit(old_code, jump_target+1).op.code == POP_ITER);
856
                            if (is_for_iter_test[uop]) {
857
                                target = jump_target + 1;
858
                            }
859
                        }
860
                        break;
861
                    case OPERAND1_1:
862
                        assert(trace[trace_length-1].opcode == uop);
863
                        operand = read_u16(&this_instr[offset].cache);
864
                        trace[trace_length-1].operand1 = operand;
865
                        continue;
866
                    case OPERAND1_2:
867
                        assert(trace[trace_length-1].opcode == uop);
868
                        operand = read_u32(&this_instr[offset].cache);
869
                        trace[trace_length-1].operand1 = operand;
870
                        continue;
871
                    case OPERAND1_4:
872
                        assert(trace[trace_length-1].opcode == uop);
873
                        operand = read_u64(&this_instr[offset].cache);
874
                        trace[trace_length-1].operand1 = operand;
875
                        continue;
876
                    default:
877
                        fprintf(stderr,
878
                                "opcode=%d, oparg=%d; nuops=%d, i=%d; size=%d, offset=%d\n",
879
                                opcode, oparg, nuops, i,
880
                                expansion->uops[i].size,
881
                                expansion->uops[i].offset);
882
                        Py_FatalError("garbled expansion");
883
                }
884
                if (uop == _PUSH_FRAME || uop == _RETURN_VALUE || uop == _RETURN_GENERATOR || uop == _YIELD_VALUE) {
885
                    PyCodeObject *new_code = (PyCodeObject *)PyStackRef_AsPyObjectBorrow(frame->f_executable);
886
                    PyFunctionObject *new_func = (PyFunctionObject *)PyStackRef_AsPyObjectBorrow(frame->f_funcobj);
887
888
                    operand = 0;
889
                    if (frame->owner < FRAME_OWNED_BY_INTERPRETER) {
890
                        // Don't add nested code objects to the dependency.
891
                        // It causes endless re-traces.
892
                        if (new_func != NULL && !Py_IsNone((PyObject*)new_func) && !(new_code->co_flags & CO_NESTED)) {
893
                            operand = (uintptr_t)new_func;
894
                            DPRINTF(2, "Adding %p func to op\n", (void *)operand);
895
                            _Py_BloomFilter_Add(dependencies, new_func);
896
                        }
897
                        else if (new_code != NULL && !Py_IsNone((PyObject*)new_code)) {
898
                            operand = (uintptr_t)new_code | 1;
899
                            DPRINTF(2, "Adding %p code to op\n", (void *)operand);
900
                            _Py_BloomFilter_Add(dependencies, new_code);
901
                        }
902
                    }
903
                    ADD_TO_TRACE(uop, oparg, operand, target);
904
                    trace[trace_length - 1].operand1 = PyStackRef_IsNone(frame->f_executable) ? 2 : ((int)(frame->stackpointer - _PyFrame_Stackbase(frame)));
905
                    break;
906
                }
907
                if (uop == _BINARY_OP_INPLACE_ADD_UNICODE) {
908
                    assert(i + 1 == nuops);
909
                    _Py_CODEUNIT *next = target_instr + 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]];
910
                    assert(next->op.code == STORE_FAST);
911
                    operand = next->op.arg;
912
                }
913
                // All other instructions
914
                ADD_TO_TRACE(uop, oparg, operand, target);
915
            }
916
            break;
917
        }  // End default
918
919
    }  // End switch (opcode)
920
921
    if (needs_guard_ip) {
922
        uint16_t guard_ip = guard_ip_uop[trace[trace_length-1].opcode];
923
        if (guard_ip == 0) {
924
            DPRINTF(1, "Unknown uop needing guard ip %s\n", _PyOpcode_uop_name[trace[trace_length-1].opcode]);
925
            Py_UNREACHABLE();
926
        }
927
        ADD_TO_TRACE(guard_ip, 0, (uintptr_t)next_instr, 0);
928
    }
929
    // Loop back to the start
930
    int is_first_instr = _tstate->jit_tracer_state.initial_state.close_loop_instr == next_instr ||
931
        _tstate->jit_tracer_state.initial_state.start_instr == next_instr;
932
    if (is_first_instr && _tstate->jit_tracer_state.prev_state.code_curr_size > CODE_SIZE_NO_PROGRESS) {
933
        if (needs_guard_ip) {
934
            ADD_TO_TRACE(_SET_IP, 0, (uintptr_t)next_instr, 0);
935
        }
936
        ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0);
937
        goto done;
938
    }
939
    DPRINTF(2, "Trace continuing\n");
940
    _tstate->jit_tracer_state.prev_state.code_curr_size = trace_length;
941
    _tstate->jit_tracer_state.prev_state.code_max_size = max_length;
942
    return 1;
943
done:
944
    DPRINTF(2, "Trace done\n");
945
    _tstate->jit_tracer_state.prev_state.code_curr_size = trace_length;
946
    _tstate->jit_tracer_state.prev_state.code_max_size = max_length;
947
    return 0;
948
full:
949
    DPRINTF(2, "Trace full\n");
950
    if (!is_terminator(&_tstate->jit_tracer_state.code_buffer[trace_length-1])) {
951
        // Undo the last few instructions.
952
        trace_length = _tstate->jit_tracer_state.prev_state.code_curr_size;
953
        max_length = _tstate->jit_tracer_state.prev_state.code_max_size;
954
        // We previously reserved one.
955
        max_length += 1;
956
        ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
957
        trace[trace_length-1].operand1 = true; // is_control_flow
958
    }
959
    _tstate->jit_tracer_state.prev_state.code_curr_size = trace_length;
960
    _tstate->jit_tracer_state.prev_state.code_max_size = max_length;
961
    return 0;
962
}
963
964
// Returns 0 if tracing should not be entered, 1 if tracing was entered.
965
int
966
_PyJit_TryInitializeTracing(
967
    PyThreadState *tstate, _PyInterpreterFrame *frame, _Py_CODEUNIT *curr_instr,
968
    _Py_CODEUNIT *start_instr, _Py_CODEUNIT *close_loop_instr, int curr_stackdepth, int chain_depth,
969
    _PyExitData *exit, int oparg)
970
{
971
    _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
972
    // A recursive trace.
973
    // Don't trace into the inner call because it will stomp on the previous trace, causing endless retraces.
974
    if (_tstate->jit_tracer_state.prev_state.code_curr_size > CODE_SIZE_EMPTY) {
975
        return 0;
976
    }
977
    if (oparg > 0xFFFF) {
978
        return 0;
979
    }
980
    if (_tstate->jit_tracer_state.code_buffer == NULL) {
981
        _tstate->jit_tracer_state.code_buffer = (_PyUOpInstruction *)_PyObject_VirtualAlloc(UOP_BUFFER_SIZE);
982
        if (_tstate->jit_tracer_state.code_buffer == NULL) {
983
            // Don't error, just go to next instruction.
984
            return 0;
985
        }
986
    }
987
    PyObject *func = PyStackRef_AsPyObjectBorrow(frame->f_funcobj);
988
    if (func == NULL) {
989
        return 0;
990
    }
991
    PyCodeObject *code = _PyFrame_GetCode(frame);
992
#ifdef Py_DEBUG
993
    char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
994
    int lltrace = 0;
995
    if (python_lltrace != NULL && *python_lltrace >= '0') {
996
        lltrace = *python_lltrace - '0';  // TODO: Parse an int and all that
997
    }
998
    DPRINTF(2,
999
        "Tracing %s (%s:%d) at byte offset %d at chain depth %d\n",
1000
        PyUnicode_AsUTF8(code->co_qualname),
1001
        PyUnicode_AsUTF8(code->co_filename),
1002
        code->co_firstlineno,
1003
        2 * INSTR_IP(close_loop_instr, code),
1004
        chain_depth);
1005
#endif
1006
1007
    add_to_trace(_tstate->jit_tracer_state.code_buffer, 0, _START_EXECUTOR, 0, (uintptr_t)start_instr, INSTR_IP(start_instr, code));
1008
    add_to_trace(_tstate->jit_tracer_state.code_buffer, 1, _MAKE_WARM, 0, 0, 0);
1009
    _tstate->jit_tracer_state.prev_state.code_curr_size = CODE_SIZE_EMPTY;
1010
1011
    _tstate->jit_tracer_state.prev_state.code_max_size = UOP_MAX_TRACE_LENGTH;
1012
    _tstate->jit_tracer_state.initial_state.start_instr = start_instr;
1013
    _tstate->jit_tracer_state.initial_state.close_loop_instr = close_loop_instr;
1014
    _tstate->jit_tracer_state.initial_state.code = (PyCodeObject *)Py_NewRef(code);
1015
    _tstate->jit_tracer_state.initial_state.func = (PyFunctionObject *)Py_NewRef(func);
1016
    _tstate->jit_tracer_state.initial_state.exit = exit;
1017
    _tstate->jit_tracer_state.initial_state.stack_depth = curr_stackdepth;
1018
    _tstate->jit_tracer_state.initial_state.chain_depth = chain_depth;
1019
    _tstate->jit_tracer_state.prev_state.instr_frame = frame;
1020
    _tstate->jit_tracer_state.prev_state.dependencies_still_valid = true;
1021
    _tstate->jit_tracer_state.prev_state.instr_code = (PyCodeObject *)Py_NewRef(_PyFrame_GetCode(frame));
1022
    _tstate->jit_tracer_state.prev_state.instr = curr_instr;
1023
    _tstate->jit_tracer_state.prev_state.instr_frame = frame;
1024
    _tstate->jit_tracer_state.prev_state.instr_oparg = oparg;
1025
    _tstate->jit_tracer_state.prev_state.instr_stacklevel = curr_stackdepth;
1026
    _tstate->jit_tracer_state.prev_state.instr_is_super = false;
1027
    assert(curr_instr->op.code == JUMP_BACKWARD_JIT || (exit != NULL));
1028
    _tstate->jit_tracer_state.initial_state.jump_backward_instr = curr_instr;
1029
1030
    if (_PyOpcode_Caches[_PyOpcode_Deopt[close_loop_instr->op.code]]) {
1031
        close_loop_instr[1].counter = trigger_backoff_counter();
1032
    }
1033
    _Py_BloomFilter_Init(&_tstate->jit_tracer_state.prev_state.dependencies);
1034
    return 1;
1035
}
1036
1037
void
1038
_PyJit_FinalizeTracing(PyThreadState *tstate)
1039
{
1040
    _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
1041
    Py_CLEAR(_tstate->jit_tracer_state.initial_state.code);
1042
    Py_CLEAR(_tstate->jit_tracer_state.initial_state.func);
1043
    Py_CLEAR(_tstate->jit_tracer_state.prev_state.instr_code);
1044
    _tstate->jit_tracer_state.prev_state.code_curr_size = CODE_SIZE_EMPTY;
1045
    _tstate->jit_tracer_state.prev_state.code_max_size = UOP_MAX_TRACE_LENGTH - 1;
1046
}
1047
1048
1049
#undef RESERVE
1050
#undef RESERVE_RAW
1051
#undef INSTR_IP
1052
#undef ADD_TO_TRACE
1053
#undef DPRINTF
1054
1055
#define UNSET_BIT(array, bit) (array[(bit)>>5] &= ~(1<<((bit)&31)))
1056
#define SET_BIT(array, bit) (array[(bit)>>5] |= (1<<((bit)&31)))
1057
#define BIT_IS_SET(array, bit) (array[(bit)>>5] & (1<<((bit)&31)))
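/* Illustrative sketch (editor's addition, not part of optimizer.c): the macros
 * above treat a flat uint32_t array as a bit set: bit>>5 selects the word and
 * bit&31 the position within it. For example, bit 37 is bit 5 of word 1. */
static void
bit_macro_example(void)
{
    uint32_t words[8] = {0};
    SET_BIT(words, 37);                /* words[1] |= 1 << 5 */
    assert(BIT_IS_SET(words, 37));
    UNSET_BIT(words, 37);
    assert(!BIT_IS_SET(words, 37));
}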
1058
1059
/* Count the number of exit uops (_EXIT_TRACE and _DYNAMIC_EXIT)
1060
*/
1061
static int
1062
count_exits(_PyUOpInstruction *buffer, int length)
1063
{
1064
    int exit_count = 0;
1065
    for (int i = 0; i < length; i++) {
1066
        int opcode = buffer[i].opcode;
1067
        if (opcode == _EXIT_TRACE || opcode == _DYNAMIC_EXIT) {
1068
            exit_count++;
1069
        }
1070
    }
1071
    return exit_count;
1072
}
1073
1074
static void make_exit(_PyUOpInstruction *inst, int opcode, int target, bool is_control_flow)
1075
{
1076
    inst->opcode = opcode;
1077
    inst->oparg = 0;
1078
    inst->operand0 = 0;
1079
    inst->format = UOP_FORMAT_TARGET;
1080
    inst->target = target;
1081
    inst->operand1 = is_control_flow;
1082
#ifdef Py_STATS
1083
    inst->execution_count = 0;
1084
#endif
1085
}
1086
1087
/* Convert implicit exits, errors and deopts
1088
 * into explicit ones. */
1089
static int
1090
prepare_for_execution(_PyUOpInstruction *buffer, int length)
1091
{
1092
    int32_t current_jump = -1;
1093
    int32_t current_jump_target = -1;
1094
    int32_t current_error = -1;
1095
    int32_t current_error_target = -1;
1096
    int32_t current_popped = -1;
1097
    int32_t current_exit_op = -1;
1098
    /* Leaving in NOPs slows down the interpreter and messes up the stats */
1099
    _PyUOpInstruction *copy_to = &buffer[0];
1100
    for (int i = 0; i < length; i++) {
1101
        _PyUOpInstruction *inst = &buffer[i];
1102
        if (inst->opcode != _NOP) {
1103
            if (copy_to != inst) {
1104
                *copy_to = *inst;
1105
            }
1106
            copy_to++;
1107
        }
1108
    }
1109
    length = (int)(copy_to - buffer);
1110
    int next_spare = length;
1111
    for (int i = 0; i < length; i++) {
1112
        _PyUOpInstruction *inst = &buffer[i];
1113
        int opcode = inst->opcode;
1114
        int32_t target = (int32_t)uop_get_target(inst);
1115
        uint16_t exit_flags = _PyUop_Flags[opcode] & (HAS_EXIT_FLAG | HAS_DEOPT_FLAG | HAS_PERIODIC_FLAG);
1116
        if (exit_flags) {
1117
            uint16_t exit_op = _EXIT_TRACE;
1118
            if (exit_flags & HAS_DEOPT_FLAG) {
1119
                exit_op = _DEOPT;
1120
            }
1121
            else if (exit_flags & HAS_PERIODIC_FLAG) {
1122
                exit_op = _HANDLE_PENDING_AND_DEOPT;
1123
            }
1124
            int32_t jump_target = target;
1125
            if (
1126
                opcode == _GUARD_IP__PUSH_FRAME ||
1127
                opcode == _GUARD_IP_RETURN_VALUE ||
1128
                opcode == _GUARD_IP_YIELD_VALUE ||
1129
                opcode == _GUARD_IP_RETURN_GENERATOR
1130
            ) {
1131
                exit_op = _DYNAMIC_EXIT;
1132
            }
1133
            bool is_control_flow = (opcode == _GUARD_IS_FALSE_POP || opcode == _GUARD_IS_TRUE_POP || is_for_iter_test[opcode]);
1134
            if (jump_target != current_jump_target || current_exit_op != exit_op) {
1135
                make_exit(&buffer[next_spare], exit_op, jump_target, is_control_flow);
1136
                current_exit_op = exit_op;
1137
                current_jump_target = jump_target;
1138
                current_jump = next_spare;
1139
                next_spare++;
1140
            }
1141
            buffer[i].jump_target = current_jump;
1142
            buffer[i].format = UOP_FORMAT_JUMP;
1143
        }
1144
        if (_PyUop_Flags[opcode] & HAS_ERROR_FLAG) {
1145
            int popped = (_PyUop_Flags[opcode] & HAS_ERROR_NO_POP_FLAG) ?
1146
                0 : _PyUop_num_popped(opcode, inst->oparg);
1147
            if (target != current_error_target || popped != current_popped) {
1148
                current_popped = popped;
1149
                current_error = next_spare;
1150
                current_error_target = target;
1151
                make_exit(&buffer[next_spare], _ERROR_POP_N, 0, false);
1152
                buffer[next_spare].operand0 = target;
1153
                next_spare++;
1154
            }
1155
            buffer[i].error_target = current_error;
1156
            if (buffer[i].format == UOP_FORMAT_TARGET) {
1157
                buffer[i].format = UOP_FORMAT_JUMP;
1158
                buffer[i].jump_target = 0;
1159
            }
1160
        }
1161
        if (opcode == _JUMP_TO_TOP) {
1162
            assert(buffer[0].opcode == _START_EXECUTOR);
1163
            buffer[i].format = UOP_FORMAT_JUMP;
1164
            buffer[i].jump_target = 1;
1165
        }
1166
    }
1167
    return next_spare;
1168
}
1169
1170
/* Executor side exits */
1171
1172
static _PyExecutorObject *
1173
allocate_executor(int exit_count, int length)
1174
{
1175
    int size = exit_count*sizeof(_PyExitData) + length*sizeof(_PyUOpInstruction);
1176
    _PyExecutorObject *res = PyObject_GC_NewVar(_PyExecutorObject, &_PyUOpExecutor_Type, size);
1177
    if (res == NULL) {
1178
        return NULL;
1179
    }
1180
    res->trace = (_PyUOpInstruction *)(res->exits + exit_count);
1181
    res->code_size = length;
1182
    res->exit_count = exit_count;
1183
    return res;
1184
}
1185
1186
#ifdef Py_DEBUG
1187
1188
#define CHECK(PRED) \
1189
if (!(PRED)) { \
1190
    printf(#PRED " at %d\n", i); \
1191
    assert(0); \
1192
}
1193
1194
static int
1195
target_unused(int opcode)
1196
{
1197
    return (_PyUop_Flags[opcode] & (HAS_ERROR_FLAG | HAS_EXIT_FLAG | HAS_DEOPT_FLAG)) == 0;
1198
}
1199
1200
static void
1201
sanity_check(_PyExecutorObject *executor)
1202
{
1203
    for (uint32_t i = 0; i < executor->exit_count; i++) {
1204
        _PyExitData *exit = &executor->exits[i];
1205
        CHECK(exit->target < (1 << 25));
1206
    }
1207
    bool ended = false;
1208
    uint32_t i = 0;
1209
    CHECK(executor->trace[0].opcode == _START_EXECUTOR ||
1210
        executor->trace[0].opcode == _COLD_EXIT ||
1211
        executor->trace[0].opcode == _COLD_DYNAMIC_EXIT);
1212
    for (; i < executor->code_size; i++) {
1213
        const _PyUOpInstruction *inst = &executor->trace[i];
1214
        uint16_t opcode = inst->opcode;
1215
        CHECK(opcode <= MAX_UOP_ID);
1216
        CHECK(_PyOpcode_uop_name[opcode] != NULL);
1217
        switch(inst->format) {
1218
            case UOP_FORMAT_TARGET:
1219
                CHECK(target_unused(opcode));
1220
                break;
1221
            case UOP_FORMAT_JUMP:
1222
                CHECK(inst->jump_target < executor->code_size);
1223
                break;
1224
        }
1225
        if (_PyUop_Flags[opcode] & HAS_ERROR_FLAG) {
1226
            CHECK(inst->format == UOP_FORMAT_JUMP);
1227
            CHECK(inst->error_target < executor->code_size);
1228
        }
1229
        if (is_terminator(inst)) {
1230
            ended = true;
1231
            i++;
1232
            break;
1233
        }
1234
    }
1235
    CHECK(ended);
1236
    for (; i < executor->code_size; i++) {
1237
        const _PyUOpInstruction *inst = &executor->trace[i];
1238
        uint16_t opcode = inst->opcode;
1239
        CHECK(
1240
            opcode == _DEOPT ||
1241
            opcode == _HANDLE_PENDING_AND_DEOPT ||
1242
            opcode == _EXIT_TRACE ||
1243
            opcode == _ERROR_POP_N ||
1244
            opcode == _DYNAMIC_EXIT);
1245
    }
1246
}
1247
1248
#undef CHECK
1249
#endif
1250
1251
/* Makes an executor from a buffer of uops.
1252
 * Account for the buffer having gaps and NOPs by computing a "used"
1253
 * bit vector and only copying the used uops. Here "used" means reachable
1254
 * and not a NOP.
1255
 */
1256
static _PyExecutorObject *
1257
make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFilter *dependencies, int chain_depth)
1258
{
1259
    int exit_count = count_exits(buffer, length);
1260
    _PyExecutorObject *executor = allocate_executor(exit_count, length);
1261
    if (executor == NULL) {
1262
        return NULL;
1263
    }
1264
1265
    /* Initialize exits */
1266
    _PyExecutorObject *cold = _PyExecutor_GetColdExecutor();
1267
    _PyExecutorObject *cold_dynamic = _PyExecutor_GetColdDynamicExecutor();
1268
    cold->vm_data.chain_depth = chain_depth;
1269
    for (int i = 0; i < exit_count; i++) {
1270
        executor->exits[i].index = i;
1271
        executor->exits[i].temperature = initial_temperature_backoff_counter();
1272
    }
1273
    int next_exit = exit_count-1;
1274
    _PyUOpInstruction *dest = (_PyUOpInstruction *)&executor->trace[length];
1275
    assert(buffer[0].opcode == _START_EXECUTOR);
1276
    buffer[0].operand0 = (uint64_t)executor;
1277
    for (int i = length-1; i >= 0; i--) {
1278
        int opcode = buffer[i].opcode;
1279
        dest--;
1280
        *dest = buffer[i];
1281
        if (opcode == _EXIT_TRACE || opcode == _DYNAMIC_EXIT) {
1282
            _PyExitData *exit = &executor->exits[next_exit];
1283
            exit->target = buffer[i].target;
1284
            dest->operand0 = (uint64_t)exit;
1285
            exit->executor = opcode == _EXIT_TRACE ? cold : cold_dynamic;
1286
            exit->is_dynamic = (char)(opcode == _DYNAMIC_EXIT);
1287
            exit->is_control_flow = (char)buffer[i].operand1;
1288
            next_exit--;
1289
        }
1290
    }
1291
    assert(next_exit == -1);
1292
    assert(dest == executor->trace);
1293
    assert(dest->opcode == _START_EXECUTOR);
1294
    _Py_ExecutorInit(executor, dependencies);
1295
#ifdef Py_DEBUG
1296
    char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
1297
    int lltrace = 0;
1298
    if (python_lltrace != NULL && *python_lltrace >= '0') {
1299
        lltrace = *python_lltrace - '0';  // TODO: Parse an int and all that
1300
    }
1301
    if (lltrace >= 2) {
1302
        printf("Optimized trace (length %d):\n", length);
1303
        for (int i = 0; i < length; i++) {
1304
            printf("%4d OPTIMIZED: ", i);
1305
            _PyUOpPrint(&executor->trace[i]);
1306
            printf("\n");
1307
        }
1308
    }
1309
    sanity_check(executor);
1310
#endif
1311
#ifdef _Py_JIT
1312
    executor->jit_code = NULL;
1313
    executor->jit_size = 0;
1314
    // This is initialized to true so we can prevent the executor
1315
    // from being immediately detected as cold and invalidated.
1316
    executor->vm_data.warm = true;
1317
    if (_PyJIT_Compile(executor, executor->trace, length)) {
1318
        Py_DECREF(executor);
1319
        return NULL;
1320
    }
1321
#endif
1322
    _PyObject_GC_TRACK(executor);
1323
    return executor;
1324
}
1325
1326
#ifdef Py_STATS
1327
/* Returns the effective trace length.
1328
 * Ignores NOPs and trailing exit and error handling. */
1329
int effective_trace_length(_PyUOpInstruction *buffer, int length)
1330
{
1331
    int nop_count = 0;
1332
    for (int i = 0; i < length; i++) {
1333
        int opcode = buffer[i].opcode;
1334
        if (opcode == _NOP) {
1335
            nop_count++;
1336
        }
1337
        if (is_terminator(&buffer[i])) {
1338
            return i+1-nop_count;
1339
        }
1340
    }
1341
    Py_FatalError("No terminating instruction");
1342
    Py_UNREACHABLE();
1343
}
1344
#endif
1345
1346
static int
1347
uop_optimize(
1348
    _PyInterpreterFrame *frame,
1349
    PyThreadState *tstate,
1350
    _PyExecutorObject **exec_ptr,
1351
    bool progress_needed)
1352
{
1353
    _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
1354
    _PyBloomFilter *dependencies = &_tstate->jit_tracer_state.prev_state.dependencies;
1355
    _PyUOpInstruction *buffer = _tstate->jit_tracer_state.code_buffer;
1356
    OPT_STAT_INC(attempts);
1357
    char *env_var = Py_GETENV("PYTHON_UOPS_OPTIMIZE");
1358
    bool is_noopt = true;
1359
    if (env_var == NULL || *env_var == '\0' || *env_var > '0') {
1360
        is_noopt = false;
1361
    }
1362
    int curr_stackentries = _tstate->jit_tracer_state.initial_state.stack_depth;
1363
    int length = _tstate->jit_tracer_state.prev_state.code_curr_size;
1364
    if (length <= CODE_SIZE_NO_PROGRESS) {
1365
        return 0;
1366
    }
1367
    assert(length > 0);
1368
    assert(length < UOP_MAX_TRACE_LENGTH);
1369
    OPT_STAT_INC(traces_created);
1370
    if (!is_noopt) {
1371
        length = _Py_uop_analyze_and_optimize(
1372
            _tstate->jit_tracer_state.initial_state.func,
1373
            buffer,length,
1374
            curr_stackentries, dependencies);
1375
        if (length <= 0) {
1376
            return length;
1377
        }
1378
    }
1379
    assert(length < UOP_MAX_TRACE_LENGTH);
1380
    assert(length >= 1);
1381
    /* Fix up */
1382
    for (int pc = 0; pc < length; pc++) {
1383
        int opcode = buffer[pc].opcode;
1384
        int oparg = buffer[pc].oparg;
1385
        if (oparg < _PyUop_Replication[opcode].stop && oparg >= _PyUop_Replication[opcode].start) {
1386
            buffer[pc].opcode = opcode + oparg + 1 - _PyUop_Replication[opcode].start;
1387
            assert(strncmp(_PyOpcode_uop_name[buffer[pc].opcode], _PyOpcode_uop_name[opcode], strlen(_PyOpcode_uop_name[opcode])) == 0);
1388
        }
1389
        else if (is_terminator(&buffer[pc])) {
1390
            break;
1391
        }
1392
        assert(_PyOpcode_uop_name[buffer[pc].opcode]);
1393
    }
1394
    OPT_HIST(effective_trace_length(buffer, length), optimized_trace_length_hist);
1395
    length = prepare_for_execution(buffer, length);
1396
    assert(length <= UOP_MAX_TRACE_LENGTH);
1397
    _PyExecutorObject *executor = make_executor_from_uops(
1398
        buffer, length, dependencies, _tstate->jit_tracer_state.initial_state.chain_depth);
1399
    if (executor == NULL) {
1400
        return -1;
1401
    }
1402
    assert(length <= UOP_MAX_TRACE_LENGTH);
1403
1404
    // Check executor coldness
1405
    // It's okay if this ends up going negative.
1406
    if (--tstate->interp->executor_creation_counter == 0) {
1407
        _Py_set_eval_breaker_bit(tstate, _PY_EVAL_JIT_INVALIDATE_COLD_BIT);
1408
    }
1409
1410
    *exec_ptr = executor;
1411
    return 1;
1412
}
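
As a reading aid, the PYTHON_UOPS_OPTIMIZE check near the top of uop_optimize() can be restated as a standalone predicate. This is an illustrative sketch only (the helper name is made up, and plain getenv() stands in for Py_GETENV, which also honours -E): the optimization pass runs unless the variable is set, non-empty, and starts with a character no greater than '0' (e.g. PYTHON_UOPS_OPTIMIZE=0).

#include <stdbool.h>
#include <stdlib.h>

/* Illustrative helper, not part of optimizer.c: mirrors the is_noopt
 * computation in uop_optimize(). */
static bool
uops_optimization_enabled(void)
{
    const char *env_var = getenv("PYTHON_UOPS_OPTIMIZE");
    return env_var == NULL || *env_var == '\0' || *env_var > '0';
}
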
1413
1414
1415
/*****************************************
1416
 *        Executor management
1417
 ****************************************/
1418
1419
/* We use a bloomfilter with k = 6, m = 256
1420
 * The choice of k and the following constants
1421
 * could do with a more rigorous analysis,
1422
 * but here is a simple analysis:
1423
 *
1424
 * We want to keep the false positive rate low.
1425
 * For n = 5 (a trace depends on 5 objects),
1426
 * we expect 30 bits set, giving a false positive
1427
 * rate of (30/256)**6 == 2.6e-6, which is plenty
1428
 * good enough.
1429
 *
1430
 * However with n = 10 we expect 60 bits set (worst case),
1431
 * giving a false positive rate of (60/256)**6 == 1.6e-4
1432
 *
1433
 * We choose k = 6 rather than a higher number because
1434
 * the false positive rate then grows more slowly for large n.
1435
 *
1436
 * n = 5, k = 6 => fp = 2.6e-6
1437
 * n = 5, k = 8 => fp = 3.5e-7
1438
 * n = 10, k = 6 => fp = 1.6e-4
1439
 * n = 10, k = 8 => fp = 0.9e-4
1440
 * n = 15, k = 6 => fp = 0.18%
1441
 * n = 15, k = 8 => fp = 0.23%
1442
 * n = 20, k = 6 => fp = 1.1%
1443
 * n = 20, k = 8 => fp = 2.3%
1444
 *
1445
 * The above analysis assumes perfect hash functions,
1446
 * but those don't exist, so the real false positive
1447
 * rates may be worse.
1448
 */
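
To make the figures above easy to reproduce, here is a minimal, hedged sketch of the same back-of-the-envelope model: assume the worst case of k*n distinct bits set out of m, so an unrelated key passes all k probes with probability (k*n/m)**k. The program is illustrative only and is not part of optimizer.c; its output matches the 2.6e-6, 1.6e-4 and 1.1% figures above up to rounding.

#include <math.h>
#include <stdio.h>

/* Simple-model false positive rate: k*n distinct bits set out of m,
 * and an unrelated lookup must hit a set bit on all k probes. */
static double
simple_fp_rate(int n, int k, int m)
{
    return pow((double)(k * n) / m, k);
}

int
main(void)
{
    /* m = 256 throughout, matching the filter size used here. */
    printf("n=5,  k=6: %.2e\n", simple_fp_rate(5, 6, 256));   /* 2.59e-06 */
    printf("n=10, k=6: %.2e\n", simple_fp_rate(10, 6, 256));  /* 1.66e-04 */
    printf("n=20, k=6: %.2e\n", simple_fp_rate(20, 6, 256));  /* 1.06e-02 */
    return 0;
}
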
1449
1450
#define K 6
1451
1452
#define SEED 20221211
1453
1454
/* TO DO -- Use more modern hash functions with better distribution of bits */
1455
static uint64_t
1456
address_to_hash(void *ptr) {
1457
    assert(ptr != NULL);
1458
    uint64_t uhash = SEED;
1459
    uintptr_t addr = (uintptr_t)ptr;
1460
    for (int i = 0; i < SIZEOF_VOID_P; i++) {
1461
        uhash ^= addr & 255;
1462
        uhash *= (uint64_t)PyHASH_MULTIPLIER;
1463
        addr >>= 8;
1464
    }
1465
    return uhash;
1466
}
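
The TO DO above asks for a hash with better bit distribution. Purely as an illustration of what that might look like (this is not what optimizer.c uses), a splitmix64-style finalizer applied to the pointer value spreads the address bits well and is similarly cheap:

#include <stdint.h>

/* Illustrative alternative, not part of optimizer.c: the splitmix64
 * finalizer applied to the address. */
static uint64_t
address_to_hash_splitmix64(void *ptr)
{
    uint64_t x = (uint64_t)(uintptr_t)ptr;
    x += 0x9E3779B97F4A7C15ULL;
    x = (x ^ (x >> 30)) * 0xBF58476D1CE4E5B9ULL;
    x = (x ^ (x >> 27)) * 0x94D049BB133111EBULL;
    return x ^ (x >> 31);
}
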
1467
1468
void
1469
_Py_BloomFilter_Init(_PyBloomFilter *bloom)
1470
{
1471
    for (int i = 0; i < _Py_BLOOM_FILTER_WORDS; i++) {
1472
        bloom->bits[i] = 0;
1473
    }
1474
}
1475
1476
/* We want K hash functions that each set 1 bit.
1477
 * A hash function that sets 1 bit in M bits can be trivially
1478
 * derived from a log2(M) bit hash function.
1479
 * So we extract 8 (log2(256)) bits at a time from
1480
 * the 64-bit hash. */
1481
void
1482
_Py_BloomFilter_Add(_PyBloomFilter *bloom, void *ptr)
1483
{
1484
    uint64_t hash = address_to_hash(ptr);
1485
    assert(K <= 8);
1486
    for (int i = 0; i < K; i++) {
1487
        uint8_t bits = hash & 255;
1488
        bloom->bits[bits >> 5] |= (1 << (bits&31));
1489
        hash >>= 8;
1490
    }
1491
}
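
For concreteness, here is a hedged, standalone sketch of the indexing used above: each 8-bit slice of the hash names one of the 256 filter bits, with the top three bits of the slice selecting one of the eight 32-bit words and the low five bits selecting the bit within that word. The program is illustrative only.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint32_t words[8] = {0};        /* 8 x 32 bits == 256-bit filter */
    uint8_t slice = 156;            /* one 8-bit slice of the 64-bit hash */
    words[slice >> 5] |= (uint32_t)1 << (slice & 31);
    /* 156 >> 5 == 4 (word index), 156 & 31 == 28 (bit within the word) */
    printf("set word %d, bit %d\n", slice >> 5, slice & 31);
    return 0;
}
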
1492
1493
static bool
1494
bloom_filter_may_contain(_PyBloomFilter *bloom, _PyBloomFilter *hashes)
1495
{
1496
    for (int i = 0; i < _Py_BLOOM_FILTER_WORDS; i++) {
1497
        if ((bloom->bits[i] & hashes->bits[i]) != hashes->bits[i]) {
1498
            return false;
1499
        }
1500
    }
1501
    return true;
1502
}
1503
1504
static void
1505
link_executor(_PyExecutorObject *executor)
1506
{
1507
    PyInterpreterState *interp = _PyInterpreterState_GET();
1508
    _PyExecutorLinkListNode *links = &executor->vm_data.links;
1509
    _PyExecutorObject *head = interp->executor_list_head;
1510
    if (head == NULL) {
1511
        interp->executor_list_head = executor;
1512
        links->previous = NULL;
1513
        links->next = NULL;
1514
    }
1515
    else {
1516
        assert(head->vm_data.links.previous == NULL);
1517
        links->previous = NULL;
1518
        links->next = head;
1519
        head->vm_data.links.previous = executor;
1520
        interp->executor_list_head = executor;
1521
    }
1522
    executor->vm_data.linked = true;
1523
    /* executor_list_head must be first in list */
1524
    assert(interp->executor_list_head->vm_data.links.previous == NULL);
1525
}
1526
1527
static void
1528
unlink_executor(_PyExecutorObject *executor)
1529
{
1530
    if (!executor->vm_data.linked) {
1531
        return;
1532
    }
1533
    _PyExecutorLinkListNode *links = &executor->vm_data.links;
1534
    assert(executor->vm_data.valid);
1535
    _PyExecutorObject *next = links->next;
1536
    _PyExecutorObject *prev = links->previous;
1537
    if (next != NULL) {
1538
        next->vm_data.links.previous = prev;
1539
    }
1540
    if (prev != NULL) {
1541
        prev->vm_data.links.next = next;
1542
    }
1543
    else {
1544
        // prev == NULL implies that executor is the list head
1545
        PyInterpreterState *interp = PyInterpreterState_Get();
1546
        assert(interp->executor_list_head == executor);
1547
        interp->executor_list_head = next;
1548
    }
1549
    executor->vm_data.linked = false;
1550
}
1551
1552
/* This must be called by optimizers before using the executor */
1553
void
1554
_Py_ExecutorInit(_PyExecutorObject *executor, const _PyBloomFilter *dependency_set)
1555
{
1556
    executor->vm_data.valid = true;
1557
    for (int i = 0; i < _Py_BLOOM_FILTER_WORDS; i++) {
1558
        executor->vm_data.bloom.bits[i] = dependency_set->bits[i];
1559
    }
1560
    link_executor(executor);
1561
}
1562
1563
_PyExecutorObject *
1564
_PyExecutor_GetColdExecutor(void)
1565
{
1566
    PyInterpreterState *interp = _PyInterpreterState_GET();
1567
    if (interp->cold_executor != NULL) {
1568
        return interp->cold_executor;
1569
    }
1570
    _PyExecutorObject *cold = allocate_executor(0, 1);
1571
    if (cold == NULL) {
1572
        Py_FatalError("Cannot allocate core JIT code");
1573
    }
1574
    ((_PyUOpInstruction *)cold->trace)->opcode = _COLD_EXIT;
1575
#ifdef _Py_JIT
1576
    cold->jit_code = NULL;
1577
    cold->jit_size = 0;
1578
    // This is initialized to true so we can prevent the executor
1579
    // from being immediately detected as cold and invalidated.
1580
    cold->vm_data.warm = true;
1581
    if (_PyJIT_Compile(cold, cold->trace, 1)) {
1582
        Py_DECREF(cold);
1583
        Py_FatalError("Cannot allocate core JIT code");
1584
    }
1585
#endif
1586
    _Py_SetImmortal((PyObject *)cold);
1587
    interp->cold_executor = cold;
1588
    return cold;
1589
}
1590
1591
_PyExecutorObject *
1592
_PyExecutor_GetColdDynamicExecutor(void)
1593
{
1594
    PyInterpreterState *interp = _PyInterpreterState_GET();
1595
    if (interp->cold_dynamic_executor != NULL) {
1596
        assert(interp->cold_dynamic_executor->trace[0].opcode == _COLD_DYNAMIC_EXIT);
1597
        return interp->cold_dynamic_executor;
1598
    }
1599
    _PyExecutorObject *cold = allocate_executor(0, 1);
1600
    if (cold == NULL) {
1601
        Py_FatalError("Cannot allocate core JIT code");
1602
    }
1603
    ((_PyUOpInstruction *)cold->trace)->opcode = _COLD_DYNAMIC_EXIT;
1604
#ifdef _Py_JIT
1605
    cold->jit_code = NULL;
1606
    cold->jit_size = 0;
1607
    // This is initialized to true so we can prevent the executor
1608
    // from being immediately detected as cold and invalidated.
1609
    cold->vm_data.warm = true;
1610
    if (_PyJIT_Compile(cold, cold->trace, 1)) {
1611
        Py_DECREF(cold);
1612
        Py_FatalError("Cannot allocate core JIT code");
1613
    }
1614
#endif
1615
    _Py_SetImmortal((PyObject *)cold);
1616
    interp->cold_dynamic_executor = cold;
1617
    return cold;
1618
}
1619
1620
void
1621
_PyExecutor_ClearExit(_PyExitData *exit)
1622
{
1623
    if (exit == NULL) {
1624
        return;
1625
    }
1626
    _PyExecutorObject *old = exit->executor;
1627
    if (exit->is_dynamic) {
1628
        exit->executor = _PyExecutor_GetColdDynamicExecutor();
1629
    }
1630
    else {
1631
        exit->executor = _PyExecutor_GetColdExecutor();
1632
    }
1633
    Py_DECREF(old);
1634
}
1635
1636
/* Detaches the executor from the code object (if any) that
1637
 * holds a reference to it */
1638
void
1639
_Py_ExecutorDetach(_PyExecutorObject *executor)
1640
{
1641
    PyCodeObject *code = executor->vm_data.code;
1642
    if (code == NULL) {
1643
        return;
1644
    }
1645
    _Py_CODEUNIT *instruction = &_PyCode_CODE(code)[executor->vm_data.index];
1646
    assert(instruction->op.code == ENTER_EXECUTOR);
1647
    int index = instruction->op.arg;
1648
    assert(code->co_executors->executors[index] == executor);
1649
    instruction->op.code = executor->vm_data.opcode;
1650
    instruction->op.arg = executor->vm_data.oparg;
1651
    executor->vm_data.code = NULL;
1652
    code->co_executors->executors[index] = NULL;
1653
    Py_DECREF(executor);
1654
}
1655
1656
static int
1657
executor_clear(PyObject *op)
1658
{
1659
    _PyExecutorObject *executor = _PyExecutorObject_CAST(op);
1660
    if (!executor->vm_data.valid) {
1661
        return 0;
1662
    }
1663
    assert(executor->vm_data.valid == 1);
1664
    unlink_executor(executor);
1665
    executor->vm_data.valid = 0;
1666
1667
    /* It is possible for an executor to form a reference
1668
     * cycle with itself, so decref'ing a side exit could
1669
     * free the executor unless we hold a strong reference to it
1670
     */
1671
    _PyExecutorObject *cold = _PyExecutor_GetColdExecutor();
1672
    Py_INCREF(executor);
1673
    for (uint32_t i = 0; i < executor->exit_count; i++) {
1674
        executor->exits[i].temperature = initial_unreachable_backoff_counter();
1675
        _PyExecutorObject *e = executor->exits[i].executor;
1676
        executor->exits[i].executor = cold;
1677
        Py_DECREF(e);
1678
    }
1679
    _Py_ExecutorDetach(executor);
1680
    Py_DECREF(executor);
1681
    return 0;
1682
}
1683
1684
void
1685
_Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj)
1686
{
1687
    assert(executor->vm_data.valid);
1688
    _Py_BloomFilter_Add(&executor->vm_data.bloom, obj);
1689
}
1690
1691
/* Invalidate all executors that depend on `obj`
1692
 * May cause other executors to be invalidated as well
1693
 */
1694
void
1695
_Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation)
1696
{
1697
    _PyBloomFilter obj_filter;
1698
    _Py_BloomFilter_Init(&obj_filter);
1699
    _Py_BloomFilter_Add(&obj_filter, obj);
1700
    /* Walk the list of executors */
1701
    /* TO DO -- Use a tree to avoid traversing as many objects */
1702
    PyObject *invalidate = PyList_New(0);
1703
    if (invalidate == NULL) {
1704
        goto error;
1705
    }
1706
    /* Clearing an executor can deallocate others, so we need to make a list of
1707
     * executors to invalidate first */
1708
    for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) {
1709
        assert(exec->vm_data.valid);
1710
        _PyExecutorObject *next = exec->vm_data.links.next;
1711
        if (bloom_filter_may_contain(&exec->vm_data.bloom, &obj_filter) &&
1712
            PyList_Append(invalidate, (PyObject *)exec))
1713
        {
1714
            goto error;
1715
        }
1716
        exec = next;
1717
    }
1718
    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) {
1719
        PyObject *exec = PyList_GET_ITEM(invalidate, i);
1720
        executor_clear(exec);
1721
        if (is_invalidation) {
1722
            OPT_STAT_INC(executors_invalidated);
1723
        }
1724
    }
1725
    Py_DECREF(invalidate);
1726
    return;
1727
error:
1728
    PyErr_Clear();
1729
    Py_XDECREF(invalidate);
1730
    // If we're truly out of memory, wiping out everything is a fine fallback:
1731
    _Py_Executors_InvalidateAll(interp, is_invalidation);
1732
}
1733
1734
void
1735
_PyJit_Tracer_InvalidateDependency(PyThreadState *tstate, void *obj)
1736
{
1737
    _PyBloomFilter obj_filter;
1738
    _Py_BloomFilter_Init(&obj_filter);
1739
    _Py_BloomFilter_Add(&obj_filter, obj);
1740
    _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
1741
    if (bloom_filter_may_contain(&_tstate->jit_tracer_state.prev_state.dependencies, &obj_filter))
1742
    {
1743
        _tstate->jit_tracer_state.prev_state.dependencies_still_valid = false;
1744
    }
1745
}
1746
/* Invalidate all executors */
1747
void
1748
_Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation)
1749
{
1750
    while (interp->executor_list_head) {
1751
        _PyExecutorObject *executor = interp->executor_list_head;
1752
        assert(executor->vm_data.valid == 1 && executor->vm_data.linked == 1);
1753
        if (executor->vm_data.code) {
1754
            // Clear the entire code object so its co_executors array can be freed:
1755
            _PyCode_Clear_Executors(executor->vm_data.code);
1756
        }
1757
        else {
1758
            executor_clear((PyObject *)executor);
1759
        }
1760
        if (is_invalidation) {
1761
            OPT_STAT_INC(executors_invalidated);
1762
        }
1763
    }
1764
}
1765
1766
void
1767
_Py_Executors_InvalidateCold(PyInterpreterState *interp)
1768
{
1769
    /* Walk the list of executors */
1770
    /* TO DO -- Use a tree to avoid traversing as many objects */
1771
    PyObject *invalidate = PyList_New(0);
1772
    if (invalidate == NULL) {
1773
        goto error;
1774
    }
1775
1776
    /* Clearing an executor can deallocate others, so we need to make a list of
1777
     * executors to invalidate first */
1778
    for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) {
1779
        assert(exec->vm_data.valid);
1780
        _PyExecutorObject *next = exec->vm_data.links.next;
1781
1782
        if (!exec->vm_data.warm && PyList_Append(invalidate, (PyObject *)exec) < 0) {
1783
            goto error;
1784
        }
1785
        else {
1786
            exec->vm_data.warm = false;
1787
        }
1788
1789
        exec = next;
1790
    }
1791
    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) {
1792
        PyObject *exec = PyList_GET_ITEM(invalidate, i);
1793
        executor_clear(exec);
1794
    }
1795
    Py_DECREF(invalidate);
1796
    return;
1797
error:
1798
    PyErr_Clear();
1799
    Py_XDECREF(invalidate);
1800
    // If we're truly out of memory, wiping out everything is a fine fallback
1801
    _Py_Executors_InvalidateAll(interp, 0);
1802
}
1803
1804
static void
1805
write_str(PyObject *str, FILE *out)
1806
{
1807
    // Encode the Unicode object to UTF-8
1808
    PyObject *encoded_obj = PyUnicode_AsEncodedString(str, "utf8", "strict");
1809
    if (encoded_obj == NULL) {
1810
        PyErr_Clear();
1811
        return;
1812
    }
1813
    const char *encoded_str = PyBytes_AsString(encoded_obj);
1814
    Py_ssize_t encoded_size = PyBytes_Size(encoded_obj);
1815
    fwrite(encoded_str, 1, encoded_size, out);
1816
    Py_DECREF(encoded_obj);
1817
}
1818
1819
static int
1820
find_line_number(PyCodeObject *code, _PyExecutorObject *executor)
1821
{
1822
    int code_len = (int)Py_SIZE(code);
1823
    for (int i = 0; i < code_len; i++) {
1824
        _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i];
1825
        int opcode = instr->op.code;
1826
        if (opcode == ENTER_EXECUTOR) {
1827
            _PyExecutorObject *exec = code->co_executors->executors[instr->op.arg];
1828
            if (exec == executor) {
1829
                return PyCode_Addr2Line(code, i*2);
1830
            }
1831
        }
1832
        i += _PyOpcode_Caches[_Py_GetBaseCodeUnit(code, i).op.code];
1833
    }
1834
    return -1;
1835
}
1836
1837
/* Writes the node and outgoing edges for a single tracelet in graphviz format.
1838
 * Each tracelet is presented as a table of the uops it contains.
1839
 * If Py_STATS is enabled, execution counts are included.
1840
 *
1841
 * https://graphviz.readthedocs.io/en/stable/manual.html
1842
 * https://graphviz.org/gallery/
1843
 */
1844
static void
1845
executor_to_gv(_PyExecutorObject *executor, FILE *out)
1846
{
1847
    PyCodeObject *code = executor->vm_data.code;
1848
    fprintf(out, "executor_%p [\n", executor);
1849
    fprintf(out, "    shape = none\n");
1850
1851
    /* Write the HTML table for the uops */
1852
    fprintf(out, "    label = <<table border=\"0\" cellspacing=\"0\">\n");
1853
    fprintf(out, "        <tr><td port=\"start\" border=\"1\" ><b>Executor</b></td></tr>\n");
1854
    if (code == NULL) {
1855
        fprintf(out, "        <tr><td border=\"1\" >No code object</td></tr>\n");
1856
    }
1857
    else {
1858
        fprintf(out, "        <tr><td  border=\"1\" >");
1859
        write_str(code->co_qualname, out);
1860
        int line = find_line_number(code, executor);
1861
        fprintf(out, ": %d</td></tr>\n", line);
1862
    }
1863
    for (uint32_t i = 0; i < executor->code_size; i++) {
1864
        /* Write row for uop.
1865
         * The `port` is a marker so that outgoing edges can
1866
         * be placed correctly. If a row is marked `port="i17"`,
1867
         * then the outgoing edge is `{EXEC_NAME}:i17 -> {TARGET}`
1868
         * https://graphviz.readthedocs.io/en/stable/manual.html#node-ports-compass
1869
         */
1870
        _PyUOpInstruction const *inst = &executor->trace[i];
1871
        const char *opname = _PyOpcode_uop_name[inst->opcode];
1872
#ifdef Py_STATS
1873
        fprintf(out, "        <tr><td port=\"i%d\" border=\"1\" >%s -- %" PRIu64 "</td></tr>\n", i, opname, inst->execution_count);
1874
#else
1875
        fprintf(out, "        <tr><td port=\"i%d\" border=\"1\" >%s op0=%" PRIu64 "</td></tr>\n", i, opname, inst->operand0);
1876
#endif
1877
        if (inst->opcode == _EXIT_TRACE || inst->opcode == _JUMP_TO_TOP) {
1878
            break;
1879
        }
1880
    }
1881
    fprintf(out, "    </table>>\n");
1882
    fprintf(out, "]\n\n");
1883
1884
    /* Write all the outgoing edges */
1885
    _PyExecutorObject *cold = _PyExecutor_GetColdExecutor();
1886
    _PyExecutorObject *cold_dynamic = _PyExecutor_GetColdDynamicExecutor();
1887
    for (uint32_t i = 0; i < executor->code_size; i++) {
1888
        _PyUOpInstruction const *inst = &executor->trace[i];
1889
        uint16_t flags = _PyUop_Flags[inst->opcode];
1890
        _PyExitData *exit = NULL;
1891
        if (inst->opcode == _EXIT_TRACE) {
1892
            exit = (_PyExitData *)inst->operand0;
1893
        }
1894
        else if (flags & HAS_EXIT_FLAG) {
1895
            assert(inst->format == UOP_FORMAT_JUMP);
1896
            _PyUOpInstruction const *exit_inst = &executor->trace[inst->jump_target];
1897
            assert(exit_inst->opcode == _EXIT_TRACE || exit_inst->opcode == _DYNAMIC_EXIT);
1898
            exit = (_PyExitData *)exit_inst->operand0;
1899
        }
1900
        if (exit != NULL && exit->executor != cold && exit->executor != cold_dynamic) {
1901
            fprintf(out, "executor_%p:i%d -> executor_%p:start\n", executor, i, exit->executor);
1902
        }
1903
        if (inst->opcode == _EXIT_TRACE || inst->opcode == _JUMP_TO_TOP) {
1904
            break;
1905
        }
1906
    }
1907
}
1908
1909
/* Write the graph of all the live tracelets in graphviz format. */
1910
int
1911
_PyDumpExecutors(FILE *out)
1912
{
1913
    fprintf(out, "digraph ideal {\n\n");
1914
    fprintf(out, "    rankdir = \"LR\"\n\n");
1915
    PyInterpreterState *interp = PyInterpreterState_Get();
1916
    for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) {
1917
        executor_to_gv(exec, out);
1918
        exec = exec->vm_data.links.next;
1919
    }
1920
    fprintf(out, "}\n\n");
1921
    return 0;
1922
}
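
A hedged usage sketch (the file name is illustrative): _PyDumpExecutors() only needs an open FILE *, and the resulting .gv file can be rendered with Graphviz. On a build without the JIT it returns -1 with NotImplementedError set, so a caller should be prepared to clear the error.

/* Illustrative caller, assuming the internal headers are available. */
FILE *gv = fopen("executors.gv", "w");
if (gv != NULL) {
    if (_PyDumpExecutors(gv) < 0) {
        PyErr_Clear();   /* e.g. "No JIT available" on non-tier-2 builds */
    }
    fclose(gv);
}
/* Then render with: dot -Tsvg executors.gv -o executors.svg */
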
1923
1924
#else
1925
1926
int
1927
_PyDumpExecutors(FILE *out)
1928
0
{
1929
0
    PyErr_SetString(PyExc_NotImplementedError, "No JIT available");
1930
0
    return -1;
1931
0
}
1932
1933
void
1934
_PyExecutor_Free(struct _PyExecutorObject *self)
1935
0
{
1936
    /* This should never be called */
1937
0
    Py_UNREACHABLE();
1938
0
}
1939
1940
#endif /* _Py_TIER2 */