/src/cpython/Python/ceval_macros.h

Source (jump to first uncovered line)
// Macros and other things needed by ceval.c, and bytecodes.c

/* Computed GOTOs, or
       the-optimization-commonly-but-improperly-known-as-"threaded code"
   using gcc's labels-as-values extension
   (http://gcc.gnu.org/onlinedocs/gcc/Labels-as-Values.html).

   The traditional bytecode evaluation loop uses a "switch" statement, which
   decent compilers will optimize as a single indirect branch instruction
   combined with a lookup table of jump addresses. However, since the
   indirect jump instruction is shared by all opcodes, the CPU will have a
   hard time making the right prediction for where to jump next (actually,
   it will be always wrong except in the uncommon case of a sequence of
   several identical opcodes).

   "Threaded code" in contrast, uses an explicit jump table and an explicit
   indirect jump instruction at the end of each opcode. Since the jump
   instruction is at a different address for each opcode, the CPU will make a
   separate prediction for each of these instructions, which is equivalent to
   predicting the second opcode of each opcode pair. These predictions have
   a much better chance to turn out valid, especially in small bytecode loops.

   A mispredicted branch on a modern CPU flushes the whole pipeline and
   can cost several CPU cycles (depending on the pipeline depth),
   and potentially many more instructions (depending on the pipeline width).
   A correctly predicted branch, however, is nearly free.

   At the time of this writing, the "threaded code" version is up to 15-20%
   faster than the normal "switch" version, depending on the compiler and the
   CPU architecture.

   NOTE: care must be taken that the compiler doesn't try to "optimize" the
   indirect jumps by sharing them between all opcodes. Such optimizations
   can be disabled on gcc by using the -fno-gcse flag (or possibly
   -fno-crossjumping).
*/

/* Use macros rather than inline functions, to make it as clear as possible
 * to the C compiler that the tracing check is a simple test then branch.
 * We want to be sure that the compiler knows this before it generates
 * the CFG.
 */

/* OR_DTRACE_LINE is an expression suffix, not a complete expression.
 * With WITH_DTRACE defined it expands to
 * `| (PyDTrace_LINE_ENABLED() ? 255 : 0)`, so appending it to an integer
 * expression ORs in 255 whenever PyDTrace_LINE_ENABLED() is true.
 * Without WITH_DTRACE it expands to nothing. */
#ifdef WITH_DTRACE
#define OR_DTRACE_LINE | (PyDTrace_LINE_ENABLED() ? 255 : 0)
#else
#define OR_DTRACE_LINE
#endif

/* Normalise USE_COMPUTED_GOTOS so that it is always defined afterwards.
 *   - No compiler support (HAVE_COMPUTED_GOTOS undefined): explicitly asking
 *     for computed gotos is a hard error; otherwise force the value to 0.
 *   - Compiler support present: keep any user-supplied value, default to 1.
 */
#if !defined(HAVE_COMPUTED_GOTOS)
#  if defined(USE_COMPUTED_GOTOS) && USE_COMPUTED_GOTOS
#    error "Computed gotos are not supported on this compiler."
#  endif
#  undef USE_COMPUTED_GOTOS
#  define USE_COMPUTED_GOTOS 0
#elif !defined(USE_COMPUTED_GOTOS)
#  define USE_COMPUTED_GOTOS 1
#endif

/* INSTRUCTION_STATS(op): per-instruction statistics hook.
 * Without Py_STATS it is a no-op that does not evaluate `op`.
 * With Py_STATS it bumps the execution count for `op`, bumps the
 * (lastopcode, op) pair counter when _Py_stats is non-NULL, and then
 * records `op` in the caller's `lastopcode` variable. */
#ifndef Py_STATS
#define INSTRUCTION_STATS(op) ((void)0)
#else
#define INSTRUCTION_STATS(op) \
    do { \
        OPCODE_EXE_INC(op); \
        if (_Py_stats) { \
            _Py_stats->opcode_stats[lastopcode].pair_count[op]++; \
        } \
        lastopcode = op; \
    } while (0)
#endif

/* Parameter and argument lists shared by the tail-call dispatch macros.
 * TAIL_CALL_PARAMS is the parameter list used in the TARGET() handler
 * signature and in the py_tail_call_funcptr typedef; TAIL_CALL_ARGS forwards
 * the identically named variables in the same order.  With Py_STATS both
 * lists carry one extra trailing entry, lastopcode. */
#ifdef Py_STATS
#   define TAIL_CALL_PARAMS _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate, _Py_CODEUNIT *next_instr, int oparg, int lastopcode
#   define TAIL_CALL_ARGS frame, stack_pointer, tstate, next_instr, oparg, lastopcode
#else
#   define TAIL_CALL_PARAMS _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate, _Py_CODEUNIT *next_instr, int oparg
#   define TAIL_CALL_ARGS frame, stack_pointer, tstate, next_instr, oparg
#endif

/* Dispatch primitives.  One of three implementations is selected:
 *
 *   Py_TAIL_CALL_INTERP  every target is a separate function and control is
 *                        transferred with a must-tail call;
 *   USE_COMPUTED_GOTOS   targets are labels reached via `goto *table[opcode]`;
 *   otherwise            targets are `case` labels reached via
 *                        `goto dispatch_opcode`.
 *
 *   TARGET(op)               introduces the handler for opcode `op`
 *   DISPATCH_GOTO()          transfers control to the handler for `opcode`
 *   JUMP_TO_LABEL(name)      transfers control to the label `name`
 *   JUMP_TO_PREDICTED(name)  transfers control to the PREDICTED entry of
 *                            `name` (the tail-call form passes this_instr
 *                            in place of next_instr)
 *   LABEL(name)              introduces the label `name`
 *
 * None of the control-transfer macros carries its own trailing semicolon:
 * the call site supplies it in every configuration, so each macro expands
 * to exactly one statement and is safe in an unbraced if/else.
 */
#if Py_TAIL_CALL_INTERP
    // Note: [[clang::musttail]] works for GCC 15, but not __attribute__((musttail)) at the moment.
#   define Py_MUSTTAIL [[clang::musttail]]
#   define Py_PRESERVE_NONE_CC __attribute__((preserve_none))
    Py_PRESERVE_NONE_CC typedef PyObject* (*py_tail_call_funcptr)(TAIL_CALL_PARAMS);

#   define TARGET(op) Py_PRESERVE_NONE_CC PyObject *_TAIL_CALL_##op(TAIL_CALL_PARAMS)
#   define DISPATCH_GOTO() \
        do { \
            Py_MUSTTAIL return (INSTRUCTION_TABLE[opcode])(TAIL_CALL_ARGS); \
        } while (0)
#   define JUMP_TO_LABEL(name) \
        do { \
            Py_MUSTTAIL return (_TAIL_CALL_##name)(TAIL_CALL_ARGS); \
        } while (0)
#   ifdef Py_STATS
#       define JUMP_TO_PREDICTED(name) \
            do { \
                Py_MUSTTAIL return (_TAIL_CALL_##name)(frame, stack_pointer, tstate, this_instr, oparg, lastopcode); \
            } while (0)
#   else
#       define JUMP_TO_PREDICTED(name) \
            do { \
                Py_MUSTTAIL return (_TAIL_CALL_##name)(frame, stack_pointer, tstate, this_instr, oparg); \
            } while (0)
#   endif
#   define LABEL(name) TARGET(name)
#elif USE_COMPUTED_GOTOS
#  define TARGET(op) TARGET_##op:
#  define DISPATCH_GOTO() goto *opcode_targets[opcode]
#  define JUMP_TO_LABEL(name) goto name
#  define JUMP_TO_PREDICTED(name) goto PREDICTED_##name
#  define LABEL(name) name:
#else
#  define TARGET(op) case op: TARGET_##op:
#  define DISPATCH_GOTO() goto dispatch_opcode
#  define JUMP_TO_LABEL(name) goto name
#  define JUMP_TO_PREDICTED(name) goto PREDICTED_##name
#  define LABEL(name) name:
#endif

/* PRE_DISPATCH_GOTO() does lltrace if enabled. Normally a no-op.
 *
 * In Py_DEBUG builds it calls lltrace_instruction() for the instruction about
 * to be dispatched when frame->lltrace is at least 5.  Both variants expand
 * to a single statement that needs a trailing semicolon at the call site, so
 * the macro behaves the same in an unbraced if/else in either build. */
#ifdef Py_DEBUG
#define PRE_DISPATCH_GOTO() \
    do { \
        if (frame->lltrace >= 5) { \
            lltrace_instruction(frame, stack_pointer, next_instr, opcode, oparg); \
        } \
    } while (0)
#else
#define PRE_DISPATCH_GOTO() ((void)0)
#endif

/* LLTRACE_RESUME_FRAME(): refresh the frame's lltrace level.
 * A no-op unless Py_DEBUG is defined.  In debug builds the stack pointer is
 * saved into the frame around the call to maybe_lltrace_resume_frame() and
 * reloaded afterwards; the call's result is stored in frame->lltrace. */
#ifndef Py_DEBUG
#define LLTRACE_RESUME_FRAME() ((void)0)
#else
#define LLTRACE_RESUME_FRAME() \
    do { \
        _PyFrame_SetStackPointer(frame, stack_pointer); \
        int resumed_lltrace = maybe_lltrace_resume_frame(frame, GLOBALS()); \
        stack_pointer = _PyFrame_GetStackPointer(frame); \
        frame->lltrace = resumed_lltrace; \
    } while (0)
#endif

/* QSBR_QUIESCENT_STATE(tstate): with Py_GIL_DISABLED, passes the qsbr member
 * of `tstate` (viewed as a _PyThreadStateImpl) to _Py_qsbr_quiescent_state().
 * Otherwise a no-op that does not evaluate its argument.
 *
 * The argument is parenthesised before the cast so that any pointer
 * expression may be passed, and the no-op form is `((void)0)` like the other
 * no-op macros in this file, so both variants are a real void expression. */
#ifdef Py_GIL_DISABLED
#define QSBR_QUIESCENT_STATE(tstate) \
    _Py_qsbr_quiescent_state(((_PyThreadStateImpl *)(tstate))->qsbr)
#else
#define QSBR_QUIESCENT_STATE(tstate) ((void)0)
#endif


/* Do interpreter dispatch accounting for tracing and instrumentation */
/* Steps: assert that frame->stackpointer is NULL, load opcode and oparg from
 * next_instr via NEXTOPARG(), run the PRE_DISPATCH_GOTO() hook, then transfer
 * control with DISPATCH_GOTO().
 * Note: this expands to a plain brace block, not do { } while (0). */
#define DISPATCH() \
    { \
        assert(frame->stackpointer == NULL); \
        NEXTOPARG(); \
        PRE_DISPATCH_GOTO(); \
        DISPATCH_GOTO(); \
    }

/* Like DISPATCH(), but only `opcode` is reloaded from next_instr; the current
 * value of `oparg` is left untouched.  The load is a plain member read
 * (next_instr->op.code) rather than the NEXTOPARG() load, and there is no
 * frame->stackpointer assertion.
 * Note: this expands to a plain brace block, not do { } while (0). */
#define DISPATCH_SAME_OPARG() \
    { \
        opcode = next_instr->op.code; \
        PRE_DISPATCH_GOTO(); \
        DISPATCH_GOTO(); \
    }

/* DISPATCH_INLINED(NEW_FRAME): switch execution to NEW_FRAME.
 * Asserts that tstate->interp->eval_frame is NULL and that NEW_FRAME's
 * `previous` is the current frame.  Saves the stack pointer into the current
 * frame, makes NEW_FRAME both `frame` and tstate->current_frame, bumps the
 * inlined_py_calls call statistic and jumps to the start_frame label.
 * NEW_FRAME is evaluated more than once when assertions are enabled. */
#define DISPATCH_INLINED(NEW_FRAME)                     \
    do {                                                \
        assert(tstate->interp->eval_frame == NULL);     \
        _PyFrame_SetStackPointer(frame, stack_pointer); \
        assert((NEW_FRAME)->previous == frame);         \
        frame = tstate->current_frame = (NEW_FRAME);     \
        CALL_STAT_INC(inlined_py_calls);                \
        JUMP_TO_LABEL(start_frame);                      \
    } while (0)

/* Tuple access macros */

/* GETITEM(v, i): fetch item `i` of tuple `v` via PyTuple_GET_ITEM.
 * Debug builds use a checked inline function that asserts `v` is a tuple and
 * that `i` is within [0, size); other builds use the bare macro. */
#ifdef Py_DEBUG
static inline PyObject *
GETITEM(PyObject *v, Py_ssize_t i)
{
    assert(PyTuple_Check(v));
    assert(i >= 0);
    assert(i < PyTuple_GET_SIZE(v));
    return PyTuple_GET_ITEM(v, i);
}
#else
#define GETITEM(v, i) PyTuple_GET_ITEM((v), (i))
#endif

/* Code access macros */

/* The integer overflow is checked by an assertion below. */
/* INSTR_OFFSET(): how far next_instr is past _PyFrame_GetBytecode(frame),
 * as a pointer difference (element count) narrowed to int. */
#define INSTR_OFFSET() ((int)(next_instr - _PyFrame_GetBytecode(frame)))
/* NEXTOPARG(): read the 16-bit code unit at next_instr with a single
 * FT_ATOMIC_LOAD_UINT16_RELAXED load and split it into the caller's `opcode`
 * and `oparg` variables.  next_instr itself is not advanced. */
#define NEXTOPARG() \
    do { \
        _Py_CODEUNIT fetched = { \
            .cache = FT_ATOMIC_LOAD_UINT16_RELAXED(*(uint16_t*)next_instr) \
        }; \
        opcode = fetched.op.code; \
        oparg = fetched.op.arg; \
    } while (0)

/* JUMPBY makes the generator identify the instruction as a jump. SKIP_OVER is
 * for advancing to the next instruction, taking into account cache entries
 * and skipped instructions.
 *
 * Both macros expand to the same code: next_instr is advanced by x and the
 * updated pointer is the value of the expression.  They differ only in name.
 */
#define JUMPBY(x)       (next_instr += (x))
#define SKIP_OVER(x)    (next_instr += (x))

/* STACK_LEVEL(): number of entries between _PyFrame_Stackbase(frame) and
 * stack_pointer, as an int.
 * STACK_SIZE(): co_stacksize of the frame's code object. */
#define STACK_LEVEL()     ((int)(stack_pointer - _PyFrame_Stackbase(frame)))
#define STACK_SIZE()      (_PyFrame_GetCode(frame)->co_stacksize)

/* True when 0 <= STACK_LEVEL() <= STACK_SIZE().  Frames whose owner is
 * FRAME_OWNED_BY_INTERPRETER always pass: the range test is skipped for them. */
#define WITHIN_STACK_BOUNDS() \
   (frame->owner == FRAME_OWNED_BY_INTERPRETER || (STACK_LEVEL() >= 0 && STACK_LEVEL() <= STACK_SIZE()))

/* Data access macros */
/* co_consts / co_names of the code object returned by _PyFrame_GetCode(frame). */
#define FRAME_CO_CONSTS (_PyFrame_GetCode(frame)->co_consts)
#define FRAME_CO_NAMES  (_PyFrame_GetCode(frame)->co_names)

/* Local variable macros */

/* LOCALS_ARRAY is the frame's localsplus array; GETLOCAL(i) is its i-th
 * element and, being a plain subscript expression, can also be assigned to. */
#define LOCALS_ARRAY    (frame->localsplus)
#define GETLOCAL(i)     (frame->localsplus[i])


/* UPDATE_MISS_STATS(INSTNAME): statistics bookkeeping for a specialization
 * miss.  With Py_STATS it increments the `miss` stat for both the current
 * `opcode` and INSTNAME, and additionally INSTNAME's `deopt` stat when the
 * counter stored at next_instr->cache triggers.  Without Py_STATS it is a
 * no-op that does not evaluate its argument. */
#ifdef Py_STATS
#define UPDATE_MISS_STATS(INSTNAME)                              \
    do {                                                         \
        STAT_INC(opcode, miss);                                  \
        STAT_INC((INSTNAME), miss);                              \
        /* The counter is always the first cache entry: */       \
        if (ADAPTIVE_COUNTER_TRIGGERS(next_instr->cache)) {       \
            STAT_INC((INSTNAME), deopt);                         \
        }                                                        \
    } while (0)
#else
#define UPDATE_MISS_STATS(INSTNAME) ((void)0)
#endif


// Try to lock an object in the free threading build, if it's not already
// locked. Use with a DEOPT_IF() to deopt if the object is already locked.
// These are no-ops in the default GIL build. The general pattern is:
//
// DEOPT_IF(!LOCK_OBJECT(op));
// if (/* condition fails */) {
//     UNLOCK_OBJECT(op);
//     DEOPT_IF(true);
//  }
//  ...
//  UNLOCK_OBJECT(op);
//
// NOTE: The object must be unlocked on every exit code path and you should
// avoid any potentially escaping calls (like PyStackRef_CLOSE) while the
// object is locked.
/* With Py_GIL_DISABLED: LOCK_OBJECT(op) evaluates to the result of
 * PyMutex_LockFast() on the object's ob_mutex, and UNLOCK_OBJECT(op) calls
 * PyMutex_Unlock() on the same mutex.
 * Otherwise: LOCK_OBJECT(op) is the constant 1 and UNLOCK_OBJECT(op) is a
 * no-op; neither evaluates `op`. */
#ifdef Py_GIL_DISABLED
#  define LOCK_OBJECT(op) PyMutex_LockFast(&(_PyObject_CAST(op))->ob_mutex)
#  define UNLOCK_OBJECT(op) PyMutex_Unlock(&(_PyObject_CAST(op))->ob_mutex)
#else
#  define LOCK_OBJECT(op) (1)
#  define UNLOCK_OBJECT(op) ((void)0)
#endif

/* Shorthand accessors for fields of the current `frame` and of its code
 * object.  CONSTS() and NAMES() read the same fields as FRAME_CO_CONSTS and
 * FRAME_CO_NAMES.  The expansions are not wrapped in parentheses. */
#define GLOBALS() frame->f_globals
#define BUILTINS() frame->f_builtins
#define LOCALS() frame->f_locals
#define CONSTS() _PyFrame_GetCode(frame)->co_consts
#define NAMES() _PyFrame_GetCode(frame)->co_names

/* DTRACE_FUNCTION_ENTRY(): call dtrace_function_entry(frame) when
 * PyDTrace_FUNCTION_ENTRY_ENABLED() is true.
 * Wrapped in do { } while (0) so the expansion is a single statement: it
 * takes a trailing semicolon at the call site and cannot capture a following
 * `else`. */
#define DTRACE_FUNCTION_ENTRY()  \
    do { \
        if (PyDTrace_FUNCTION_ENTRY_ENABLED()) { \
            dtrace_function_entry(frame); \
        } \
    } while (0)

/* This takes a uint16_t instead of a _Py_BackoffCounter,
 * because it is used directly on the cache entry in generated code,
 * which is always an integral type. */
/* The raw value is first converted with forge_backoff_counter() and the
 * result is handed to backoff_counter_triggers(). */
#define ADAPTIVE_COUNTER_TRIGGERS(COUNTER) \
    backoff_counter_triggers(forge_backoff_counter((COUNTER)))

/* ADVANCE_ADAPTIVE_COUNTER(COUNTER): replace COUNTER (an lvalue) with the
 * result of advance_backoff_counter(COUNTER).  COUNTER is evaluated twice.
 * The expansion deliberately has no trailing semicolon; the call site
 * supplies it, which keeps the macro a single statement in if/else. */
#define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \
    do { \
        (COUNTER) = advance_backoff_counter((COUNTER)); \
    } while (0)

/* PAUSE_ADAPTIVE_COUNTER(COUNTER): replace COUNTER (an lvalue) with the
 * result of pause_backoff_counter(COUNTER).  COUNTER is evaluated twice.
 * The expansion deliberately has no trailing semicolon; the call site
 * supplies it, which keeps the macro a single statement in if/else. */
#define PAUSE_ADAPTIVE_COUNTER(COUNTER) \
    do { \
        (COUNTER) = pause_backoff_counter((COUNTER)); \
    } while (0)

/* RECORD_BRANCH_TAKEN(bitset, flag): shift the 16-bit value `bitset` left by
 * one and OR in `flag`, using a relaxed atomic load followed by a relaxed
 * atomic store (two separate operations, not one atomic read-modify-write).
 * Expands to nothing when ENABLE_SPECIALIZATION_FT is not defined. */
#ifdef ENABLE_SPECIALIZATION_FT
/* Multiple threads may execute these concurrently if thread-local bytecode is
 * disabled and they all execute the main copy of the bytecode. Specialization
 * is disabled in that case so the value is unused, but the RMW cycle should be
 * free of data races.
 */
#define RECORD_BRANCH_TAKEN(bitset, flag) \
    FT_ATOMIC_STORE_UINT16_RELAXED(       \
        bitset, (FT_ATOMIC_LOAD_UINT16_RELAXED(bitset) << 1) | (flag))
#else
#define RECORD_BRANCH_TAKEN(bitset, flag)
#endif

/* Error message format strings.  Each contains exactly one string conversion
 * for the offending variable name; NAME_ERROR_MSG limits that name to 200
 * characters through its %.200s precision. */
#define UNBOUNDLOCAL_ERROR_MSG \
    "cannot access local variable '%s' where it is not associated with a value"
#define UNBOUNDFREE_ERROR_MSG \
    "cannot access free variable '%s' where it is not associated with a value" \
    " in enclosing scope"
#define NAME_ERROR_MSG "name '%.200s' is not defined"

// If a trace function sets a new f_lineno and
// *then* raises, we use the destination when searching
// for an exception handler, displaying the traceback, and so on
//
// INSTRUMENTED_JUMP(src, dest, event):
//   - while tstate->tracing is set, simply continue at `dest`;
//   - otherwise save the stack pointer, report the jump through
//     _Py_call_instrumentation_jump(), reload the stack pointer and continue
//     at the instruction that call returned.  A NULL return means failure:
//     next_instr is set to dest + 1 and control goes to the `error` label.
// `dest` is evaluated more than once, so it must be free of side effects.
// The expansion has no trailing semicolon; the call site supplies it, which
// keeps the macro a single statement in an unbraced if/else.
#define INSTRUMENTED_JUMP(src, dest, event) \
do { \
    if (tstate->tracing) {\
        next_instr = (dest); \
    } else { \
        _PyFrame_SetStackPointer(frame, stack_pointer); \
        next_instr = _Py_call_instrumentation_jump(this_instr, tstate, event, frame, src, dest); \
        stack_pointer = _PyFrame_GetStackPointer(frame); \
        if (next_instr == NULL) { \
            next_instr = (dest)+1; \
            JUMP_TO_LABEL(error); \
        } \
    } \
} while (0)


/* Consume one unit of the thread's Python recursion budget.
 * Returns 0 if the budget was still positive before this call.  Otherwise
 * _Py_CheckRecursiveCallPy() is consulted and 1 is returned if it reports a
 * non-zero result, 0 if not.  The counter is decremented in every case. */
static inline int
_Py_EnterRecursivePy(PyThreadState *tstate)
{
    /* Post-decrement: the test sees the value from before this call. */
    if (tstate->py_recursion_remaining-- > 0) {
        return 0;
    }
    return _Py_CheckRecursiveCallPy(tstate) != 0;
}

/* Give back the unit of Python recursion budget taken by
 * _Py_EnterRecursivePy(). */
static inline void
_Py_LeaveRecursiveCallPy(PyThreadState *tstate)
{
    tstate->py_recursion_remaining += 1;
}

/* Implementation of "macros" that modify the instruction pointer,
 * stack pointer, or frame pointer.
 * These need to be treated differently by tier 1 and 2.
 * The Tier 1 version is here; Tier 2 is inlined in ceval.c. */

/* LOAD_IP(OFFSET): set next_instr to frame->instr_ptr advanced by OFFSET. */
#define LOAD_IP(OFFSET) do { \
        next_instr = frame->instr_ptr + (OFFSET); \
    } while (0)

/* There's no STORE_IP(), it's inlined by the code generator. */

/* LOAD_SP(): reload the local stack_pointer from the frame. */
#define LOAD_SP() \
stack_pointer = _PyFrame_GetStackPointer(frame)

/* SAVE_SP(): store the local stack_pointer into the frame. */
#define SAVE_SP() \
_PyFrame_SetStackPointer(frame, stack_pointer)

/* Tier-switching macros. */

/* TIER1_TO_TIER2(EXECUTOR): run EXECUTOR through _Py_jit_entry() and resume
 * tier 1 at the instruction pointer it returns.  After the call, `frame` is
 * reloaded from tstate->current_frame and stack_pointer from that frame.
 * A NULL return is treated as an error: next_instr is set to
 * frame->instr_ptr and control jumps to the `error` label.  Otherwise the
 * macro ends with DISPATCH(). */
#define TIER1_TO_TIER2(EXECUTOR)                        \
do {                                                   \
    OPT_STAT_INC(traces_executed);                     \
    next_instr = _Py_jit_entry((EXECUTOR), frame, stack_pointer, tstate); \
    frame = tstate->current_frame;                     \
    stack_pointer = _PyFrame_GetStackPointer(frame);   \
    if (next_instr == NULL) {                          \
        next_instr = frame->instr_ptr;                 \
        JUMP_TO_LABEL(error);                          \
    }                                                  \
    DISPATCH();                                        \
} while (0)

/* TIER2_TO_TIER2(EXECUTOR): make EXECUTOR the current_executor, count the
 * trace execution and jump to the tier2_start label. */
#define TIER2_TO_TIER2(EXECUTOR) \
do {                                                   \
    OPT_STAT_INC(traces_executed);                     \
    current_executor = (EXECUTOR);                     \
    goto tier2_start;                                  \
} while (0)

/* GOTO_TIER_ONE(TARGET): leave the enclosing function, returning TARGET.
 * Before returning it clears tstate->current_executor, records the
 * trace-run-length histogram entry and saves stack_pointer into the frame. */
#define GOTO_TIER_ONE(TARGET)                                         \
    do                                                                \
    {                                                                 \
        tstate->current_executor = NULL;                              \
        OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); \
        _PyFrame_SetStackPointer(frame, stack_pointer);               \
        return TARGET;                                                \
    } while (0)

/* Field accessors for the uop located just before next_uop (index -1). */
#define CURRENT_OPARG()    (next_uop[-1].oparg)
#define CURRENT_OPERAND0() (next_uop[-1].operand0)
#define CURRENT_OPERAND1() (next_uop[-1].operand1)
#define CURRENT_TARGET()   (next_uop[-1].target)

/* Plain gotos to the jump_to_jump_target / jump_to_error_target labels. */
#define JUMP_TO_JUMP_TARGET() goto jump_to_jump_target
#define JUMP_TO_ERROR() goto jump_to_error_target

/* Stackref macros */

/* How much scratch space to give stackref to PyObject* conversion. */
#define MAX_STACKREF_SCRATCH 10

/* STACKREFS_TO_PYOBJECTS(ARGS, ARG_COUNT, NAME) declares two variables in the
 * enclosing scope: NAME_temp, an array of MAX_STACKREF_SCRATCH+1 PyObject
 * pointers, and NAME, the result of _PyObjectArray_FromStackRefArray() called
 * with NAME_temp + 1 as its last argument.  Because the expansion is a pair of
 * declarations that already end in ';', it cannot be used as an expression or
 * as the body of an unbraced if. */
#define STACKREFS_TO_PYOBJECTS(ARGS, ARG_COUNT, NAME) \
    /* +1 because vectorcall might use -1 to write self */ \
    PyObject *NAME##_temp[MAX_STACKREF_SCRATCH+1]; \
    PyObject **NAME = _PyObjectArray_FromStackRefArray(ARGS, ARG_COUNT, NAME##_temp + 1);

/* Counterpart of STACKREFS_TO_PYOBJECTS for the same NAME: hands NAME - 1 and
 * NAME_temp to _PyObjectArray_Free().  The expansion ends in ';'. */
#define STACKREFS_TO_PYOBJECTS_CLEANUP(NAME) \
    /* +1 because we +1 previously */ \
    _PyObjectArray_Free(NAME - 1, NAME##_temp);

/* True when the pointer produced by STACKREFS_TO_PYOBJECTS is NULL. */
#define CONVERSION_FAILED(NAME) ((NAME) == NULL)

/* Periodic housekeeping.
 * Runs the Emscripten signal check and the QSBR quiescent-state hook, then
 * looks at the thread's eval_breaker word.  Returns 0 when none of the bits in
 * _PY_EVAL_EVENTS_MASK are set; otherwise returns whatever
 * _Py_HandlePending() returns. */
static inline int
check_periodics(PyThreadState *tstate)
{
    _Py_CHECK_EMSCRIPTEN_SIGNALS_PERIODICALLY();
    QSBR_QUIESCENT_STATE(tstate);
    if (!(_Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & _PY_EVAL_EVENTS_MASK)) {
        /* Nothing pending. */
        return 0;
    }
    return _Py_HandlePending(tstate);
}


Coverage Report

Created: 2025-08-26 06:26

Line	Count	Source (jump to first uncovered line)
1		// Macros and other things needed by ceval.c, and bytecodes.c
2
3		/* Computed GOTOs, or
4		the-optimization-commonly-but-improperly-known-as-"threaded code"
5		using gcc's labels-as-values extension
6		(http://gcc.gnu.org/onlinedocs/gcc/Labels-as-Values.html).
7
8		The traditional bytecode evaluation loop uses a "switch" statement, which
9		decent compilers will optimize as a single indirect branch instruction
10		combined with a lookup table of jump addresses. However, since the
11		indirect jump instruction is shared by all opcodes, the CPU will have a
12		hard time making the right prediction for where to jump next (actually,
13		it will be always wrong except in the uncommon case of a sequence of
14		several identical opcodes).
15
16		"Threaded code" in contrast, uses an explicit jump table and an explicit
17		indirect jump instruction at the end of each opcode. Since the jump
18		instruction is at a different address for each opcode, the CPU will make a
19		separate prediction for each of these instructions, which is equivalent to
20		predicting the second opcode of each opcode pair. These predictions have
21		a much better chance to turn out valid, especially in small bytecode loops.
22
23		A mispredicted branch on a modern CPU flushes the whole pipeline and
24		can cost several CPU cycles (depending on the pipeline depth),
25		and potentially many more instructions (depending on the pipeline width).
26		A correctly predicted branch, however, is nearly free.
27
28		At the time of this writing, the "threaded code" version is up to 15-20%
29		faster than the normal "switch" version, depending on the compiler and the
30		CPU architecture.
31
32		NOTE: care must be taken that the compiler doesn't try to "optimize" the
33		indirect jumps by sharing them between all opcodes. Such optimizations
34		can be disabled on gcc by using the -fno-gcse flag (or possibly
35		-fno-crossjumping).
36		*/
37
38		/* Use macros rather than inline functions, to make it as clear as possible
39		* to the C compiler that the tracing check is a simple test then branch.
40		* We want to be sure that the compiler knows this before it generates
41		* the CFG.
42		*/
43
44		#ifdef WITH_DTRACE
45		#define OR_DTRACE_LINE | (PyDTrace_LINE_ENABLED() ? 255 : 0)
46		#else
47		#define OR_DTRACE_LINE
48		#endif
49
50		#ifdef HAVE_COMPUTED_GOTOS
51		#ifndef USE_COMPUTED_GOTOS
52		#define USE_COMPUTED_GOTOS 1
53		#endif
54		#else
55		#if defined(USE_COMPUTED_GOTOS) && USE_COMPUTED_GOTOS
56		#error "Computed gotos are not supported on this compiler."
57		#endif
58		#undef USE_COMPUTED_GOTOS
59		#define USE_COMPUTED_GOTOS 0
60		#endif
61
62		#ifdef Py_STATS
63		#define INSTRUCTION_STATS(op) \
64		do { \
65		OPCODE_EXE_INC(op); \
66		if (_Py_stats) _Py_stats->opcode_stats[lastopcode].pair_count[op]++; \
67		lastopcode = op; \
68		} while (0)
69		#else
70	38.0G	#define INSTRUCTION_STATS(op) ((void)0)
71		#endif
72
73		#ifdef Py_STATS
74		# define TAIL_CALL_PARAMS _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate, _Py_CODEUNIT *next_instr, int oparg, int lastopcode
75		# define TAIL_CALL_ARGS frame, stack_pointer, tstate, next_instr, oparg, lastopcode
76		#else
77		# define TAIL_CALL_PARAMS _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate, _Py_CODEUNIT *next_instr, int oparg
78		# define TAIL_CALL_ARGS frame, stack_pointer, tstate, next_instr, oparg
79		#endif
80
81		#if Py_TAIL_CALL_INTERP
82		// Note: [[clang::musttail]] works for GCC 15, but not __attribute__((musttail)) at the moment.
83		# define Py_MUSTTAIL [[clang::musttail]]
84		# define Py_PRESERVE_NONE_CC __attribute__((preserve_none))
85		Py_PRESERVE_NONE_CC typedef PyObject* (*py_tail_call_funcptr)(TAIL_CALL_PARAMS);
86
87		# define TARGET(op) Py_PRESERVE_NONE_CC PyObject *_TAIL_CALL_##op(TAIL_CALL_PARAMS)
88		# define DISPATCH_GOTO() \
89		do { \
90		Py_MUSTTAIL return (INSTRUCTION_TABLE[opcode])(TAIL_CALL_ARGS); \
91		} while (0)
92		# define JUMP_TO_LABEL(name) \
93		do { \
94		Py_MUSTTAIL return (_TAIL_CALL_##name)(TAIL_CALL_ARGS); \
95		} while (0)
96		# ifdef Py_STATS
97		# define JUMP_TO_PREDICTED(name) \
98		do { \
99		Py_MUSTTAIL return (_TAIL_CALL_##name)(frame, stack_pointer, tstate, this_instr, oparg, lastopcode); \
100		} while (0)
101		# else
102		# define JUMP_TO_PREDICTED(name) \
103		do { \
104		Py_MUSTTAIL return (_TAIL_CALL_##name)(frame, stack_pointer, tstate, this_instr, oparg); \
105		} while (0)
106		# endif
107		# define LABEL(name) TARGET(name)
108		#elif USE_COMPUTED_GOTOS
109	38.0G	# define TARGET(op) TARGET_##op:
110	38.3G	# define DISPATCH_GOTO() goto *opcode_targets[opcode]
111	43.5M	# define JUMP_TO_LABEL(name) goto name;
112	209M	# define JUMP_TO_PREDICTED(name) goto PREDICTED_##name;
113	308M	# define LABEL(name) name:
114		#else
115		# define TARGET(op) case op: TARGET_##op:
116		# define DISPATCH_GOTO() goto dispatch_opcode
117		# define JUMP_TO_LABEL(name) goto name;
118		# define JUMP_TO_PREDICTED(name) goto PREDICTED_##name;
119		# define LABEL(name) name:
120		#endif
121
122		/* PRE_DISPATCH_GOTO() does lltrace if enabled. Normally a no-op */
123		#ifdef Py_DEBUG
124		#define PRE_DISPATCH_GOTO() if (frame->lltrace >= 5) { \
125		lltrace_instruction(frame, stack_pointer, next_instr, opcode, oparg); }
126		#else
127	38.3G	#define PRE_DISPATCH_GOTO() ((void)0)
128		#endif
129
130		#ifdef Py_DEBUG
131		#define LLTRACE_RESUME_FRAME() \
132		do { \
133		_PyFrame_SetStackPointer(frame, stack_pointer); \
134		int lltrace = maybe_lltrace_resume_frame(frame, GLOBALS()); \
135		stack_pointer = _PyFrame_GetStackPointer(frame); \
136		frame->lltrace = lltrace; \
137		} while (0)
138		#else
139	1.26G	#define LLTRACE_RESUME_FRAME() ((void)0)
140		#endif
141
142		#ifdef Py_GIL_DISABLED
143		#define QSBR_QUIESCENT_STATE(tstate) _Py_qsbr_quiescent_state(((_PyThreadStateImpl *)tstate)->qsbr)
144		#else
145		#define QSBR_QUIESCENT_STATE(tstate)
146		#endif
147
148
149		/* Do interpreter dispatch accounting for tracing and instrumentation */
150		#define DISPATCH() \
151	38.3G	{ \
152	38.3G	assert(frame->stackpointer == NULL); \
153	38.3G	NEXTOPARG(); \
154	38.3G	PRE_DISPATCH_GOTO(); \
155	38.3G	DISPATCH_GOTO(); \
156	38.3G	}
157
158		#define DISPATCH_SAME_OPARG() \
159	4.37M	{ \
160	4.37M	opcode = next_instr->op.code; \
161	4.37M	PRE_DISPATCH_GOTO(); \
162	4.37M	DISPATCH_GOTO(); \
163	4.37M	}
164
165		#define DISPATCH_INLINED(NEW_FRAME) \
166	848k	do { \
167	848k	assert(tstate->interp->eval_frame == NULL); \
168	848k	_PyFrame_SetStackPointer(frame, stack_pointer); \
169	848k	assert((NEW_FRAME)->previous == frame); \
170	848k	frame = tstate->current_frame = (NEW_FRAME); \
171	848k	CALL_STAT_INC(inlined_py_calls); \
172	848k	JUMP_TO_LABEL(start_frame); \
173	0	} while (0)
174
175		/* Tuple access macros */
176
177		#ifndef Py_DEBUG
178	1.66G	#define GETITEM(v, i) PyTuple_GET_ITEM((v), (i))
179		#else
180		static inline PyObject *
181		GETITEM(PyObject *v, Py_ssize_t i) {
182		assert(PyTuple_Check(v));
183		assert(i >= 0);
184		assert(i < PyTuple_GET_SIZE(v));
185		return PyTuple_GET_ITEM(v, i);
186		}
187		#endif
188
189		/* Code access macros */
190
191		/* The integer overflow is checked by an assertion below. */
192	31.7M	#define INSTR_OFFSET() ((int)(next_instr - _PyFrame_GetBytecode(frame)))
193	38.3G	#define NEXTOPARG() do { \
194	38.3G	_Py_CODEUNIT word = {.cache = FT_ATOMIC_LOAD_UINT16_RELAXED(*(uint16_t*)next_instr)}; \
195	38.3G	opcode = word.op.code; \
196	38.3G	oparg = word.op.arg; \
197	38.3G	} while (0)
198
199		/* JUMPBY makes the generator identify the instruction as a jump. SKIP_OVER is
200		* for advancing to the next instruction, taking into account cache entries
201		* and skipped instructions.
202		*/
203	5.37G	#define JUMPBY(x) (next_instr += (x))
204	363M	#define SKIP_OVER(x) (next_instr += (x))
205
206		#define STACK_LEVEL() ((int)(stack_pointer - _PyFrame_Stackbase(frame)))
207		#define STACK_SIZE() (_PyFrame_GetCode(frame)->co_stacksize)
208
209		#define WITHIN_STACK_BOUNDS() \
210		(frame->owner == FRAME_OWNED_BY_INTERPRETER || (STACK_LEVEL() >= 0 && STACK_LEVEL() <= STACK_SIZE()))
211
212		/* Data access macros */
213		#define FRAME_CO_CONSTS (_PyFrame_GetCode(frame)->co_consts)
214		#define FRAME_CO_NAMES (_PyFrame_GetCode(frame)->co_names)
215
216		/* Local variable macros */
217
218	1.15M	#define LOCALS_ARRAY (frame->localsplus)
219	18.6G	#define GETLOCAL(i) (frame->localsplus[i])
220
221
222		#ifdef Py_STATS
223		#define UPDATE_MISS_STATS(INSTNAME) \
224		do { \
225		STAT_INC(opcode, miss); \
226		STAT_INC((INSTNAME), miss); \
227		/* The counter is always the first cache entry: */ \
228		if (ADAPTIVE_COUNTER_TRIGGERS(next_instr->cache)) { \
229		STAT_INC((INSTNAME), deopt); \
230		} \
231		} while (0)
232		#else
233	209M	#define UPDATE_MISS_STATS(INSTNAME) ((void)0)
234		#endif
235
236
237		// Try to lock an object in the free threading build, if it's not already
238		// locked. Use with a DEOPT_IF() to deopt if the object is already locked.
239		// These are no-ops in the default GIL build. The general pattern is:
240		//
241		// DEOPT_IF(!LOCK_OBJECT(op));
242		// if (/* condition fails */) {
243		// UNLOCK_OBJECT(op);
244		// DEOPT_IF(true);
245		// }
246		// ...
247		// UNLOCK_OBJECT(op);
248		//
249		// NOTE: The object must be unlocked on every exit code path and you should
250		// avoid any potentially escaping calls (like PyStackRef_CLOSE) while the
251		// object is locked.
252		#ifdef Py_GIL_DISABLED
253		# define LOCK_OBJECT(op) PyMutex_LockFast(&(_PyObject_CAST(op))->ob_mutex)
254		# define UNLOCK_OBJECT(op) PyMutex_Unlock(&(_PyObject_CAST(op))->ob_mutex)
255		#else
256	495M	# define LOCK_OBJECT(op) (1)
257	495M	# define UNLOCK_OBJECT(op) ((void)0)
258		#endif
259
260	695M	#define GLOBALS() frame->f_globals
261	376M	#define BUILTINS() frame->f_builtins
262	77.5k	#define LOCALS() frame->f_locals
263		#define CONSTS() _PyFrame_GetCode(frame)->co_consts
264		#define NAMES() _PyFrame_GetCode(frame)->co_names
265
266		#define DTRACE_FUNCTION_ENTRY() \
267		if (PyDTrace_FUNCTION_ENTRY_ENABLED()) { \
268		dtrace_function_entry(frame); \
269		}
270
271		/* This takes a uint16_t instead of a _Py_BackoffCounter,
272		* because it is used directly on the cache entry in generated code,
273		* which is always an integral type. */
274		#define ADAPTIVE_COUNTER_TRIGGERS(COUNTER) \
275	1.23G	backoff_counter_triggers(forge_backoff_counter((COUNTER)))
276
277		#define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \
278	1.23G	do { \
279	1.23G	(COUNTER) = advance_backoff_counter((COUNTER)); \
280	1.23G	} while (0);
281
282		#define PAUSE_ADAPTIVE_COUNTER(COUNTER) \
283	0	do { \
284	0	(COUNTER) = pause_backoff_counter((COUNTER)); \
285	0	} while (0);
286
287		#ifdef ENABLE_SPECIALIZATION_FT
288		/* Multiple threads may execute these concurrently if thread-local bytecode is
289		* disabled and they all execute the main copy of the bytecode. Specialization
290		* is disabled in that case so the value is unused, but the RMW cycle should be
291		* free of data races.
292		*/
293		#define RECORD_BRANCH_TAKEN(bitset, flag) \
294	2.68G	FT_ATOMIC_STORE_UINT16_RELAXED( \
295	2.68G	bitset, (FT_ATOMIC_LOAD_UINT16_RELAXED(bitset) << 1) | (flag))
296		#else
297		#define RECORD_BRANCH_TAKEN(bitset, flag)
298		#endif
299
300		#define UNBOUNDLOCAL_ERROR_MSG \
301	0	"cannot access local variable '%s' where it is not associated with a value"
302		#define UNBOUNDFREE_ERROR_MSG \
303	0	"cannot access free variable '%s' where it is not associated with a value" \
304	0	" in enclosing scope"
305	1	#define NAME_ERROR_MSG "name '%.200s' is not defined"
306
307		// If a trace function sets a new f_lineno and
308		// then raises, we use the destination when searching
309		// for an exception handler, displaying the traceback, and so on
310	0	#define INSTRUMENTED_JUMP(src, dest, event) \
311	0	do { \
312	0	if (tstate->tracing) {\
313	0	next_instr = dest; \
314	0	} else { \
315	0	_PyFrame_SetStackPointer(frame, stack_pointer); \
316	0	next_instr = _Py_call_instrumentation_jump(this_instr, tstate, event, frame, src, dest); \
317	0	stack_pointer = _PyFrame_GetStackPointer(frame); \
318	0	if (next_instr == NULL) { \
319	0	next_instr = (dest)+1; \
320	0	JUMP_TO_LABEL(error); \
321	0	} \
322	0	} \
323	0	} while (0);
324
325
326	234M	static inline int _Py_EnterRecursivePy(PyThreadState *tstate) {
327	234M	return (tstate->py_recursion_remaining-- <= 0) &&
328	234M	_Py_CheckRecursiveCallPy(tstate);
329	234M	}
330
331	643M	static inline void _Py_LeaveRecursiveCallPy(PyThreadState *tstate) {
332	643M	tstate->py_recursion_remaining++;
333	643M	}
334
335		/* Implementation of "macros" that modify the instruction pointer,
336		* stack pointer, or frame pointer.
337		* These need to treated differently by tier 1 and 2.
338		* The Tier 1 version is here; Tier 2 is inlined in ceval.c. */
339
340	1.04G	#define LOAD_IP(OFFSET) do { \
341	1.04G	next_instr = frame->instr_ptr + (OFFSET); \
342	1.04G	} while (0)
343
344		/* There's no STORE_IP(), it's inlined by the code generator. */
345
346	409M	#define LOAD_SP() \
347	409M	stack_pointer = _PyFrame_GetStackPointer(frame)
348
349		#define SAVE_SP() \
350		_PyFrame_SetStackPointer(frame, stack_pointer)
351
352		/* Tier-switching macros. */
353
354		#define TIER1_TO_TIER2(EXECUTOR) \
355		do { \
356		OPT_STAT_INC(traces_executed); \
357		next_instr = _Py_jit_entry((EXECUTOR), frame, stack_pointer, tstate); \
358		frame = tstate->current_frame; \
359		stack_pointer = _PyFrame_GetStackPointer(frame); \
360		if (next_instr == NULL) { \
361		next_instr = frame->instr_ptr; \
362		JUMP_TO_LABEL(error); \
363		} \
364		DISPATCH(); \
365		} while (0)
366
367		#define TIER2_TO_TIER2(EXECUTOR) \
368		do { \
369		OPT_STAT_INC(traces_executed); \
370		current_executor = (EXECUTOR); \
371		goto tier2_start; \
372		} while (0)
373
374		#define GOTO_TIER_ONE(TARGET) \
375		do \
376		{ \
377		tstate->current_executor = NULL; \
378		OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); \
379		_PyFrame_SetStackPointer(frame, stack_pointer); \
380		return TARGET; \
381		} while (0)
382
383		#define CURRENT_OPARG() (next_uop[-1].oparg)
384		#define CURRENT_OPERAND0() (next_uop[-1].operand0)
385		#define CURRENT_OPERAND1() (next_uop[-1].operand1)
386		#define CURRENT_TARGET() (next_uop[-1].target)
387
388		#define JUMP_TO_JUMP_TARGET() goto jump_to_jump_target
389		#define JUMP_TO_ERROR() goto jump_to_error_target
390
391		/* Stackref macros */
392
393		/* How much scratch space to give stackref to PyObject* conversion. */
394	1.83G	#define MAX_STACKREF_SCRATCH 10
395
396		#define STACKREFS_TO_PYOBJECTS(ARGS, ARG_COUNT, NAME) \
397		/* +1 because vectorcall might use -1 to write self */ \
398	1.83G	PyObject *NAME##_temp[MAX_STACKREF_SCRATCH+1]; \
399	1.83G	PyObject **NAME = _PyObjectArray_FromStackRefArray(ARGS, ARG_COUNT, NAME##_temp + 1);
400
401		#define STACKREFS_TO_PYOBJECTS_CLEANUP(NAME) \
402		/* +1 because we +1 previously */ \
403	1.83G	_PyObjectArray_Free(NAME - 1, NAME##_temp);
404
405	1.83G	#define CONVERSION_FAILED(NAME) ((NAME) == NULL)
406
407		static inline int
408	3.23G	check_periodics(PyThreadState *tstate) {
409	3.23G	_Py_CHECK_EMSCRIPTEN_SIGNALS_PERIODICALLY();
410	3.23G	QSBR_QUIESCENT_STATE(tstate);
411	3.23G	if (_Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & _PY_EVAL_EVENTS_MASK) {
412	58.3k	return _Py_HandlePending(tstate);
413	58.3k	}
414	3.23G	return 0;
415	3.23G	}
416