// Macros and other things needed by ceval.c and bytecodes.c

/* Computed GOTOs, or
       the-optimization-commonly-but-improperly-known-as-"threaded code"
   using gcc's labels-as-values extension
   (http://gcc.gnu.org/onlinedocs/gcc/Labels-as-Values.html).

   The traditional bytecode evaluation loop uses a "switch" statement, which
   decent compilers will optimize as a single indirect branch instruction
   combined with a lookup table of jump addresses. However, since the
   indirect jump instruction is shared by all opcodes, the CPU will have a
   hard time making the right prediction for where to jump next (actually,
   it will always be wrong except in the uncommon case of a sequence of
   several identical opcodes).

   "Threaded code", in contrast, uses an explicit jump table and an explicit
   indirect jump instruction at the end of each opcode. Since the jump
   instruction is at a different address for each opcode, the CPU will make a
   separate prediction for each of these instructions, which is equivalent to
   predicting the second opcode of each opcode pair. These predictions have
   a much better chance of turning out valid, especially in small bytecode loops.

   A mispredicted branch on a modern CPU flushes the whole pipeline and
   can cost several CPU cycles (depending on the pipeline depth),
   and potentially many more instructions (depending on the pipeline width).
   A correctly predicted branch, however, is nearly free.

   At the time of this writing, the "threaded code" version is up to 15-20%
   faster than the normal "switch" version, depending on the compiler and the
   CPU architecture.

   NOTE: care must be taken that the compiler doesn't try to "optimize" the
   indirect jumps by sharing them between all opcodes. Such optimizations
   can be disabled on gcc by using the -fno-gcse flag (or possibly
   -fno-crossjumping).
*/
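
/* Illustration only: the sketch below is NOT part of CPython; the names
   (tiny_run, OP_PUSH1, OP_ADD, OP_HALT, TINY_DISPATCH) are invented for the
   example, and it assumes gcc/clang labels-as-values support.  It shows the
   shape of a computed-goto dispatch loop for a toy three-opcode machine:
   each handler ends with its own "goto *targets[...]", giving the CPU one
   separately predictable indirect branch per opcode instead of a single
   shared one.

   enum { OP_PUSH1, OP_ADD, OP_HALT };

   static int
   tiny_run(const unsigned char *pc)
   {
       static const void *targets[] = { &&op_push1, &&op_add, &&op_halt };
       int stack[32], *sp = stack;
   #define TINY_DISPATCH() goto *targets[*pc++]
       TINY_DISPATCH();
   op_push1:               // push the constant 1
       *sp++ = 1;
       TINY_DISPATCH();
   op_add:                 // pop two values, push their sum
       sp--;
       sp[-1] += *sp;
       TINY_DISPATCH();
   op_halt:                // return the top of the stack
       return sp[-1];
   #undef TINY_DISPATCH
   }
*/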

/* Use macros rather than inline functions, to make it as clear as possible
 * to the C compiler that the tracing check is a simple test then branch.
 * We want to be sure that the compiler knows this before it generates
 * the CFG.
 */

#ifdef WITH_DTRACE
#define OR_DTRACE_LINE | (PyDTrace_LINE_ENABLED() ? 255 : 0)
#else
#define OR_DTRACE_LINE
#endif

#ifdef HAVE_COMPUTED_GOTOS
#ifndef USE_COMPUTED_GOTOS
#define USE_COMPUTED_GOTOS 1
#endif
#else
#if defined(USE_COMPUTED_GOTOS) && USE_COMPUTED_GOTOS
#error "Computed gotos are not supported on this compiler."
#endif
#undef USE_COMPUTED_GOTOS
#define USE_COMPUTED_GOTOS 0
#endif

#ifdef Py_STATS
#define INSTRUCTION_STATS(op) \
    do { \
        OPCODE_EXE_INC(op); \
        if (_Py_stats) _Py_stats->opcode_stats[lastopcode].pair_count[op]++; \
        lastopcode = op; \
    } while (0)
#else
#define INSTRUCTION_STATS(op) ((void)0)
#endif

#if USE_COMPUTED_GOTOS
#  define TARGET(op) TARGET_##op:
#  define DISPATCH_GOTO() goto *opcode_targets[opcode]
#else
#  define TARGET(op) case op: TARGET_##op:
#  define DISPATCH_GOTO() goto dispatch_opcode
#endif
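
/* For example (illustrative expansion): with computed gotos,
 * TARGET(LOAD_FAST) expands to the label "TARGET_LOAD_FAST:" and
 * DISPATCH_GOTO() to "goto *opcode_targets[opcode]"; without them,
 * TARGET(LOAD_FAST) expands to "case LOAD_FAST: TARGET_LOAD_FAST:" and
 * DISPATCH_GOTO() jumps back to the switch at the "dispatch_opcode" label. */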

/* PRE_DISPATCH_GOTO() does lltrace if enabled. Normally a no-op */
#ifdef LLTRACE
#define PRE_DISPATCH_GOTO() if (lltrace >= 5) { \
    lltrace_instruction(frame, stack_pointer, next_instr, opcode, oparg); }
#else
#define PRE_DISPATCH_GOTO() ((void)0)
#endif

#if LLTRACE
#define LLTRACE_RESUME_FRAME() \
    do { \
        lltrace = maybe_lltrace_resume_frame(frame, &entry_frame, GLOBALS()); \
        if (lltrace < 0) { \
            goto exit_unwind; \
        } \
    } while (0)
#else
#define LLTRACE_RESUME_FRAME() ((void)0)
#endif

#ifdef Py_GIL_DISABLED
#define QSBR_QUIESCENT_STATE(tstate) _Py_qsbr_quiescent_state(((_PyThreadStateImpl *)tstate)->qsbr)
#else
#define QSBR_QUIESCENT_STATE(tstate)
#endif


/* Do interpreter dispatch accounting for tracing and instrumentation */
#define DISPATCH() \
    { \
        NEXTOPARG(); \
        PRE_DISPATCH_GOTO(); \
        DISPATCH_GOTO(); \
    }

#define DISPATCH_SAME_OPARG() \
    { \
        opcode = next_instr->op.code; \
        PRE_DISPATCH_GOTO(); \
        DISPATCH_GOTO(); \
    }
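
/* DISPATCH_SAME_OPARG() reloads only the opcode from next_instr and keeps the
 * current oparg, unlike DISPATCH(), which reloads both via NEXTOPARG().  It is
 * used right after an adaptive instruction has (possibly) rewritten its own
 * opcode to a specialized form, so the rewritten opcode runs with the oparg
 * that was already decoded. */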

#define DISPATCH_INLINED(NEW_FRAME) \
    do { \
        assert(tstate->interp->eval_frame == NULL); \
        _PyFrame_SetStackPointer(frame, stack_pointer); \
        (NEW_FRAME)->previous = frame; \
        frame = tstate->current_frame = (NEW_FRAME); \
        CALL_STAT_INC(inlined_py_calls); \
        goto start_frame; \
    } while (0)

// Use this instead of 'goto error' so Tier 2 can go to a different label
#define GOTO_ERROR(LABEL) goto LABEL

#define CHECK_EVAL_BREAKER() \
    _Py_CHECK_EMSCRIPTEN_SIGNALS_PERIODICALLY(); \
    QSBR_QUIESCENT_STATE(tstate); \
    if (_Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & _PY_EVAL_EVENTS_MASK) { \
        if (_Py_HandlePending(tstate) != 0) { \
            GOTO_ERROR(error); \
        } \
    }


/* Tuple access macros */

#ifndef Py_DEBUG
#define GETITEM(v, i) PyTuple_GET_ITEM((v), (i))
#else
static inline PyObject *
GETITEM(PyObject *v, Py_ssize_t i) {
    assert(PyTuple_Check(v));
    assert(i >= 0);
    assert(i < PyTuple_GET_SIZE(v));
    return PyTuple_GET_ITEM(v, i);
}
#endif

/* Code access macros */

/* The integer overflow is checked by an assertion below. */
#define INSTR_OFFSET() ((int)(next_instr - _PyCode_CODE(_PyFrame_GetCode(frame))))
#define NEXTOPARG() do { \
        _Py_CODEUNIT word = {.cache = FT_ATOMIC_LOAD_UINT16_RELAXED(*(uint16_t*)next_instr)}; \
        opcode = word.op.code; \
        oparg = word.op.arg; \
    } while (0)

/* JUMPBY makes the generator identify the instruction as a jump. SKIP_OVER is
 * for advancing to the next instruction, taking into account cache entries
 * and skipped instructions.
 */
#define JUMPBY(x) (next_instr += (x))
#define SKIP_OVER(x) (next_instr += (x))
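
/* Illustrative only (not lifted from bytecodes.c): a backward-jump handler
 * might do "JUMPBY(-oparg);", while a handler with inline cache entries might
 * use "SKIP_OVER(INLINE_CACHE_ENTRIES_LOAD_ATTR);" to step over its caches
 * before dispatching. */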

/* OpCode prediction macros
    Some opcodes tend to come in pairs, making it possible to
    predict the second opcode when the first is run.  For example,
    COMPARE_OP is often followed by POP_JUMP_IF_FALSE or POP_JUMP_IF_TRUE.

    Verifying the prediction costs a single high-speed test of a register
    variable against a constant.  If the pairing was good, then the
    processor's own internal branch prediction has a high likelihood of
    success, resulting in a nearly zero-overhead transition to the
    next opcode.  A successful prediction saves a trip through the eval-loop
    including its unpredictable switch-case branch.  Combined with the
    processor's internal branch prediction, a successful PREDICT has the
    effect of making the two opcodes run as if they were a single new opcode
    with the bodies combined.

    If collecting opcode statistics, your choices are to either keep the
    predictions turned on and interpret the results as if some opcodes
    had been combined, or turn predictions off so that the opcode frequency
    counters are updated for both opcodes.

    Opcode prediction is disabled with threaded code, since the latter allows
    the CPU to record separate branch prediction information for each
    opcode.

*/

#define PREDICT_ID(op) PRED_##op
#define PREDICTED(op) PREDICT_ID(op):
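
/* For example, PREDICT_ID(COMPARE_OP) expands to the label name
 * PRED_COMPARE_OP, and PREDICTED(COMPARE_OP) places "PRED_COMPARE_OP:" at the
 * top of that instruction's body; GO_TO_INSTRUCTION(), defined below, jumps to
 * such a label when a specialized variant has to fall back to the generic
 * instruction. */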


/* Stack manipulation macros */

/* The stack can grow at most INT_MAX deep, as co_nlocals and
   co_stacksize are ints. */
#define STACK_LEVEL() ((int)(stack_pointer - _PyFrame_Stackbase(frame)))
#define STACK_SIZE() (_PyFrame_GetCode(frame)->co_stacksize)
#define EMPTY() (STACK_LEVEL() == 0)
#define TOP() (stack_pointer[-1])
#define SECOND() (stack_pointer[-2])
#define THIRD() (stack_pointer[-3])
#define FOURTH() (stack_pointer[-4])
#define PEEK(n) (stack_pointer[-(n)])
#define POKE(n, v) (stack_pointer[-(n)] = (v))
#define SET_TOP(v) (stack_pointer[-1] = (v))
#define SET_SECOND(v) (stack_pointer[-2] = (v))
#define BASIC_STACKADJ(n) (stack_pointer += n)
#define BASIC_PUSH(v) (*stack_pointer++ = (v))
#define BASIC_POP() (*--stack_pointer)

#ifdef Py_DEBUG
#define PUSH(v) do { \
        BASIC_PUSH(v); \
        assert(STACK_LEVEL() <= STACK_SIZE()); \
    } while (0)
#define POP() (assert(STACK_LEVEL() > 0), BASIC_POP())
#define STACK_GROW(n) do { \
        assert(n >= 0); \
        BASIC_STACKADJ(n); \
        assert(STACK_LEVEL() <= STACK_SIZE()); \
    } while (0)
#define STACK_SHRINK(n) do { \
        assert(n >= 0); \
        assert(STACK_LEVEL() >= n); \
        BASIC_STACKADJ(-(n)); \
    } while (0)
#else
#define PUSH(v) BASIC_PUSH(v)
#define POP() BASIC_POP()
#define STACK_GROW(n) BASIC_STACKADJ(n)
#define STACK_SHRINK(n) BASIC_STACKADJ(-(n))
#endif
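
/* Hypothetical usage sketch (SOME_BINARY_OP is an invented helper; the real
 * handlers in bytecodes.c are generated and structured differently): an
 * instruction that replaces the two top stack items with one result could be
 * written as
 *
 *     PyObject *right = POP();
 *     PyObject *left = POP();
 *     PyObject *res = SOME_BINARY_OP(left, right);  // returns a new reference
 *     Py_DECREF(left);
 *     Py_DECREF(right);
 *     if (res == NULL) {
 *         goto error;
 *     }
 *     PUSH(res);
 *     DISPATCH();
 */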


/* Data access macros */
#define FRAME_CO_CONSTS (_PyFrame_GetCode(frame)->co_consts)
#define FRAME_CO_NAMES (_PyFrame_GetCode(frame)->co_names)

/* Local variable macros */

#define LOCALS_ARRAY (frame->localsplus)
#define GETLOCAL(i) (frame->localsplus[i])

/* The SETLOCAL() macro must not DECREF the local variable in-place and
   then store the new value; it must copy the old value to a temporary
   value, then store the new value, and then DECREF the temporary value.
   This is because it is possible that during the DECREF the frame is
   accessed by other code (e.g. a __del__ method or gc.collect()) and the
   variable would be pointing to already-freed memory. */
#define SETLOCAL(i, value) do { PyObject *tmp = GETLOCAL(i); \
                                GETLOCAL(i) = value; \
                                Py_XDECREF(tmp); } while (0)
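
/* In other words, the naive version
 *
 *     Py_XDECREF(GETLOCAL(i));   // may run arbitrary Python code...
 *     GETLOCAL(i) = value;       // ...while the slot still holds a dangling pointer
 *
 * is unsafe, which is why SETLOCAL() stores the new value before decrefing
 * the old one. */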

#define GO_TO_INSTRUCTION(op) goto PREDICT_ID(op)

#ifdef Py_STATS
#define UPDATE_MISS_STATS(INSTNAME) \
    do { \
        STAT_INC(opcode, miss); \
        STAT_INC((INSTNAME), miss); \
        /* The counter is always the first cache entry: */ \
        if (ADAPTIVE_COUNTER_TRIGGERS(next_instr->cache)) { \
            STAT_INC((INSTNAME), deopt); \
        } \
    } while (0)
#else
#define UPDATE_MISS_STATS(INSTNAME) ((void)0)
#endif

#define DEOPT_IF(COND, INSTNAME) \
    if ((COND)) { \
        /* This is only a single jump on release builds! */ \
        UPDATE_MISS_STATS((INSTNAME)); \
        assert(_PyOpcode_Deopt[opcode] == (INSTNAME)); \
        GO_TO_INSTRUCTION(INSTNAME); \
    }
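
/* Hypothetical usage sketch (the real body of BINARY_OP_ADD_INT is generated
 * from bytecodes.c; this only shows the shape): a specialized instruction
 * guards its assumptions with DEOPT_IF and, when a guard fails, jumps back to
 * the PREDICTED() label of the generic instruction via GO_TO_INSTRUCTION():
 *
 *     TARGET(BINARY_OP_ADD_INT) {
 *         DEOPT_IF(!PyLong_CheckExact(PEEK(1)), BINARY_OP);
 *         DEOPT_IF(!PyLong_CheckExact(PEEK(2)), BINARY_OP);
 *         ... fast path for int + int ...
 *         DISPATCH();
 *     }
 */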


#define GLOBALS() frame->f_globals
#define BUILTINS() frame->f_builtins
#define LOCALS() frame->f_locals
#define CONSTS() _PyFrame_GetCode(frame)->co_consts
#define NAMES() _PyFrame_GetCode(frame)->co_names

#define DTRACE_FUNCTION_ENTRY() \
    if (PyDTrace_FUNCTION_ENTRY_ENABLED()) { \
        dtrace_function_entry(frame); \
    }

/* This takes a uint16_t instead of a _Py_BackoffCounter,
 * because it is used directly on the cache entry in generated code,
 * which is always an integral type. */
#define ADAPTIVE_COUNTER_TRIGGERS(COUNTER) \
    backoff_counter_triggers(forge_backoff_counter((COUNTER)))

#ifdef Py_GIL_DISABLED
#define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \
    do { \
        /* gh-115999 tracks progress on addressing this. */ \
        static_assert(0, "The specializing interpreter is not yet thread-safe"); \
    } while (0);
#define PAUSE_ADAPTIVE_COUNTER(COUNTER) ((void)COUNTER)
#else
#define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \
    do { \
        (COUNTER) = advance_backoff_counter((COUNTER)); \
    } while (0);

#define PAUSE_ADAPTIVE_COUNTER(COUNTER) \
    do { \
        (COUNTER) = pause_backoff_counter((COUNTER)); \
    } while (0);
#endif
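
/* Hypothetical usage sketch (the adaptive handlers are generated from
 * bytecodes.c; field and helper names here are only indicative): an adaptive
 * instruction checks its backoff counter, specializes when it triggers, and
 * otherwise advances the counter:
 *
 *     uint16_t counter = this_instr[1].cache;
 *     if (ADAPTIVE_COUNTER_TRIGGERS(counter)) {
 *         next_instr = this_instr;
 *         _Py_Specialize_BinaryOp(...);     // may rewrite the opcode in place
 *         DISPATCH_SAME_OPARG();            // re-dispatch to the rewritten opcode
 *     }
 *     STAT_INC(BINARY_OP, deferred);
 *     ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter);
 */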

#define UNBOUNDLOCAL_ERROR_MSG \
    "cannot access local variable '%s' where it is not associated with a value"
#define UNBOUNDFREE_ERROR_MSG \
    "cannot access free variable '%s' where it is not associated with a value" \
    " in enclosing scope"
#define NAME_ERROR_MSG "name '%.200s' is not defined"

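/* If one of the two float operands is about to die (refcount 1), reuse its
 * PyFloatObject in place for the result instead of allocating a new one;
 * otherwise fall back to PyFloat_FromDouble(). */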
#define DECREF_INPUTS_AND_REUSE_FLOAT(left, right, dval, result) \
    do { \
        if (Py_REFCNT(left) == 1) { \
            ((PyFloatObject *)left)->ob_fval = (dval); \
            _Py_DECREF_SPECIALIZED(right, _PyFloat_ExactDealloc); \
            result = (left); \
        } \
        else if (Py_REFCNT(right) == 1) { \
            ((PyFloatObject *)right)->ob_fval = (dval); \
            _Py_DECREF_NO_DEALLOC(left); \
            result = (right); \
        } \
        else { \
            result = PyFloat_FromDouble(dval); \
            if ((result) == NULL) GOTO_ERROR(error); \
            _Py_DECREF_NO_DEALLOC(left); \
            _Py_DECREF_NO_DEALLOC(right); \
        } \
    } while (0)

// If a trace function sets a new f_lineno and *then* raises, we use the
// destination when searching for an exception handler, displaying the
// traceback, and so on.
#define INSTRUMENTED_JUMP(src, dest, event) \
    do { \
        if (tstate->tracing) { \
            next_instr = dest; \
        } else { \
            _PyFrame_SetStackPointer(frame, stack_pointer); \
            next_instr = _Py_call_instrumentation_jump(tstate, event, frame, src, dest); \
            stack_pointer = _PyFrame_GetStackPointer(frame); \
            if (next_instr == NULL) { \
                next_instr = (dest)+1; \
                goto error; \
            } \
        } \
    } while (0);


// GH-89279: Force inlining by using a macro.
#if defined(_MSC_VER) && SIZEOF_INT == 4
#define _Py_atomic_load_relaxed_int32(ATOMIC_VAL) (assert(sizeof((ATOMIC_VAL)->_value) == 4), *((volatile int*)&((ATOMIC_VAL)->_value)))
#else
#define _Py_atomic_load_relaxed_int32(ATOMIC_VAL) _Py_atomic_load_relaxed(ATOMIC_VAL)
#endif

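/* Python-level recursion accounting: _Py_EnterRecursivePy() returns non-zero
 * when py_recursion_remaining is exhausted and _Py_CheckRecursiveCallPy()
 * reports an error; _Py_LeaveRecursiveCallPy() restores the budget on exit. */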
static inline int _Py_EnterRecursivePy(PyThreadState *tstate) {
    return (tstate->py_recursion_remaining-- <= 0) &&
        _Py_CheckRecursiveCallPy(tstate);
}

static inline void _Py_LeaveRecursiveCallPy(PyThreadState *tstate) {
    tstate->py_recursion_remaining++;
}

/* Implementation of "macros" that modify the instruction pointer,
 * stack pointer, or frame pointer.
 * These need to be treated differently by tier 1 and tier 2.
 * The Tier 1 version is here; Tier 2 is inlined in ceval.c. */

#define LOAD_IP(OFFSET) do { \
        next_instr = frame->instr_ptr + (OFFSET); \
    } while (0)

/* There's no STORE_IP(), it's inlined by the code generator. */

#define LOAD_SP() \
    stack_pointer = _PyFrame_GetStackPointer(frame);

/* Tier-switching macros. */

#ifdef _Py_JIT
#define GOTO_TIER_TWO(EXECUTOR) \
    do { \
        OPT_STAT_INC(traces_executed); \
        jit_func jitted = (EXECUTOR)->jit_code; \
        next_instr = jitted(frame, stack_pointer, tstate); \
        Py_DECREF(tstate->previous_executor); \
        tstate->previous_executor = NULL; \
        frame = tstate->current_frame; \
        if (next_instr == NULL) { \
            goto resume_with_error; \
        } \
        stack_pointer = _PyFrame_GetStackPointer(frame); \
        DISPATCH(); \
    } while (0)
#else
#define GOTO_TIER_TWO(EXECUTOR) \
    do { \
        OPT_STAT_INC(traces_executed); \
        next_uop = (EXECUTOR)->trace; \
        assert(next_uop->opcode == _START_EXECUTOR || next_uop->opcode == _COLD_EXIT); \
        goto enter_tier_two; \
    } while (0)
#endif

#define GOTO_TIER_ONE(TARGET) \
    do { \
        Py_DECREF(tstate->previous_executor); \
        tstate->previous_executor = NULL; \
        next_instr = (TARGET); \
        DISPATCH(); \
    } while (0)

#define CURRENT_OPARG() (next_uop[-1].oparg)

#define CURRENT_OPERAND() (next_uop[-1].operand)

#define JUMP_TO_JUMP_TARGET() goto jump_to_jump_target
#define JUMP_TO_ERROR() goto jump_to_error_target
#define GOTO_UNWIND() goto error_tier_two
#define EXIT_TO_TRACE() goto exit_to_trace
#define EXIT_TO_TIER1() goto exit_to_tier1
#define EXIT_TO_TIER1_DYNAMIC() goto exit_to_tier1_dynamic;
