// Macros and other things needed by ceval.c, and bytecodes.c

/* Computed GOTOs, or
       the-optimization-commonly-but-improperly-known-as-"threaded code"
   using gcc's labels-as-values extension
   (http://gcc.gnu.org/onlinedocs/gcc/Labels-as-Values.html).

   The traditional bytecode evaluation loop uses a "switch" statement, which
   decent compilers will optimize as a single indirect branch instruction
   combined with a lookup table of jump addresses. However, since the
   indirect jump instruction is shared by all opcodes, the CPU will have a
   hard time making the right prediction for where to jump next (actually,
   it will always be wrong except in the uncommon case of a sequence of
   several identical opcodes).

   "Threaded code", in contrast, uses an explicit jump table and an explicit
   indirect jump instruction at the end of each opcode. Since the jump
   instruction is at a different address for each opcode, the CPU will make a
   separate prediction for each of these instructions, which is equivalent to
   predicting the second opcode of each opcode pair. These predictions have
   a much better chance of turning out valid, especially in small bytecode loops.
22 
23    A mispredicted branch on a modern CPU flushes the whole pipeline and
24    can cost several CPU cycles (depending on the pipeline depth),
25    and potentially many more instructions (depending on the pipeline width).
26    A correctly predicted branch, however, is nearly free.
27 
28    At the time of this writing, the "threaded code" version is up to 15-20%
29    faster than the normal "switch" version, depending on the compiler and the
30    CPU architecture.
31 
32    NOTE: care must be taken that the compiler doesn't try to "optimize" the
33    indirect jumps by sharing them between all opcodes. Such optimizations
34    can be disabled on gcc by using the -fno-gcse flag (or possibly
35    -fno-crossjumping).
36 */
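
/* Illustrative sketch only (not part of the interpreter): a toy dispatch loop
   using the labels-as-values technique described above, for a made-up
   three-opcode machine.  The real loop in ceval.c is generated from
   bytecodes.c and built out of the TARGET()/DISPATCH() macros below.

       static int run(const unsigned char *code) {
           static void *targets[] = { &&op_inc, &&op_double, &&op_halt };
           int acc = 0;
           const unsigned char *ip = code;
           goto *targets[*ip++];          // initial dispatch
       op_inc:
           acc += 1;
           goto *targets[*ip++];          // each opcode ends in its own jump
       op_double:
           acc *= 2;
           goto *targets[*ip++];
       op_halt:
           return acc;
       }

   Because every opcode body ends with its own "goto *targets[...]", the CPU
   can keep a separate indirect-branch prediction per opcode, which is exactly
   the benefit described in the comment above. */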

/* Use macros rather than inline functions, to make it as clear as possible
 * to the C compiler that the tracing check is a simple test then branch.
 * We want to be sure that the compiler knows this before it generates
 * the CFG.
 */

#ifdef WITH_DTRACE
#define OR_DTRACE_LINE | (PyDTrace_LINE_ENABLED() ? 255 : 0)
#else
#define OR_DTRACE_LINE
#endif

#ifdef HAVE_COMPUTED_GOTOS
    #ifndef USE_COMPUTED_GOTOS
    #define USE_COMPUTED_GOTOS 1
    #endif
#else
    #if defined(USE_COMPUTED_GOTOS) && USE_COMPUTED_GOTOS
    #error "Computed gotos are not supported on this compiler."
    #endif
    #undef USE_COMPUTED_GOTOS
    #define USE_COMPUTED_GOTOS 0
#endif

#ifdef Py_STATS
#define INSTRUCTION_STATS(op) \
    do { \
        OPCODE_EXE_INC(op); \
        if (_Py_stats) _Py_stats->opcode_stats[lastopcode].pair_count[op]++; \
        lastopcode = op; \
    } while (0)
#else
#define INSTRUCTION_STATS(op) ((void)0)
#endif

#if USE_COMPUTED_GOTOS
#  define TARGET(op) TARGET_##op:
#  define DISPATCH_GOTO() goto *opcode_targets[opcode]
#else
#  define TARGET(op) case op: TARGET_##op:
#  define DISPATCH_GOTO() goto dispatch_opcode
#endif
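
/* For reference, TARGET() and DISPATCH_GOTO() expand roughly as follows.
   With computed gotos, an instruction body looks like

       TARGET_LOAD_FAST:                    // TARGET(LOAD_FAST)
           ... instruction body ...
           goto *opcode_targets[opcode];    // DISPATCH_GOTO()

   and without them it is an ordinary switch case that jumps back to a
   shared dispatch label:

       case LOAD_FAST: TARGET_LOAD_FAST:    // TARGET(LOAD_FAST)
           ... instruction body ...
           goto dispatch_opcode;            // DISPATCH_GOTO()

   (Sketch only; the real bodies are generated from bytecodes.c and go
   through the DISPATCH() macro defined below.) */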

/* PRE_DISPATCH_GOTO() does lltrace if enabled. Normally a no-op */
#ifdef LLTRACE
#define PRE_DISPATCH_GOTO() if (lltrace >= 5) { \
    lltrace_instruction(frame, stack_pointer, next_instr, opcode, oparg); }
#else
#define PRE_DISPATCH_GOTO() ((void)0)
#endif

#if LLTRACE
#define LLTRACE_RESUME_FRAME() \
do { \
    lltrace = maybe_lltrace_resume_frame(frame, &entry_frame, GLOBALS()); \
    if (lltrace < 0) { \
        goto exit_unwind; \
    } \
} while (0)
#else
#define LLTRACE_RESUME_FRAME() ((void)0)
#endif

#ifdef Py_GIL_DISABLED
#define QSBR_QUIESCENT_STATE(tstate) _Py_qsbr_quiescent_state(((_PyThreadStateImpl *)tstate)->qsbr)
#else
#define QSBR_QUIESCENT_STATE(tstate)
#endif


/* Do interpreter dispatch accounting for tracing and instrumentation */
#define DISPATCH() \
    { \
        NEXTOPARG(); \
        PRE_DISPATCH_GOTO(); \
        DISPATCH_GOTO(); \
    }

#define DISPATCH_SAME_OPARG() \
    { \
        opcode = next_instr->op.code; \
        PRE_DISPATCH_GOTO(); \
        DISPATCH_GOTO(); \
    }
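
/* Descriptive note: instruction bodies normally end in DISPATCH(), which
   decodes the next code unit (opcode and oparg) and jumps to its handler.
   DISPATCH_SAME_OPARG() reloads only the opcode and keeps the oparg already
   in hand; it is what an adaptive instruction uses right after it has
   (re)specialized itself in place, so the freshly written opcode runs with
   the unchanged oparg.  (The call sites are generated from bytecodes.c.) */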

#define DISPATCH_INLINED(NEW_FRAME)                     \
    do {                                                \
        assert(tstate->interp->eval_frame == NULL);     \
        _PyFrame_SetStackPointer(frame, stack_pointer); \
        (NEW_FRAME)->previous = frame;                  \
        frame = tstate->current_frame = (NEW_FRAME);    \
        CALL_STAT_INC(inlined_py_calls);                \
        goto start_frame;                               \
    } while (0)

// Use this instead of 'goto error' so Tier 2 can go to a different label
#define GOTO_ERROR(LABEL) goto LABEL

#define CHECK_EVAL_BREAKER() \
    _Py_CHECK_EMSCRIPTEN_SIGNALS_PERIODICALLY(); \
    QSBR_QUIESCENT_STATE(tstate); \
    if (_Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & _PY_EVAL_EVENTS_MASK) { \
        if (_Py_HandlePending(tstate) != 0) { \
            GOTO_ERROR(error); \
        } \
    }


/* Tuple access macros */

#ifndef Py_DEBUG
#define GETITEM(v, i) PyTuple_GET_ITEM((v), (i))
#else
static inline PyObject *
GETITEM(PyObject *v, Py_ssize_t i) {
    assert(PyTuple_Check(v));
    assert(i >= 0);
    assert(i < PyTuple_GET_SIZE(v));
    return PyTuple_GET_ITEM(v, i);
}
#endif

/* Code access macros */

/* The integer overflow is checked by an assertion below. */
#define INSTR_OFFSET() ((int)(next_instr - _PyCode_CODE(_PyFrame_GetCode(frame))))
#define NEXTOPARG()  do { \
        _Py_CODEUNIT word  = {.cache = FT_ATOMIC_LOAD_UINT16_RELAXED(*(uint16_t*)next_instr)}; \
        opcode = word.op.code; \
        oparg = word.op.arg; \
    } while (0)
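
/* Descriptive note: a _Py_CODEUNIT is a 16-bit union viewed either as a raw
   .cache value or as an 8-bit .op.code plus an 8-bit .op.arg.  NEXTOPARG()
   therefore fetches the whole unit with one (relaxed, free-threading-safe)
   16-bit load and decodes both halves; it does not advance next_instr, which
   is left to the generated instruction code. */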

/* JUMPBY makes the generator identify the instruction as a jump. SKIP_OVER is
 * for advancing to the next instruction, taking into account cache entries
 * and skipped instructions.
 */
#define JUMPBY(x)       (next_instr += (x))
#define SKIP_OVER(x)    (next_instr += (x))
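
/* The two expansions are intentionally identical; only the name is
   significant to the case generator.  As a purely illustrative example, a
   jump instruction would use JUMPBY(oparg) while an instruction that needs to
   step over its inline cache entries would use SKIP_OVER(n). */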

/* OpCode prediction macros
    Some opcodes tend to come in pairs, thus making it possible to
    predict the second code when the first is run.  For example,
    COMPARE_OP is often followed by POP_JUMP_IF_FALSE or POP_JUMP_IF_TRUE.

    Verifying the prediction costs a single high-speed test of a register
    variable against a constant.  If the pairing was good, then the
    processor's own internal branch prediction has a high likelihood of
    success, resulting in a nearly zero-overhead transition to the
    next opcode.  A successful prediction saves a trip through the eval-loop
    including its unpredictable switch-case branch.  Combined with the
    processor's internal branch prediction, a successful PREDICT has the
    effect of making the two opcodes run as if they were a single new opcode
    with the bodies combined.

    If collecting opcode statistics, your choices are to either keep the
    predictions turned on and interpret the results as if some opcodes
    had been combined, or to turn off predictions so that the opcode
    frequency counters are updated for both opcodes.

    Opcode prediction is disabled with threaded code, since the latter allows
    the CPU to record separate branch prediction information for each
    opcode.

*/

#define PREDICT_ID(op)          PRED_##op
#define PREDICTED(op)           PREDICT_ID(op):
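
/* Sketch of intended use (the real occurrences are generated): the target
   instruction is prefixed with PREDICTED(POP_JUMP_IF_FALSE), which expands to
   the label PRED_POP_JUMP_IF_FALSE:, and a predecessor such as COMPARE_OP can
   transfer control to it directly with GO_TO_INSTRUCTION(POP_JUMP_IF_FALSE)
   (defined below as "goto PREDICT_ID(op)"), skipping the normal dispatch. */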


/* Stack manipulation macros */

/* The stack can grow at most MAXINT deep, as co_nlocals and
   co_stacksize are ints. */
#define STACK_LEVEL()     ((int)(stack_pointer - _PyFrame_Stackbase(frame)))
#define STACK_SIZE()      (_PyFrame_GetCode(frame)->co_stacksize)
#define EMPTY()           (STACK_LEVEL() == 0)
#define TOP()             (stack_pointer[-1])
#define SECOND()          (stack_pointer[-2])
#define THIRD()           (stack_pointer[-3])
#define FOURTH()          (stack_pointer[-4])
#define PEEK(n)           (stack_pointer[-(n)])
#define POKE(n, v)        (stack_pointer[-(n)] = (v))
#define SET_TOP(v)        (stack_pointer[-1] = (v))
#define SET_SECOND(v)     (stack_pointer[-2] = (v))
#define BASIC_STACKADJ(n) (stack_pointer += n)
#define BASIC_PUSH(v)     (*stack_pointer++ = (v))
#define BASIC_POP()       (*--stack_pointer)

#ifdef Py_DEBUG
#define PUSH(v)         do { \
                            BASIC_PUSH(v); \
                            assert(STACK_LEVEL() <= STACK_SIZE()); \
                        } while (0)
#define POP()           (assert(STACK_LEVEL() > 0), BASIC_POP())
#define STACK_GROW(n)   do { \
                            assert(n >= 0); \
                            BASIC_STACKADJ(n); \
                            assert(STACK_LEVEL() <= STACK_SIZE()); \
                        } while (0)
#define STACK_SHRINK(n) do { \
                            assert(n >= 0); \
                            assert(STACK_LEVEL() >= n); \
                            BASIC_STACKADJ(-(n)); \
                        } while (0)
#else
#define PUSH(v)                BASIC_PUSH(v)
#define POP()                  BASIC_POP()
#define STACK_GROW(n)          BASIC_STACKADJ(n)
#define STACK_SHRINK(n)        BASIC_STACKADJ(-(n))
#endif
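
/* Illustrative only: hand-written against these macros, a binary operation
   with error handling omitted would look roughly like

       PyObject *right = POP();
       PyObject *left = TOP();
       PyObject *res = SomeBinaryOp(left, right);   // hypothetical helper
       Py_DECREF(left);
       Py_DECREF(right);
       SET_TOP(res);

   i.e. a net stack effect of -1; in Py_DEBUG builds every adjustment above
   is additionally bounds-checked. */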


/* Data access macros */
#define FRAME_CO_CONSTS (_PyFrame_GetCode(frame)->co_consts)
#define FRAME_CO_NAMES  (_PyFrame_GetCode(frame)->co_names)

/* Local variable macros */

#define LOCALS_ARRAY    (frame->localsplus)
#define GETLOCAL(i)     (frame->localsplus[i])

/* The SETLOCAL() macro must not DECREF the local variable in-place and
   then store the new value; it must copy the old value to a temporary
   value, then store the new value, and then DECREF the temporary value.
   This is because it is possible that during the DECREF the frame is
   accessed by other code (e.g. a __del__ method or gc.collect()) and the
   variable would be pointing to already-freed memory. */
#define SETLOCAL(i, value)      do { PyObject *tmp = GETLOCAL(i); \
                                     GETLOCAL(i) = value; \
                                     Py_XDECREF(tmp); } while (0)
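
/* For illustration, the unsafe ordering the comment above warns against
   would be

       Py_XDECREF(GETLOCAL(i));   // may run __del__ or trigger a GC pass
       GETLOCAL(i) = value;       // too late: the frame was briefly visible
                                  // holding a pointer to freed memory

   whereas SETLOCAL() stores the new value first and only then drops the old
   reference, so anything triggered by the DECREF sees a consistent local. */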

#define GO_TO_INSTRUCTION(op) goto PREDICT_ID(op)

#ifdef Py_STATS
#define UPDATE_MISS_STATS(INSTNAME)                              \
    do {                                                         \
        STAT_INC(opcode, miss);                                  \
        STAT_INC((INSTNAME), miss);                              \
        /* The counter is always the first cache entry: */       \
        if (ADAPTIVE_COUNTER_TRIGGERS(next_instr->cache)) {      \
            STAT_INC((INSTNAME), deopt);                         \
        }                                                        \
    } while (0)
#else
#define UPDATE_MISS_STATS(INSTNAME) ((void)0)
#endif

#define DEOPT_IF(COND, INSTNAME)                            \
    if ((COND)) {                                           \
        /* This is only a single jump on release builds! */ \
        UPDATE_MISS_STATS((INSTNAME));                      \
        assert(_PyOpcode_Deopt[opcode] == (INSTNAME));      \
        GO_TO_INSTRUCTION(INSTNAME);                        \
    }
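
/* Typical (generated) use, shown only as a sketch: a specialized instruction
   guards its assumptions and, if a guard fails, falls back to the generic
   adaptive instruction it was specialized from, e.g.

       DEOPT_IF(!PyLong_CheckExact(left), BINARY_OP);

   which on release builds is just a conditional jump to the
   PREDICTED(BINARY_OP) label of the unspecialized handler. */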


#define GLOBALS() frame->f_globals
#define BUILTINS() frame->f_builtins
#define LOCALS() frame->f_locals
#define CONSTS() _PyFrame_GetCode(frame)->co_consts
#define NAMES() _PyFrame_GetCode(frame)->co_names

#define DTRACE_FUNCTION_ENTRY()  \
    if (PyDTrace_FUNCTION_ENTRY_ENABLED()) { \
        dtrace_function_entry(frame); \
    }

/* This takes a uint16_t instead of a _Py_BackoffCounter,
 * because it is used directly on the cache entry in generated code,
 * which is always an integral type. */
#define ADAPTIVE_COUNTER_TRIGGERS(COUNTER) \
    backoff_counter_triggers(forge_backoff_counter((COUNTER)))

#ifdef Py_GIL_DISABLED
#define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \
    do { \
        /* gh-115999 tracks progress on addressing this. */ \
        static_assert(0, "The specializing interpreter is not yet thread-safe"); \
    } while (0);
#define PAUSE_ADAPTIVE_COUNTER(COUNTER) ((void)COUNTER)
#else
#define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \
    do { \
        (COUNTER) = advance_backoff_counter((COUNTER)); \
    } while (0);

#define PAUSE_ADAPTIVE_COUNTER(COUNTER) \
    do { \
        (COUNTER) = pause_backoff_counter((COUNTER)); \
    } while (0);
#endif

#define UNBOUNDLOCAL_ERROR_MSG \
    "cannot access local variable '%s' where it is not associated with a value"
#define UNBOUNDFREE_ERROR_MSG \
    "cannot access free variable '%s' where it is not associated with a value" \
    " in enclosing scope"
#define NAME_ERROR_MSG "name '%.200s' is not defined"

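/* Descriptive note for the macro below: it is a fast path for float binary
   operations.  If either input has a reference count of exactly 1, that
   float object is reused in place to hold the result (no allocation) and the
   other input is released; otherwise a new float is created with
   PyFloat_FromDouble() and both inputs are released.  _Py_DECREF_NO_DEALLOC
   is used only on the branches where the refcount is known to be greater
   than 1, so the decref cannot free the object. */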
#define DECREF_INPUTS_AND_REUSE_FLOAT(left, right, dval, result) \
do { \
    if (Py_REFCNT(left) == 1) { \
        ((PyFloatObject *)left)->ob_fval = (dval); \
        _Py_DECREF_SPECIALIZED(right, _PyFloat_ExactDealloc); \
        result = (left); \
    } \
    else if (Py_REFCNT(right) == 1) { \
        ((PyFloatObject *)right)->ob_fval = (dval); \
        _Py_DECREF_NO_DEALLOC(left); \
        result = (right); \
    } \
    else { \
        result = PyFloat_FromDouble(dval); \
        if ((result) == NULL) GOTO_ERROR(error); \
        _Py_DECREF_NO_DEALLOC(left); \
        _Py_DECREF_NO_DEALLOC(right); \
    } \
} while (0)

// If a trace function sets a new f_lineno and
// *then* raises, we use the destination when searching
// for an exception handler, displaying the traceback, and so on
#define INSTRUMENTED_JUMP(src, dest, event) \
do { \
    if (tstate->tracing) { \
        next_instr = dest; \
    } else { \
        _PyFrame_SetStackPointer(frame, stack_pointer); \
        next_instr = _Py_call_instrumentation_jump(tstate, event, frame, src, dest); \
        stack_pointer = _PyFrame_GetStackPointer(frame); \
        if (next_instr == NULL) { \
            next_instr = (dest)+1; \
            goto error; \
        } \
    } \
} while (0);


// GH-89279: Force inlining by using a macro.
#if defined(_MSC_VER) && SIZEOF_INT == 4
#define _Py_atomic_load_relaxed_int32(ATOMIC_VAL) (assert(sizeof((ATOMIC_VAL)->_value) == 4), *((volatile int*)&((ATOMIC_VAL)->_value)))
#else
#define _Py_atomic_load_relaxed_int32(ATOMIC_VAL) _Py_atomic_load_relaxed(ATOMIC_VAL)
#endif

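/* Descriptive note: this pair tracks Python-to-Python call depth separately
   from the C stack.  Entering a Python frame consumes one unit of
   py_recursion_remaining; if the budget was already used up, the slow path
   _Py_CheckRecursiveCallPy() decides whether to report the overflow
   (typically by raising RecursionError).  Leaving the frame gives the unit
   back. */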
static inline int _Py_EnterRecursivePy(PyThreadState *tstate) {
    return (tstate->py_recursion_remaining-- <= 0) &&
        _Py_CheckRecursiveCallPy(tstate);
}

static inline void _Py_LeaveRecursiveCallPy(PyThreadState *tstate) {
    tstate->py_recursion_remaining++;
}

/* Implementation of "macros" that modify the instruction pointer,
 * stack pointer, or frame pointer.
 * These need to be treated differently by tier 1 and tier 2.
 * The Tier 1 version is here; Tier 2 is inlined in ceval.c. */

#define LOAD_IP(OFFSET) do { \
        next_instr = frame->instr_ptr + (OFFSET); \
    } while (0)

/* There's no STORE_IP(), it's inlined by the code generator. */

#define LOAD_SP() \
stack_pointer = _PyFrame_GetStackPointer(frame);

/* Tier-switching macros. */

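/* Descriptive note: GOTO_TIER_TWO() transfers control to a Tier 2 executor.
   Under the JIT it calls the executor's compiled entry point and then either
   resumes Tier 1 at the returned instruction or unwinds on error; without
   the JIT it points next_uop at the executor's micro-op trace and jumps into
   the Tier 2 interpreter loop.  GOTO_TIER_ONE() is the reverse transition
   back into the normal bytecode loop. */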
#ifdef _Py_JIT
#define GOTO_TIER_TWO(EXECUTOR)                        \
do {                                                   \
    OPT_STAT_INC(traces_executed);                     \
    jit_func jitted = (EXECUTOR)->jit_code;            \
    next_instr = jitted(frame, stack_pointer, tstate); \
    Py_DECREF(tstate->previous_executor);              \
    tstate->previous_executor = NULL;                  \
    frame = tstate->current_frame;                     \
    if (next_instr == NULL) {                          \
        goto resume_with_error;                        \
    }                                                  \
    stack_pointer = _PyFrame_GetStackPointer(frame);   \
    DISPATCH();                                        \
} while (0)
#else
#define GOTO_TIER_TWO(EXECUTOR) \
do { \
    OPT_STAT_INC(traces_executed); \
    next_uop = (EXECUTOR)->trace; \
    assert(next_uop->opcode == _START_EXECUTOR || next_uop->opcode == _COLD_EXIT); \
    goto enter_tier_two; \
} while (0)
#endif

#define GOTO_TIER_ONE(TARGET) \
do { \
    Py_DECREF(tstate->previous_executor); \
    tstate->previous_executor = NULL; \
    next_instr = (TARGET); \
    DISPATCH(); \
} while (0)

#define CURRENT_OPARG() (next_uop[-1].oparg)

#define CURRENT_OPERAND() (next_uop[-1].operand)

#define JUMP_TO_JUMP_TARGET() goto jump_to_jump_target
#define JUMP_TO_ERROR() goto jump_to_error_target
#define GOTO_UNWIND() goto error_tier_two
#define EXIT_TO_TRACE() goto exit_to_trace
#define EXIT_TO_TIER1() goto exit_to_tier1
#define EXIT_TO_TIER1_DYNAMIC() goto exit_to_tier1_dynamic;