#ifdef _Py_TIER2

#include "Python.h"
#include "opcode.h"
#include "pycore_interp.h"
#include "pycore_backoff.h"
#include "pycore_bitutils.h"        // _Py_popcount32()
#include "pycore_object.h"          // _PyObject_GC_UNTRACK()
#include "pycore_opcode_metadata.h" // _PyOpcode_OpName[]
#include "pycore_opcode_utils.h"    // MAX_REAL_OPCODE
#include "pycore_optimizer.h"       // _Py_uop_analyze_and_optimize()
#include "pycore_pystate.h"         // _PyInterpreterState_GET()
#include "pycore_uop_ids.h"
#include "pycore_jit.h"
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

#define NEED_OPCODE_METADATA
#include "pycore_uop_metadata.h" // Uop tables
#undef NEED_OPCODE_METADATA

#define MAX_EXECUTORS_SIZE 256
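/* ENTER_EXECUTOR stores the executor's index in its one-byte oparg
 * (see insert_executor() below), so a code object can hold at most 256
 * executors; has_space_for_executor() refuses to allocate beyond that. */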

#ifdef Py_DEBUG
static int
base_opcode(PyCodeObject *code, int offset)
{
    int opcode = _Py_GetBaseOpcode(code, offset);
    if (opcode == ENTER_EXECUTOR) {
        int oparg = _PyCode_CODE(code)[offset].op.arg;
        _PyExecutorObject *ex = code->co_executors->executors[oparg];
        return ex->vm_data.opcode;
    }
    return opcode;
}
#endif

static bool
has_space_for_executor(PyCodeObject *code, _Py_CODEUNIT *instr)
{
    if (instr->op.code == ENTER_EXECUTOR) {
        return true;
    }
    if (code->co_executors == NULL) {
        return true;
    }
    return code->co_executors->size < MAX_EXECUTORS_SIZE;
}

static int32_t
get_index_for_executor(PyCodeObject *code, _Py_CODEUNIT *instr)
{
    if (instr->op.code == ENTER_EXECUTOR) {
        return instr->op.arg;
    }
    _PyExecutorArray *old = code->co_executors;
    int size = 0;
    int capacity = 0;
    if (old != NULL) {
        size = old->size;
        capacity = old->capacity;
        assert(size < MAX_EXECUTORS_SIZE);
    }
    assert(size <= capacity);
    if (size == capacity) {
        /* Array is full. Grow array */
        int new_capacity = capacity ? capacity * 2 : 4;
        _PyExecutorArray *new = PyMem_Realloc(
            old,
            offsetof(_PyExecutorArray, executors) +
            new_capacity * sizeof(_PyExecutorObject *));
        if (new == NULL) {
            return -1;
        }
        new->capacity = new_capacity;
        new->size = size;
        code->co_executors = new;
    }
    assert(size < code->co_executors->capacity);
    return size;
}

static void
insert_executor(PyCodeObject *code, _Py_CODEUNIT *instr, int index, _PyExecutorObject *executor)
{
    Py_INCREF(executor);
    if (instr->op.code == ENTER_EXECUTOR) {
        assert(index == instr->op.arg);
        _Py_ExecutorDetach(code->co_executors->executors[index]);
    }
    else {
        assert(code->co_executors->size == index);
        assert(code->co_executors->capacity > index);
        code->co_executors->size++;
    }
    executor->vm_data.opcode = instr->op.code;
    executor->vm_data.oparg = instr->op.arg;
    executor->vm_data.code = code;
    executor->vm_data.index = (int)(instr - _PyCode_CODE(code));
    code->co_executors->executors[index] = executor;
    assert(index < MAX_EXECUTORS_SIZE);
    instr->op.code = ENTER_EXECUTOR;
    instr->op.arg = index;
}


static int
never_optimize(
    _PyOptimizerObject* self,
    _PyInterpreterFrame *frame,
    _Py_CODEUNIT *instr,
    _PyExecutorObject **exec,
    int Py_UNUSED(stack_entries))
{
    // This may be called if the optimizer is reset
    return 0;
}

PyTypeObject _PyDefaultOptimizer_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    .tp_name = "noop_optimizer",
    .tp_basicsize = sizeof(_PyOptimizerObject),
    .tp_itemsize = 0,
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
};

static _PyOptimizerObject _PyOptimizer_Default = {
    PyObject_HEAD_INIT(&_PyDefaultOptimizer_Type)
    .optimize = never_optimize,
};

_PyOptimizerObject *
_Py_GetOptimizer(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    if (interp->optimizer == &_PyOptimizer_Default) {
        return NULL;
    }
    Py_INCREF(interp->optimizer);
    return interp->optimizer;
}

static _PyExecutorObject *
make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFilter *dependencies);

static int
init_cold_exit_executor(_PyExecutorObject *executor, int oparg);

/* It is impossible for the number of exits to reach 1/4 of the total length,
 * as the number of exits cannot reach 1/3 of the number of non-exits, due to
 * the presence of CHECK_VALIDITY checks and instructions to produce the values
 * being checked in exits. */
#define COLD_EXIT_COUNT (UOP_MAX_TRACE_LENGTH/4)
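/* Sketch of the bound: with E exit uops and N non-exit uops, E < N/3
 * (per the comment above), so the total T = E + N > E + 3*E = 4*E,
 * giving E < T/4. Hence UOP_MAX_TRACE_LENGTH/4 cold exits always
 * suffice (see also the assert in make_executor_from_uops()). */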

static int cold_exits_initialized = 0;
static _PyExecutorObject COLD_EXITS[COLD_EXIT_COUNT] = { 0 };

static const _PyBloomFilter EMPTY_FILTER = { 0 };

_PyOptimizerObject *
_Py_SetOptimizer(PyInterpreterState *interp, _PyOptimizerObject *optimizer)
{
    if (optimizer == NULL) {
        optimizer = &_PyOptimizer_Default;
    }
    else if (cold_exits_initialized == 0) {
        cold_exits_initialized = 1;
        for (int i = 0; i < COLD_EXIT_COUNT; i++) {
            if (init_cold_exit_executor(&COLD_EXITS[i], i)) {
                return NULL;
            }
        }
    }
    _PyOptimizerObject *old = interp->optimizer;
    if (old == NULL) {
        old = &_PyOptimizer_Default;
    }
    Py_INCREF(optimizer);
    interp->optimizer = optimizer;
    return old;
}

int
_Py_SetTier2Optimizer(_PyOptimizerObject *optimizer)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    _PyOptimizerObject *old = _Py_SetOptimizer(interp, optimizer);
    Py_XDECREF(old);
    return old == NULL ? -1 : 0;
}

/* Returns 1 if optimized, 0 if not optimized, and -1 for an error.
 * If optimized, *executor_ptr contains a new reference to the executor
 */
int
_PyOptimizer_Optimize(
    _PyInterpreterFrame *frame, _Py_CODEUNIT *start,
    PyObject **stack_pointer, _PyExecutorObject **executor_ptr)
{
    PyCodeObject *code = _PyFrame_GetCode(frame);
    assert(PyCode_Check(code));
    PyInterpreterState *interp = _PyInterpreterState_GET();
    if (!has_space_for_executor(code, start)) {
        return 0;
    }
    _PyOptimizerObject *opt = interp->optimizer;
    int err = opt->optimize(opt, frame, start, executor_ptr, (int)(stack_pointer - _PyFrame_Stackbase(frame)));
    if (err <= 0) {
        return err;
    }
    assert(*executor_ptr != NULL);
    int index = get_index_for_executor(code, start);
    if (index < 0) {
        /* Out of memory. Don't raise and assume that the
         * error will show up elsewhere.
         *
         * If an optimizer has already produced an executor,
         * it might get confused by the executor disappearing,
         * but there is not much we can do about that here. */
        Py_DECREF(*executor_ptr);
        return 0;
    }
    insert_executor(code, start, index, *executor_ptr);
    assert((*executor_ptr)->vm_data.valid);
    return 1;
}

_PyExecutorObject *
_Py_GetExecutor(PyCodeObject *code, int offset)
{
    int code_len = (int)Py_SIZE(code);
    for (int i = 0; i < code_len;) {
        if (_PyCode_CODE(code)[i].op.code == ENTER_EXECUTOR && i*2 == offset) {
            int oparg = _PyCode_CODE(code)[i].op.arg;
            _PyExecutorObject *res = code->co_executors->executors[oparg];
            Py_INCREF(res);
            return res;
        }
        i += _PyInstruction_GetLength(code, i);
    }
    PyErr_SetString(PyExc_ValueError, "no executor at given byte offset");
    return NULL;
}

static PyObject *
is_valid(PyObject *self, PyObject *Py_UNUSED(ignored))
{
    return PyBool_FromLong(((_PyExecutorObject *)self)->vm_data.valid);
}

static PyObject *
get_opcode(PyObject *self, PyObject *Py_UNUSED(ignored))
{
    return PyLong_FromUnsignedLong(((_PyExecutorObject *)self)->vm_data.opcode);
}

static PyObject *
get_oparg(PyObject *self, PyObject *Py_UNUSED(ignored))
{
    return PyLong_FromUnsignedLong(((_PyExecutorObject *)self)->vm_data.oparg);
}

static PyMethodDef executor_methods[] = {
    { "is_valid", is_valid, METH_NOARGS, NULL },
    { "get_opcode", get_opcode, METH_NOARGS, NULL },
    { "get_oparg", get_oparg, METH_NOARGS, NULL },
    { NULL, NULL },
};

///////////////////// Experimental UOp Optimizer /////////////////////

static int executor_clear(_PyExecutorObject *executor);
static void unlink_executor(_PyExecutorObject *executor);

static void
uop_dealloc(_PyExecutorObject *self) {
    _PyObject_GC_UNTRACK(self);
    assert(self->vm_data.code == NULL);
    unlink_executor(self);
#ifdef _Py_JIT
    _PyJIT_Free(self);
#endif
    PyObject_GC_Del(self);
}

const char *
_PyUOpName(int index)
{
    if (index < 0 || index > MAX_UOP_ID) {
        return NULL;
    }
    return _PyOpcode_uop_name[index];
}

#ifdef Py_DEBUG
void
_PyUOpPrint(const _PyUOpInstruction *uop)
{
    const char *name = _PyUOpName(uop->opcode);
    if (name == NULL) {
        printf("<uop %d>", uop->opcode);
    }
    else {
        printf("%s", name);
    }
    switch(uop->format) {
        case UOP_FORMAT_TARGET:
            printf(" (%d, target=%d, operand=%#" PRIx64,
                   uop->oparg,
                   uop->target,
                   (uint64_t)uop->operand);
            break;
        case UOP_FORMAT_JUMP:
            printf(" (%d, jump_target=%d, operand=%#" PRIx64,
                   uop->oparg,
                   uop->jump_target,
                   (uint64_t)uop->operand);
            break;
        case UOP_FORMAT_EXIT:
            printf(" (%d, exit_index=%d, operand=%#" PRIx64,
                   uop->oparg,
                   uop->exit_index,
                   (uint64_t)uop->operand);
            break;
        default:
            printf(" (%d, Unknown format)", uop->oparg);
    }
    if (_PyUop_Flags[uop->opcode] & HAS_ERROR_FLAG) {
        printf(", error_target=%d", uop->error_target);
    }

    printf(")");
}
#endif

static Py_ssize_t
uop_len(_PyExecutorObject *self)
{
    return self->code_size;
}

static PyObject *
uop_item(_PyExecutorObject *self, Py_ssize_t index)
{
    Py_ssize_t len = uop_len(self);
    if (index < 0 || index >= len) {
        PyErr_SetNone(PyExc_IndexError);
        return NULL;
    }
    const char *name = _PyUOpName(self->trace[index].opcode);
    if (name == NULL) {
        name = "<nil>";
    }
    PyObject *oname = _PyUnicode_FromASCII(name, strlen(name));
    if (oname == NULL) {
        return NULL;
    }
    PyObject *oparg = PyLong_FromUnsignedLong(self->trace[index].oparg);
    if (oparg == NULL) {
        Py_DECREF(oname);
        return NULL;
    }
    PyObject *target = PyLong_FromUnsignedLong(self->trace[index].target);
    if (target == NULL) {
        Py_DECREF(oparg);
        Py_DECREF(oname);
        return NULL;
    }
    PyObject *operand = PyLong_FromUnsignedLongLong(self->trace[index].operand);
    if (operand == NULL) {
        Py_DECREF(target);
        Py_DECREF(oparg);
        Py_DECREF(oname);
        return NULL;
    }
    PyObject *args[4] = { oname, oparg, target, operand };
    return _PyTuple_FromArraySteal(args, 4);
}

PySequenceMethods uop_as_sequence = {
    .sq_length = (lenfunc)uop_len,
    .sq_item = (ssizeargfunc)uop_item,
};

static int
executor_traverse(PyObject *o, visitproc visit, void *arg)
{
    _PyExecutorObject *executor = (_PyExecutorObject *)o;
    for (uint32_t i = 0; i < executor->exit_count; i++) {
        Py_VISIT(executor->exits[i].executor);
    }
    return 0;
}

static PyObject *
get_jit_code(PyObject *self, PyObject *Py_UNUSED(ignored))
{
#ifndef _Py_JIT
    PyErr_SetString(PyExc_RuntimeError, "JIT support not enabled.");
    return NULL;
#else
    _PyExecutorObject *executor = (_PyExecutorObject *)self;
    if (executor->jit_code == NULL || executor->jit_size == 0) {
        Py_RETURN_NONE;
    }
    return PyBytes_FromStringAndSize(executor->jit_code, executor->jit_size);
#endif
}

static PyMethodDef uop_executor_methods[] = {
    { "is_valid", is_valid, METH_NOARGS, NULL },
    { "get_jit_code", get_jit_code, METH_NOARGS, NULL},
    { "get_opcode", get_opcode, METH_NOARGS, NULL },
    { "get_oparg", get_oparg, METH_NOARGS, NULL },
    { NULL, NULL },
};

static int
executor_is_gc(PyObject *o)
{
    return !_Py_IsImmortal(o);
}

PyTypeObject _PyUOpExecutor_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    .tp_name = "uop_executor",
    .tp_basicsize = offsetof(_PyExecutorObject, exits),
    .tp_itemsize = 1,
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC,
    .tp_dealloc = (destructor)uop_dealloc,
    .tp_as_sequence = &uop_as_sequence,
    .tp_methods = uop_executor_methods,
    .tp_traverse = executor_traverse,
    .tp_clear = (inquiry)executor_clear,
    .tp_is_gc = executor_is_gc,
};

/* TO DO -- Generate these tables */
static const uint16_t
_PyUOp_Replacements[MAX_UOP_ID + 1] = {
    [_ITER_JUMP_RANGE] = _GUARD_NOT_EXHAUSTED_RANGE,
    [_ITER_JUMP_LIST] = _GUARD_NOT_EXHAUSTED_LIST,
    [_ITER_JUMP_TUPLE] = _GUARD_NOT_EXHAUSTED_TUPLE,
    [_FOR_ITER] = _FOR_ITER_TIER_TWO,
};

static const uint8_t
is_for_iter_test[MAX_UOP_ID + 1] = {
    [_GUARD_NOT_EXHAUSTED_RANGE] = 1,
    [_GUARD_NOT_EXHAUSTED_LIST] = 1,
    [_GUARD_NOT_EXHAUSTED_TUPLE] = 1,
    [_FOR_ITER_TIER_TWO] = 1,
};

static const uint16_t
BRANCH_TO_GUARD[4][2] = {
    [POP_JUMP_IF_FALSE - POP_JUMP_IF_FALSE][0] = _GUARD_IS_TRUE_POP,
    [POP_JUMP_IF_FALSE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_FALSE_POP,
    [POP_JUMP_IF_TRUE - POP_JUMP_IF_FALSE][0] = _GUARD_IS_FALSE_POP,
    [POP_JUMP_IF_TRUE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_TRUE_POP,
    [POP_JUMP_IF_NONE - POP_JUMP_IF_FALSE][0] = _GUARD_IS_NOT_NONE_POP,
    [POP_JUMP_IF_NONE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_NONE_POP,
    [POP_JUMP_IF_NOT_NONE - POP_JUMP_IF_FALSE][0] = _GUARD_IS_NONE_POP,
    [POP_JUMP_IF_NOT_NONE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_NOT_NONE_POP,
};
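/* Indexed as BRANCH_TO_GUARD[opcode - POP_JUMP_IF_FALSE][jump_likely].
 * When the jump is likely, the trace follows the jump, so the emitted
 * guard must exit when the branch is *not* taken; e.g. POP_JUMP_IF_TRUE
 * with jump_likely == 1 emits _GUARD_IS_TRUE_POP, which continues only
 * if the popped value is true. */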


#define CONFIDENCE_RANGE 1000
#define CONFIDENCE_CUTOFF 333

#ifdef Py_DEBUG
#define DPRINTF(level, ...) \
    if (lltrace >= (level)) { printf(__VA_ARGS__); }
#else
#define DPRINTF(level, ...)
#endif


static inline int
add_to_trace(
    _PyUOpInstruction *trace,
    int trace_length,
    uint16_t opcode,
    uint16_t oparg,
    uint64_t operand,
    uint32_t target)
{
    trace[trace_length].opcode = opcode;
    trace[trace_length].format = UOP_FORMAT_TARGET;
    trace[trace_length].target = target;
    trace[trace_length].oparg = oparg;
    trace[trace_length].operand = operand;
    return trace_length + 1;
}

#ifdef Py_DEBUG
#define ADD_TO_TRACE(OPCODE, OPARG, OPERAND, TARGET) \
    assert(trace_length < max_length); \
    trace_length = add_to_trace(trace, trace_length, (OPCODE), (OPARG), (OPERAND), (TARGET)); \
    if (lltrace >= 2) { \
        printf("%4d ADD_TO_TRACE: ", trace_length); \
        _PyUOpPrint(&trace[trace_length-1]); \
        printf("\n"); \
    }
#else
#define ADD_TO_TRACE(OPCODE, OPARG, OPERAND, TARGET) \
    assert(trace_length < max_length); \
    trace_length = add_to_trace(trace, trace_length, (OPCODE), (OPARG), (OPERAND), (TARGET));
#endif

#define INSTR_IP(INSTR, CODE) \
    ((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive)))

// Reserve space for n uops
#define RESERVE_RAW(n, opname) \
    if (trace_length + (n) > max_length) { \
        DPRINTF(2, "No room for %s (need %d, got %d)\n", \
                (opname), (n), max_length - trace_length); \
        OPT_STAT_INC(trace_too_long); \
        goto done; \
    }

// Reserve space for N uops, plus 3 for _SET_IP, _CHECK_VALIDITY and _EXIT_TRACE
#define RESERVE(needed) RESERVE_RAW((needed) + 3, _PyUOpName(opcode))

// Trace stack operations (used by _PUSH_FRAME, _POP_FRAME)
#define TRACE_STACK_PUSH() \
    if (trace_stack_depth >= TRACE_STACK_SIZE) { \
        DPRINTF(2, "Trace stack overflow\n"); \
        OPT_STAT_INC(trace_stack_overflow); \
        trace_length = 0; \
        goto done; \
    } \
    assert(func == NULL || func->func_code == (PyObject *)code); \
    trace_stack[trace_stack_depth].func = func; \
    trace_stack[trace_stack_depth].code = code; \
    trace_stack[trace_stack_depth].instr = instr; \
    trace_stack_depth++;
#define TRACE_STACK_POP() \
    if (trace_stack_depth <= 0) { \
        Py_FatalError("Trace stack underflow\n"); \
    } \
    trace_stack_depth--; \
    func = trace_stack[trace_stack_depth].func; \
    code = trace_stack[trace_stack_depth].code; \
    assert(func == NULL || func->func_code == (PyObject *)code); \
    instr = trace_stack[trace_stack_depth].instr;

/* Returns the length of the trace on success,
 * 0 if it failed to produce a worthwhile trace,
 * and -1 on an error.
 */
static int
translate_bytecode_to_trace(
    _PyInterpreterFrame *frame,
    _Py_CODEUNIT *instr,
    _PyUOpInstruction *trace,
    int buffer_size,
    _PyBloomFilter *dependencies)
{
    bool progress_needed = true;
    PyCodeObject *code = _PyFrame_GetCode(frame);
    PyFunctionObject *func = (PyFunctionObject *)frame->f_funcobj;
    assert(PyFunction_Check(func));
    PyCodeObject *initial_code = code;
    _Py_BloomFilter_Add(dependencies, initial_code);
    _Py_CODEUNIT *initial_instr = instr;
    int trace_length = 0;
    // Leave space for possible trailing _EXIT_TRACE
    int max_length = buffer_size-2;
    struct {
        PyFunctionObject *func;
        PyCodeObject *code;
        _Py_CODEUNIT *instr;
    } trace_stack[TRACE_STACK_SIZE];
    int trace_stack_depth = 0;
    int confidence = CONFIDENCE_RANGE;  // Adjusted by branch instructions

#ifdef Py_DEBUG
    char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
    int lltrace = 0;
    if (python_lltrace != NULL && *python_lltrace >= '0') {
        lltrace = *python_lltrace - '0';  // TODO: Parse an int and all that
    }
#endif

    DPRINTF(2,
            "Optimizing %s (%s:%d) at byte offset %d\n",
            PyUnicode_AsUTF8(code->co_qualname),
            PyUnicode_AsUTF8(code->co_filename),
            code->co_firstlineno,
            2 * INSTR_IP(initial_instr, code));
    ADD_TO_TRACE(_START_EXECUTOR, 0, (uintptr_t)instr, INSTR_IP(instr, code));
    uint32_t target = 0;

top:  // Jump here after _PUSH_FRAME or likely branches
    for (;;) {
        target = INSTR_IP(instr, code);
        // Need space for _DEOPT
        max_length--;

        uint32_t opcode = instr->op.code;
        uint32_t oparg = instr->op.arg;

        DPRINTF(2, "%d: %s(%d)\n", target, _PyOpcode_OpName[opcode], oparg);

        if (opcode == ENTER_EXECUTOR) {
            assert(oparg < 256);
            _PyExecutorObject *executor = code->co_executors->executors[oparg];
            opcode = executor->vm_data.opcode;
            DPRINTF(2, "  * ENTER_EXECUTOR -> %s\n", _PyOpcode_OpName[opcode]);
            oparg = executor->vm_data.oparg;
        }

        if (opcode == EXTENDED_ARG) {
            instr++;
            opcode = instr->op.code;
            oparg = (oparg << 8) | instr->op.arg;
            if (opcode == EXTENDED_ARG) {
                instr--;
                goto done;
            }
        }
        assert(opcode != ENTER_EXECUTOR && opcode != EXTENDED_ARG);
        RESERVE_RAW(2, "_CHECK_VALIDITY_AND_SET_IP");
        ADD_TO_TRACE(_CHECK_VALIDITY_AND_SET_IP, 0, (uintptr_t)instr, target);

        /* Special case the first instruction,
         * so that we can guarantee forward progress */
        if (progress_needed) {
            progress_needed = false;
            if (opcode == JUMP_BACKWARD || opcode == JUMP_BACKWARD_NO_INTERRUPT) {
                instr += 1 + _PyOpcode_Caches[opcode] - (int32_t)oparg;
                initial_instr = instr;
                if (opcode == JUMP_BACKWARD) {
                    ADD_TO_TRACE(_TIER2_RESUME_CHECK, 0, 0, target);
                }
                continue;
            }
            else {
                if (OPCODE_HAS_EXIT(opcode) || OPCODE_HAS_DEOPT(opcode)) {
                    opcode = _PyOpcode_Deopt[opcode];
                }
                assert(!OPCODE_HAS_EXIT(opcode));
                assert(!OPCODE_HAS_DEOPT(opcode));
            }
        }

        if (OPCODE_HAS_EXIT(opcode)) {
            // Make space for exit code
            max_length--;
        }
        if (OPCODE_HAS_ERROR(opcode)) {
            // Make space for error code
            max_length--;
        }
        switch (opcode) {
            case POP_JUMP_IF_NONE:
            case POP_JUMP_IF_NOT_NONE:
            case POP_JUMP_IF_FALSE:
            case POP_JUMP_IF_TRUE:
            {
                RESERVE(1);
                int counter = instr[1].cache;
                int bitcount = _Py_popcount32(counter);
                int jump_likely = bitcount > 8;
                /* If bitcount is 8 (half the jumps were taken), adjust confidence by 50%.
                   If it's 16 or 0 (all or none were taken), adjust by 10%
                   (since the future is still somewhat uncertain).
                   For values in between, adjust proportionally. */
                if (jump_likely) {
                    confidence = confidence * (bitcount + 2) / 20;
                }
                else {
                    confidence = confidence * (18 - bitcount) / 20;
                }
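                /* Worked example: starting from CONFIDENCE_RANGE (1000),
                 * a branch whose 16-bit history is all ones (bitcount 16)
                 * keeps 18/20 each time: 1000 -> 900 -> 810 -> 729 ...,
                 * so consistently-biased branches can be followed many
                 * times, while a 50/50 branch (bitcount 8) keeps 10/20:
                 * 1000 -> 500 -> 250, falling below CONFIDENCE_CUTOFF
                 * (333) after just two such branches. */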
                uint32_t uopcode = BRANCH_TO_GUARD[opcode - POP_JUMP_IF_FALSE][jump_likely];
                DPRINTF(2, "%d: %s(%d): counter=%04x, bitcount=%d, likely=%d, confidence=%d, uopcode=%s\n",
                        target, _PyOpcode_OpName[opcode], oparg,
                        counter, bitcount, jump_likely, confidence, _PyUOpName(uopcode));
                if (confidence < CONFIDENCE_CUTOFF) {
                    DPRINTF(2, "Confidence too low (%d < %d)\n", confidence, CONFIDENCE_CUTOFF);
                    OPT_STAT_INC(low_confidence);
                    goto done;
                }
                _Py_CODEUNIT *next_instr = instr + 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]];
                _Py_CODEUNIT *target_instr = next_instr + oparg;
                if (jump_likely) {
                    DPRINTF(2, "Jump likely (%04x = %d bits), continue at byte offset %d\n",
                            instr[1].cache, bitcount, 2 * INSTR_IP(target_instr, code));
                    instr = target_instr;
                    ADD_TO_TRACE(uopcode, 0, 0, INSTR_IP(next_instr, code));
                    goto top;
                }
                ADD_TO_TRACE(uopcode, 0, 0, INSTR_IP(target_instr, code));
                break;
            }

            case JUMP_BACKWARD:
            case JUMP_BACKWARD_NO_INTERRUPT:
            {
                _Py_CODEUNIT *target = instr + 1 + _PyOpcode_Caches[opcode] - (int)oparg;
                if (target == initial_instr) {
                    /* We have looped round to the start */
                    RESERVE(1);
                    ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0);
                }
                else {
                    OPT_STAT_INC(inner_loop);
                    DPRINTF(2, "JUMP_BACKWARD not to top ends trace\n");
                }
                goto done;
            }

            case JUMP_FORWARD:
            {
                RESERVE(0);
                // This will emit two _SET_IP instructions; leave it to the optimizer
                instr += oparg;
                break;
            }

            case RESUME:
                /* Use a special tier 2 version of RESUME_CHECK to allow traces to
                 * start with RESUME_CHECK */
                ADD_TO_TRACE(_TIER2_RESUME_CHECK, 0, 0, target);
                break;

            default:
            {
                const struct opcode_macro_expansion *expansion = &_PyOpcode_macro_expansion[opcode];
                if (expansion->nuops > 0) {
                    // Reserve space for nuops (+ _SET_IP + _EXIT_TRACE)
                    int nuops = expansion->nuops;
                    RESERVE(nuops + 1); /* One extra for exit */
                    int16_t last_op = expansion->uops[nuops-1].uop;
                    if (last_op == _POP_FRAME || last_op == _RETURN_GENERATOR || last_op == _YIELD_VALUE) {
                        // Check for trace stack underflow now:
                        // We can't bail e.g. in the middle of
                        // LOAD_CONST + _POP_FRAME.
                        if (trace_stack_depth == 0) {
                            DPRINTF(2, "Trace stack underflow\n");
                            OPT_STAT_INC(trace_stack_underflow);
                            goto done;
                        }
                    }
                    uint32_t orig_oparg = oparg;  // For OPARG_TOP/BOTTOM
                    for (int i = 0; i < nuops; i++) {
                        oparg = orig_oparg;
                        uint32_t uop = expansion->uops[i].uop;
                        uint64_t operand = 0;
                        // Add one to account for the actual opcode/oparg pair:
                        int offset = expansion->uops[i].offset + 1;
                        switch (expansion->uops[i].size) {
                            case OPARG_FULL:
                                assert(opcode != JUMP_BACKWARD_NO_INTERRUPT && opcode != JUMP_BACKWARD);
                                break;
                            case OPARG_CACHE_1:
                                operand = read_u16(&instr[offset].cache);
                                break;
                            case OPARG_CACHE_2:
                                operand = read_u32(&instr[offset].cache);
                                break;
                            case OPARG_CACHE_4:
                                operand = read_u64(&instr[offset].cache);
                                break;
                            case OPARG_TOP:  // First half of super-instr
                                oparg = orig_oparg >> 4;
                                break;
                            case OPARG_BOTTOM:  // Second half of super-instr
                                oparg = orig_oparg & 0xF;
                                break;
                            case OPARG_SAVE_RETURN_OFFSET:  // op=_SAVE_RETURN_OFFSET; oparg=return_offset
                                oparg = offset;
                                assert(uop == _SAVE_RETURN_OFFSET);
                                break;
                            case OPARG_REPLACED:
                                uop = _PyUOp_Replacements[uop];
                                assert(uop != 0);
#ifdef Py_DEBUG
                                {
                                    uint32_t next_inst = target + 1 + INLINE_CACHE_ENTRIES_FOR_ITER + (oparg > 255);
                                    uint32_t jump_target = next_inst + oparg;
                                    assert(base_opcode(code, jump_target) == END_FOR ||
                                           base_opcode(code, jump_target) == INSTRUMENTED_END_FOR);
                                    assert(base_opcode(code, jump_target+1) == POP_TOP);
                                }
#endif
                                break;
                            default:
                                fprintf(stderr,
                                        "opcode=%d, oparg=%d; nuops=%d, i=%d; size=%d, offset=%d\n",
                                        opcode, oparg, nuops, i,
                                        expansion->uops[i].size,
                                        expansion->uops[i].offset);
                                Py_FatalError("garbled expansion");
                        }

                        if (uop == _POP_FRAME || uop == _RETURN_GENERATOR || uop == _YIELD_VALUE) {
                            TRACE_STACK_POP();
                            /* Set the operand to the function or code object returned to,
                             * to assist optimization passes. (See _PUSH_FRAME below.)
                             */
                            if (func != NULL) {
                                operand = (uintptr_t)func;
                            }
                            else if (code != NULL) {
                                operand = (uintptr_t)code | 1;
                            }
                            else {
                                operand = 0;
                            }
                            ADD_TO_TRACE(uop, oparg, operand, target);
                            DPRINTF(2,
                                    "Returning to %s (%s:%d) at byte offset %d\n",
                                    PyUnicode_AsUTF8(code->co_qualname),
                                    PyUnicode_AsUTF8(code->co_filename),
                                    code->co_firstlineno,
                                    2 * INSTR_IP(instr, code));
                            goto top;
                        }

                        if (uop == _PUSH_FRAME) {
                            assert(i + 1 == nuops);
                            int func_version_offset =
                                offsetof(_PyCallCache, func_version)/sizeof(_Py_CODEUNIT)
                                // Add one to account for the actual opcode/oparg pair:
                                + 1;
                            uint32_t func_version = read_u32(&instr[func_version_offset].cache);
                            PyCodeObject *new_code = NULL;
                            PyFunctionObject *new_func =
                                _PyFunction_LookupByVersion(func_version, (PyObject **) &new_code);
                            DPRINTF(2, "Function: version=%#x; new_func=%p, new_code=%p\n",
                                    (int)func_version, new_func, new_code);
                            if (new_code != NULL) {
                                if (new_code == code) {
                                    // Recursive call, bail (we could be here forever).
                                    DPRINTF(2, "Bailing on recursive call to %s (%s:%d)\n",
                                            PyUnicode_AsUTF8(new_code->co_qualname),
                                            PyUnicode_AsUTF8(new_code->co_filename),
                                            new_code->co_firstlineno);
                                    OPT_STAT_INC(recursive_call);
                                    ADD_TO_TRACE(uop, oparg, 0, target);
                                    ADD_TO_TRACE(_EXIT_TRACE, 0, 0, 0);
                                    goto done;
                                }
                                if (new_code->co_version != func_version) {
                                    // func.__code__ was updated.
                                    // Perhaps it may happen again, so don't bother tracing.
                                    // TODO: Reason about this -- is it better to bail or not?
                                    DPRINTF(2, "Bailing because co_version != func_version\n");
                                    ADD_TO_TRACE(uop, oparg, 0, target);
                                    ADD_TO_TRACE(_EXIT_TRACE, 0, 0, 0);
                                    goto done;
                                }
                                if (opcode == FOR_ITER_GEN) {
                                    DPRINTF(2, "Bailing due to dynamic target\n");
                                    ADD_TO_TRACE(uop, oparg, 0, target);
                                    ADD_TO_TRACE(_DYNAMIC_EXIT, 0, 0, 0);
                                    goto done;
                                }
                                // Increment IP to the return address
                                instr += _PyOpcode_Caches[_PyOpcode_Deopt[opcode]] + 1;
                                TRACE_STACK_PUSH();
                                _Py_BloomFilter_Add(dependencies, new_code);
                                /* Set the operand to the callee's function or code object,
                                 * to assist optimization passes.
                                 * We prefer setting it to the function (for remove_globals())
                                 * but if that's not available but the code is available,
                                 * use the code, setting the low bit so the optimizer knows.
                                 */
                                if (new_func != NULL) {
                                    operand = (uintptr_t)new_func;
                                }
                                else if (new_code != NULL) {
                                    operand = (uintptr_t)new_code | 1;
                                }
                                else {
                                    operand = 0;
                                }
                                ADD_TO_TRACE(uop, oparg, operand, target);
                                code = new_code;
                                func = new_func;
                                instr = _PyCode_CODE(code);
                                DPRINTF(2,
                                        "Continuing in %s (%s:%d) at byte offset %d\n",
                                        PyUnicode_AsUTF8(code->co_qualname),
                                        PyUnicode_AsUTF8(code->co_filename),
                                        code->co_firstlineno,
                                        2 * INSTR_IP(instr, code));
                                goto top;
                            }
                            DPRINTF(2, "Bail, new_code == NULL\n");
                            ADD_TO_TRACE(uop, oparg, 0, target);
                            ADD_TO_TRACE(_DYNAMIC_EXIT, 0, 0, 0);
                            goto done;
                        }

                        // All other instructions
                        ADD_TO_TRACE(uop, oparg, operand, target);
                    }
                    break;
                }
                DPRINTF(2, "Unsupported opcode %s\n", _PyOpcode_OpName[opcode]);
                OPT_UNSUPPORTED_OPCODE(opcode);
                goto done;  // Break out of loop
            }  // End default

        }  // End switch (opcode)

        instr++;
        // Add cache size for opcode
        instr += _PyOpcode_Caches[_PyOpcode_Deopt[opcode]];
    }  // End for (;;)

done:
    while (trace_stack_depth > 0) {
        TRACE_STACK_POP();
    }
    assert(code == initial_code);
    // Skip short traces like _SET_IP, LOAD_FAST, _SET_IP, _EXIT_TRACE
    if (progress_needed || trace_length < 5) {
        OPT_STAT_INC(trace_too_short);
        DPRINTF(2,
                "No trace for %s (%s:%d) at byte offset %d (%s)\n",
                PyUnicode_AsUTF8(code->co_qualname),
                PyUnicode_AsUTF8(code->co_filename),
                code->co_firstlineno,
                2 * INSTR_IP(initial_instr, code),
                progress_needed ? "no progress" : "too short");
        return 0;
    }
    if (trace[trace_length-1].opcode != _JUMP_TO_TOP) {
        ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
    }
    DPRINTF(1,
            "Created a proto-trace for %s (%s:%d) at byte offset %d -- length %d\n",
            PyUnicode_AsUTF8(code->co_qualname),
            PyUnicode_AsUTF8(code->co_filename),
            code->co_firstlineno,
            2 * INSTR_IP(initial_instr, code),
            trace_length);
    OPT_HIST(trace_length, trace_length_hist);
    return trace_length;
}

#undef RESERVE
#undef RESERVE_RAW
#undef INSTR_IP
#undef ADD_TO_TRACE
#undef DPRINTF

#define UNSET_BIT(array, bit) (array[(bit)>>5] &= ~(1<<((bit)&31)))
#define SET_BIT(array, bit) (array[(bit)>>5] |= (1<<((bit)&31)))
#define BIT_IS_SET(array, bit) (array[(bit)>>5] & (1<<((bit)&31)))
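/* These treat `array` as a flat bit set of 32-bit words: bit `b` lives
 * in array[b >> 5] under mask 1 << (b & 31). E.g. bit 37 is word 1,
 * mask 1 << 5. */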

/* Count the number of exit uops (_EXIT_TRACE and _DYNAMIC_EXIT) in the buffer.
 */
static int
count_exits(_PyUOpInstruction *buffer, int length)
{
    int exit_count = 0;
    for (int i = 0; i < length; i++) {
        int opcode = buffer[i].opcode;
        if (opcode == _EXIT_TRACE || opcode == _DYNAMIC_EXIT) {
            exit_count++;
        }
    }
    return exit_count;
}

static void make_exit(_PyUOpInstruction *inst, int opcode, int target)
{
    inst->opcode = opcode;
    inst->oparg = 0;
    inst->operand = 0;
    inst->format = UOP_FORMAT_TARGET;
    inst->target = target;
}

/* Convert implicit exits, errors and deopts
 * into explicit ones. */
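/* Illustrative sketch (assumed flags, not literal output): given a
 * two-uop buffer whose first uop is a deopting guard (HAS_DEOPT_FLAG)
 * with target=12,
 *
 *     0: _GUARD_XXX    (format=UOP_FORMAT_TARGET, target=12)
 *     1: _EXIT_TRACE   (target=12)
 *
 * a _DEOPT stub is appended in the spare space past the end and the
 * guard is re-pointed at it:
 *
 *     0: _GUARD_XXX    (format=UOP_FORMAT_JUMP, jump_target=2)
 *     1: _EXIT_TRACE   (target=12)
 *     2: _DEOPT        (format=UOP_FORMAT_TARGET, target=12)
 *
 * Consecutive uops sharing the same exit kind and target reuse one
 * stub, and uops with HAS_ERROR_FLAG get _ERROR_POP_N stubs the same
 * way. */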
static int
prepare_for_execution(_PyUOpInstruction *buffer, int length)
{
    int32_t current_jump = -1;
    int32_t current_jump_target = -1;
    int32_t current_error = -1;
    int32_t current_error_target = -1;
    int32_t current_popped = -1;
    int32_t current_exit_op = -1;
    /* Leaving in NOPs slows down the interpreter and messes up the stats */
    _PyUOpInstruction *copy_to = &buffer[0];
    for (int i = 0; i < length; i++) {
        _PyUOpInstruction *inst = &buffer[i];
        if (inst->opcode != _NOP) {
            if (copy_to != inst) {
                *copy_to = *inst;
            }
            copy_to++;
        }
    }
    length = (int)(copy_to - buffer);
    int next_spare = length;
    for (int i = 0; i < length; i++) {
        _PyUOpInstruction *inst = &buffer[i];
        int opcode = inst->opcode;
        int32_t target = (int32_t)uop_get_target(inst);
        if (_PyUop_Flags[opcode] & (HAS_EXIT_FLAG | HAS_DEOPT_FLAG)) {
            uint16_t exit_op = (_PyUop_Flags[opcode] & HAS_EXIT_FLAG) ?
                _EXIT_TRACE : _DEOPT;
            int32_t jump_target = target;
            if (is_for_iter_test[opcode]) {
                /* Target the POP_TOP immediately after the END_FOR,
                 * leaving only the iterator on the stack. */
                int extended_arg = inst->oparg > 255;
                int32_t next_inst = target + 1 + INLINE_CACHE_ENTRIES_FOR_ITER + extended_arg;
                jump_target = next_inst + inst->oparg + 1;
            }
            if (jump_target != current_jump_target || current_exit_op != exit_op) {
                make_exit(&buffer[next_spare], exit_op, jump_target);
                current_exit_op = exit_op;
                current_jump_target = jump_target;
                current_jump = next_spare;
                next_spare++;
            }
            buffer[i].jump_target = current_jump;
            buffer[i].format = UOP_FORMAT_JUMP;
        }
        if (_PyUop_Flags[opcode] & HAS_ERROR_FLAG) {
            int popped = (_PyUop_Flags[opcode] & HAS_ERROR_NO_POP_FLAG) ?
                0 : _PyUop_num_popped(opcode, inst->oparg);
            if (target != current_error_target || popped != current_popped) {
                current_popped = popped;
                current_error = next_spare;
                current_error_target = target;
                make_exit(&buffer[next_spare], _ERROR_POP_N, 0);
                buffer[next_spare].oparg = popped;
                buffer[next_spare].operand = target;
                next_spare++;
            }
            buffer[i].error_target = current_error;
            if (buffer[i].format == UOP_FORMAT_TARGET) {
                buffer[i].format = UOP_FORMAT_JUMP;
                buffer[i].jump_target = 0;
            }
        }
    }
    return next_spare;
}

/* Executor side exits */

static _PyExecutorObject *
allocate_executor(int exit_count, int length)
{
    int size = exit_count*sizeof(_PyExitData) + length*sizeof(_PyUOpInstruction);
    _PyExecutorObject *res = PyObject_GC_NewVar(_PyExecutorObject, &_PyUOpExecutor_Type, size);
    if (res == NULL) {
        return NULL;
    }
    res->trace = (_PyUOpInstruction *)(res->exits + exit_count);
    res->code_size = length;
    res->exit_count = exit_count;
    return res;
}
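/* Layout note: _PyUOpExecutor_Type has tp_itemsize == 1, so `size` above
 * is a raw byte count covering the variable-length tail: the _PyExitData
 * array comes first, then the trace, which is why res->trace points just
 * past res->exits[exit_count]. */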

#ifdef Py_DEBUG

#define CHECK(PRED) \
    if (!(PRED)) { \
        printf(#PRED " at %d\n", i); \
        assert(0); \
    }

static int
target_unused(int opcode)
{
    return (_PyUop_Flags[opcode] & (HAS_ERROR_FLAG | HAS_EXIT_FLAG | HAS_DEOPT_FLAG)) == 0;
}

static void
sanity_check(_PyExecutorObject *executor)
{
    for (uint32_t i = 0; i < executor->exit_count; i++) {
        _PyExitData *exit = &executor->exits[i];
        CHECK(exit->target < (1 << 25));
    }
    bool ended = false;
    uint32_t i = 0;
    CHECK(executor->trace[0].opcode == _START_EXECUTOR || executor->trace[0].opcode == _COLD_EXIT);
    for (; i < executor->code_size; i++) {
        const _PyUOpInstruction *inst = &executor->trace[i];
        uint16_t opcode = inst->opcode;
        CHECK(opcode <= MAX_UOP_ID);
        CHECK(_PyOpcode_uop_name[opcode] != NULL);
        switch(inst->format) {
            case UOP_FORMAT_TARGET:
                CHECK(target_unused(opcode));
                break;
            case UOP_FORMAT_EXIT:
                CHECK(opcode == _EXIT_TRACE);
                CHECK(inst->exit_index < executor->exit_count);
                break;
            case UOP_FORMAT_JUMP:
                CHECK(inst->jump_target < executor->code_size);
                break;
            case UOP_FORMAT_UNUSED:
                CHECK(0);
                break;
        }
        if (_PyUop_Flags[opcode] & HAS_ERROR_FLAG) {
            CHECK(inst->format == UOP_FORMAT_JUMP);
            CHECK(inst->error_target < executor->code_size);
        }
        if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE || opcode == _COLD_EXIT) {
            ended = true;
            i++;
            break;
        }
    }
    CHECK(ended);
    for (; i < executor->code_size; i++) {
        const _PyUOpInstruction *inst = &executor->trace[i];
        uint16_t opcode = inst->opcode;
        CHECK(
            opcode == _DEOPT ||
            opcode == _EXIT_TRACE ||
            opcode == _ERROR_POP_N);
        if (opcode == _EXIT_TRACE) {
            CHECK(inst->format == UOP_FORMAT_EXIT);
        }
    }
}

#undef CHECK
#endif
/* Makes an executor from a buffer of uops.
 * The buffer has already been compacted by prepare_for_execution(),
 * which strips NOPs and appends explicit exit/error stubs, so the uops
 * are copied verbatim (back to front) and each exit is wired up to a
 * cold exit executor.
 */
static _PyExecutorObject *
make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFilter *dependencies)
{
    int exit_count = count_exits(buffer, length);
    _PyExecutorObject *executor = allocate_executor(exit_count, length);
    if (executor == NULL) {
        return NULL;
    }

    /* Initialize exits */
    assert(exit_count < COLD_EXIT_COUNT);
    for (int i = 0; i < exit_count; i++) {
        executor->exits[i].executor = &COLD_EXITS[i];
        executor->exits[i].temperature = initial_temperature_backoff_counter();
    }
    int next_exit = exit_count-1;
    _PyUOpInstruction *dest = (_PyUOpInstruction *)&executor->trace[length];
    assert(buffer[0].opcode == _START_EXECUTOR);
    buffer[0].operand = (uint64_t)executor;
    for (int i = length-1; i >= 0; i--) {
        int opcode = buffer[i].opcode;
        dest--;
        *dest = buffer[i];
        assert(opcode != _POP_JUMP_IF_FALSE && opcode != _POP_JUMP_IF_TRUE);
        if (opcode == _EXIT_TRACE) {
            executor->exits[next_exit].target = buffer[i].target;
            dest->exit_index = next_exit;
            dest->format = UOP_FORMAT_EXIT;
            next_exit--;
        }
        if (opcode == _DYNAMIC_EXIT) {
            executor->exits[next_exit].target = 0;
            dest->oparg = next_exit;
            next_exit--;
        }
    }
    assert(next_exit == -1);
    assert(dest == executor->trace);
    assert(dest->opcode == _START_EXECUTOR);
    _Py_ExecutorInit(executor, dependencies);
#ifdef Py_DEBUG
    char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
    int lltrace = 0;
    if (python_lltrace != NULL && *python_lltrace >= '0') {
        lltrace = *python_lltrace - '0';  // TODO: Parse an int and all that
    }
    if (lltrace >= 2) {
        printf("Optimized trace (length %d):\n", length);
        for (int i = 0; i < length; i++) {
            printf("%4d OPTIMIZED: ", i);
            _PyUOpPrint(&executor->trace[i]);
            printf("\n");
        }
    }
    sanity_check(executor);
#endif
#ifdef _Py_JIT
    executor->jit_code = NULL;
    executor->jit_side_entry = NULL;
    executor->jit_size = 0;
    if (_PyJIT_Compile(executor, executor->trace, length)) {
        Py_DECREF(executor);
        return NULL;
    }
#endif
    _PyObject_GC_TRACK(executor);
    return executor;
}

static int
init_cold_exit_executor(_PyExecutorObject *executor, int oparg)
{
    _Py_SetImmortalUntracked((PyObject *)executor);
    Py_SET_TYPE(executor, &_PyUOpExecutor_Type);
    executor->trace = (_PyUOpInstruction *)executor->exits;
    executor->code_size = 1;
    executor->exit_count = 0;
    _PyUOpInstruction *inst = (_PyUOpInstruction *)&executor->trace[0];
    inst->opcode = _COLD_EXIT;
    inst->oparg = oparg;
    executor->vm_data.valid = true;
    executor->vm_data.linked = false;
    for (int i = 0; i < BLOOM_FILTER_WORDS; i++) {
        assert(executor->vm_data.bloom.bits[i] == 0);
    }
#ifdef Py_DEBUG
    sanity_check(executor);
#endif
#ifdef _Py_JIT
    executor->jit_code = NULL;
    executor->jit_side_entry = NULL;
    executor->jit_size = 0;
    if (_PyJIT_Compile(executor, executor->trace, 1)) {
        return -1;
    }
#endif
    return 0;
}

#ifdef Py_STATS
/* Returns the effective trace length.
 * Ignores NOPs and trailing exit and error handling. */
int effective_trace_length(_PyUOpInstruction *buffer, int length)
{
    int nop_count = 0;
    for (int i = 0; i < length; i++) {
        int opcode = buffer[i].opcode;
        if (opcode == _NOP) {
            nop_count++;
        }
        if (opcode == _EXIT_TRACE ||
            opcode == _JUMP_TO_TOP ||
            opcode == _COLD_EXIT) {
            return i+1-nop_count;
        }
    }
    Py_FatalError("No terminating instruction");
    Py_UNREACHABLE();
}
#endif

static int
uop_optimize(
    _PyOptimizerObject *self,
    _PyInterpreterFrame *frame,
    _Py_CODEUNIT *instr,
    _PyExecutorObject **exec_ptr,
    int curr_stackentries)
{
    _PyBloomFilter dependencies;
    _Py_BloomFilter_Init(&dependencies);
    _PyUOpInstruction buffer[UOP_MAX_TRACE_LENGTH];
    OPT_STAT_INC(attempts);
    int length = translate_bytecode_to_trace(frame, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies);
    if (length <= 0) {
        // Error or nothing translated
        return length;
    }
    assert(length < UOP_MAX_TRACE_LENGTH);
    OPT_STAT_INC(traces_created);
    char *env_var = Py_GETENV("PYTHON_UOPS_OPTIMIZE");
    if (env_var == NULL || *env_var == '\0' || *env_var > '0') {
        length = _Py_uop_analyze_and_optimize(frame, buffer,
                                              length,
                                              curr_stackentries, &dependencies);
        if (length <= 0) {
            return length;
        }
    }
    assert(length < UOP_MAX_TRACE_LENGTH);
    assert(length >= 1);
    /* Fix up */
    for (int pc = 0; pc < length; pc++) {
        int opcode = buffer[pc].opcode;
        int oparg = buffer[pc].oparg;
        if (_PyUop_Flags[opcode] & HAS_OPARG_AND_1_FLAG) {
            buffer[pc].opcode = opcode + 1 + (oparg & 1);
        }
        else if (oparg < _PyUop_Replication[opcode]) {
            buffer[pc].opcode = opcode + oparg + 1;
        }
        else if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
            break;
        }
        assert(_PyOpcode_uop_name[buffer[pc].opcode]);
        assert(strncmp(_PyOpcode_uop_name[buffer[pc].opcode], _PyOpcode_uop_name[opcode], strlen(_PyOpcode_uop_name[opcode])) == 0);
    }
    OPT_HIST(effective_trace_length(buffer, length), optimized_trace_length_hist);
    length = prepare_for_execution(buffer, length);
    assert(length <= UOP_MAX_TRACE_LENGTH);
    _PyExecutorObject *executor = make_executor_from_uops(buffer, length, &dependencies);
    if (executor == NULL) {
        return -1;
    }
    assert(length <= UOP_MAX_TRACE_LENGTH);
    *exec_ptr = executor;
    return 1;
}

static void
uop_opt_dealloc(PyObject *self) {
    PyObject_Free(self);
}

PyTypeObject _PyUOpOptimizer_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    .tp_name = "uop_optimizer",
    .tp_basicsize = sizeof(_PyOptimizerObject),
    .tp_itemsize = 0,
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
    .tp_dealloc = uop_opt_dealloc,
};

PyObject *
_PyOptimizer_NewUOpOptimizer(void)
{
    _PyOptimizerObject *opt = PyObject_New(_PyOptimizerObject, &_PyUOpOptimizer_Type);
    if (opt == NULL) {
        return NULL;
    }
    opt->optimize = uop_optimize;
    return (PyObject *)opt;
}

static void
counter_dealloc(_PyExecutorObject *self) {
    /* The optimizer is the operand of the second uop. */
    PyObject *opt = (PyObject *)self->trace[1].operand;
    Py_DECREF(opt);
    uop_dealloc(self);
}

PyTypeObject _PyCounterExecutor_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    .tp_name = "counting_executor",
    .tp_basicsize = offsetof(_PyExecutorObject, exits),
    .tp_itemsize = 1,
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC,
    .tp_dealloc = (destructor)counter_dealloc,
    .tp_methods = executor_methods,
    .tp_traverse = executor_traverse,
    .tp_clear = (inquiry)executor_clear,
};

static int
counter_optimize(
    _PyOptimizerObject* self,
    _PyInterpreterFrame *frame,
    _Py_CODEUNIT *instr,
    _PyExecutorObject **exec_ptr,
    int Py_UNUSED(curr_stackentries)
)
{
    PyCodeObject *code = _PyFrame_GetCode(frame);
    int oparg = instr->op.arg;
    while (instr->op.code == EXTENDED_ARG) {
        instr++;
        oparg = (oparg << 8) | instr->op.arg;
    }
    if (instr->op.code != JUMP_BACKWARD) {
        /* Counter optimizer can only handle backward edges */
        return 0;
    }
    _Py_CODEUNIT *target = instr + 1 + _PyOpcode_Caches[JUMP_BACKWARD] - oparg;
    _PyUOpInstruction buffer[4] = {
        { .opcode = _START_EXECUTOR, .jump_target = 3, .format=UOP_FORMAT_JUMP },
        { .opcode = _LOAD_CONST_INLINE_BORROW, .operand = (uintptr_t)self },
        { .opcode = _INTERNAL_INCREMENT_OPT_COUNTER },
        { .opcode = _EXIT_TRACE, .target = (uint32_t)(target - _PyCode_CODE(code)), .format=UOP_FORMAT_TARGET }
    };
    _PyExecutorObject *executor = make_executor_from_uops(buffer, 4, &EMPTY_FILTER);
    if (executor == NULL) {
        return -1;
    }
    Py_INCREF(self);
    Py_SET_TYPE(executor, &_PyCounterExecutor_Type);
    *exec_ptr = executor;
    return 1;
}

static PyObject *
counter_get_counter(PyObject *self, PyObject *args)
{
    return PyLong_FromLongLong(((_PyCounterOptimizerObject *)self)->count);
}

static PyMethodDef counter_optimizer_methods[] = {
    { "get_count", counter_get_counter, METH_NOARGS, NULL },
    { NULL, NULL },
};

PyTypeObject _PyCounterOptimizer_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    .tp_name = "Counter optimizer",
    .tp_basicsize = sizeof(_PyCounterOptimizerObject),
    .tp_itemsize = 0,
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
    .tp_methods = counter_optimizer_methods,
    .tp_dealloc = (destructor)PyObject_Del,
};

PyObject *
_PyOptimizer_NewCounter(void)
{
    _PyCounterOptimizerObject *opt = (_PyCounterOptimizerObject *)_PyObject_New(&_PyCounterOptimizer_Type);
    if (opt == NULL) {
        return NULL;
    }
    opt->base.optimize = counter_optimize;
    opt->count = 0;
    return (PyObject *)opt;
}


/*****************************************
 *     Executor management
 ****************************************/

/* We use a bloomfilter with k = 6, m = 256
 * The choice of k and the following constants
 * could do with a more rigorous analysis,
 * but here is a simple analysis:
 *
 * We want to keep the false positive rate low.
 * For n = 5 (a trace depends on 5 objects),
 * we expect 30 bits set, giving a false positive
 * rate of (30/256)**6 == 2.5e-6 which is plenty
 * good enough.
 *
 * However with n = 10 we expect 60 bits set (worst case),
 * giving a false positive of (60/256)**6 == 0.0001
 *
 * We choose k = 6, rather than a higher number as
 * it means the false positive rate grows slower for high n.
 *
 * n = 5, k = 6 => fp = 2.6e-6
 * n = 5, k = 8 => fp = 3.5e-7
 * n = 10, k = 6 => fp = 1.6e-4
 * n = 10, k = 8 => fp = 0.9e-4
 * n = 15, k = 6 => fp = 0.18%
 * n = 15, k = 8 => fp = 0.23%
 * n = 20, k = 6 => fp = 1.1%
 * n = 20, k = 8 => fp = 2.3%
 *
 * The above analysis assumes perfect hash functions,
 * but those don't exist, so the real false positive
 * rates may be worse.
 */
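/* The numbers above follow from the usual Bloom filter estimate: with
 * m = 256 bits and k hashes, n entries set at most k*n bits (ignoring
 * collisions), and a false positive requires all k probe bits to be
 * set, giving fp ~= (k*n/m)**k. E.g. n = 5, k = 6: (30/256)**6 ~= 2.5e-6. */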

#define K 6

#define SEED 20221211

/* TO DO -- Use more modern hash functions with better distribution of bits */
static uint64_t
address_to_hash(void *ptr) {
    assert(ptr != NULL);
    uint64_t uhash = SEED;
    uintptr_t addr = (uintptr_t)ptr;
    for (int i = 0; i < SIZEOF_VOID_P; i++) {
        uhash ^= addr & 255;
        uhash *= (uint64_t)PyHASH_MULTIPLIER;
        addr >>= 8;
    }
    return uhash;
}

void
_Py_BloomFilter_Init(_PyBloomFilter *bloom)
{
    for (int i = 0; i < BLOOM_FILTER_WORDS; i++) {
        bloom->bits[i] = 0;
    }
}

/* We want K hash functions that each set 1 bit.
 * A hash function that sets 1 bit in M bits can be trivially
 * derived from a log2(M) bit hash function.
 * So we extract 8 (log2(256)) bits at a time from
 * the 64bit hash. */
void
_Py_BloomFilter_Add(_PyBloomFilter *bloom, void *ptr)
{
    uint64_t hash = address_to_hash(ptr);
    assert(K <= 8);
    for (int i = 0; i < K; i++) {
        uint8_t bits = hash & 255;
        bloom->bits[bits >> 5] |= (1 << (bits&31));
        hash >>= 8;
    }
}
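/* Worked example for _Py_BloomFilter_Add(): if the hash's low byte is
 * 0x2c (44), the first probe sets bit 44 & 31 == 12 of word 44 >> 5 == 1;
 * the hash is then shifted right by 8 and the next index extracted,
 * K = 6 times in all. */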

static bool
bloom_filter_may_contain(_PyBloomFilter *bloom, _PyBloomFilter *hashes)
{
    for (int i = 0; i < BLOOM_FILTER_WORDS; i++) {
        if ((bloom->bits[i] & hashes->bits[i]) != hashes->bits[i]) {
            return false;
        }
    }
    return true;
}

static void
link_executor(_PyExecutorObject *executor)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    _PyExecutorLinkListNode *links = &executor->vm_data.links;
    _PyExecutorObject *head = interp->executor_list_head;
    if (head == NULL) {
        interp->executor_list_head = executor;
        links->previous = NULL;
        links->next = NULL;
    }
    else {
        assert(head->vm_data.links.previous == NULL);
        links->previous = NULL;
        links->next = head;
        head->vm_data.links.previous = executor;
        interp->executor_list_head = executor;
    }
    executor->vm_data.linked = true;
    /* executor_list_head must be first in list */
    assert(interp->executor_list_head->vm_data.links.previous == NULL);
}

static void
unlink_executor(_PyExecutorObject *executor)
{
    if (!executor->vm_data.linked) {
        return;
    }
    _PyExecutorLinkListNode *links = &executor->vm_data.links;
    assert(executor->vm_data.valid);
    _PyExecutorObject *next = links->next;
    _PyExecutorObject *prev = links->previous;
    if (next != NULL) {
        next->vm_data.links.previous = prev;
    }
    if (prev != NULL) {
        prev->vm_data.links.next = next;
    }
    else {
        // prev == NULL implies that executor is the list head
        PyInterpreterState *interp = PyInterpreterState_Get();
        assert(interp->executor_list_head == executor);
        interp->executor_list_head = next;
    }
    executor->vm_data.linked = false;
}

/* This must be called by optimizers before using the executor */
void
_Py_ExecutorInit(_PyExecutorObject *executor, const _PyBloomFilter *dependency_set)
{
    executor->vm_data.valid = true;
    for (int i = 0; i < BLOOM_FILTER_WORDS; i++) {
        executor->vm_data.bloom.bits[i] = dependency_set->bits[i];
    }
    link_executor(executor);
}

/* Detaches the executor from the code object (if any) that
 * holds a reference to it */
void
_Py_ExecutorDetach(_PyExecutorObject *executor)
{
    PyCodeObject *code = executor->vm_data.code;
    if (code == NULL) {
        return;
    }
    _Py_CODEUNIT *instruction = &_PyCode_CODE(code)[executor->vm_data.index];
    assert(instruction->op.code == ENTER_EXECUTOR);
    int index = instruction->op.arg;
    assert(code->co_executors->executors[index] == executor);
    instruction->op.code = executor->vm_data.opcode;
    instruction->op.arg = executor->vm_data.oparg;
    executor->vm_data.code = NULL;
    code->co_executors->executors[index] = NULL;
    Py_DECREF(executor);
}

static int
executor_clear(_PyExecutorObject *executor)
{
    if (!executor->vm_data.valid) {
        return 0;
    }
    assert(executor->vm_data.valid == 1);
    unlink_executor(executor);
    executor->vm_data.valid = 0;
    /* It is possible for an executor to form a reference
     * cycle with itself, so decref'ing a side exit could
     * free the executor unless we hold a strong reference to it
     */
    Py_INCREF(executor);
    for (uint32_t i = 0; i < executor->exit_count; i++) {
        const _PyExecutorObject *cold = &COLD_EXITS[i];
        const _PyExecutorObject *side = executor->exits[i].executor;
        executor->exits[i].temperature = initial_unreachable_backoff_counter();
        if (side != cold) {
            executor->exits[i].executor = cold;
            Py_DECREF(side);
        }
    }
    _Py_ExecutorDetach(executor);
    Py_DECREF(executor);
    return 0;
}

void
_Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj)
{
    assert(executor->vm_data.valid);
    _Py_BloomFilter_Add(&executor->vm_data.bloom, obj);
}

/* Invalidate all executors that depend on `obj`
 * May cause other executors to be invalidated as well
 */
void
_Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation)
{
    _PyBloomFilter obj_filter;
    _Py_BloomFilter_Init(&obj_filter);
    _Py_BloomFilter_Add(&obj_filter, obj);
    /* Walk the list of executors */
    /* TO DO -- Use a tree to avoid traversing as many objects */
    bool no_memory = false;
    PyObject *invalidate = PyList_New(0);
    if (invalidate == NULL) {
        PyErr_Clear();
        no_memory = true;
    }
    /* Clearing an executor can deallocate others, so we need to make a list of
     * executors to invalidate first */
    for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) {
        assert(exec->vm_data.valid);
        _PyExecutorObject *next = exec->vm_data.links.next;
        if (bloom_filter_may_contain(&exec->vm_data.bloom, &obj_filter)) {
            unlink_executor(exec);
            if (no_memory) {
                exec->vm_data.valid = 0;
            } else {
                if (PyList_Append(invalidate, (PyObject *)exec) < 0) {
                    PyErr_Clear();
                    no_memory = true;
                    exec->vm_data.valid = 0;
                }
            }
            if (is_invalidation) {
                OPT_STAT_INC(executors_invalidated);
            }
        }
        exec = next;
    }
    if (invalidate != NULL) {
        for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) {
            _PyExecutorObject *exec = (_PyExecutorObject *)PyList_GET_ITEM(invalidate, i);
            executor_clear(exec);
        }
        Py_DECREF(invalidate);
    }
    return;
}

/* Invalidate all executors */
void
_Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation)
{
    while (interp->executor_list_head) {
        _PyExecutorObject *executor = interp->executor_list_head;
        assert(executor->vm_data.valid == 1 && executor->vm_data.linked == 1);
        if (executor->vm_data.code) {
            // Clear the entire code object so its co_executors array can be freed:
            _PyCode_Clear_Executors(executor->vm_data.code);
        }
        else {
            executor_clear(executor);
        }
        if (is_invalidation) {
            OPT_STAT_INC(executors_invalidated);
        }
    }
}

#endif /* _Py_TIER2 */
