1 /*
2 * Copyright (c) 2017 Lima Project
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sub license,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the
12 * next paragraph) shall be included in all copies or substantial portions
13 * of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 *
23 */
24
25 #include <limits.h>
26
27 #include "gpir.h"
28
29 /*
30 * GP scheduling algorithm (by Connor Abbott <cwabbott0@gmail.com>)
31 *
32 * The GP pipeline has three main stages:
33 *
34 * --------------------------------------------------------
35 * | |
36 * | Register/Attr/Temp Fetch |
37 * | |
38 * --------------------------------------------------------
39 * | | | | | | |
40 * | Mul0 | Mul1 | Add0 | Add1 | Cplx | Pass |
41 * | | | | | | |
42 * --------------------------------------------------------
43 * | | | |
44 * | Complex1 | Temp/Register/Varying | Pass |
45 * | Stage 2 | Store | Stage 2 |
46 * | | | |
47 * --------------------------------------------------------
48 *
49 * Because of this setup, storing a register has a latency of three cycles.
50 * Also, the register file is organized into 4-component vectors, and the
51 * load stage can only load two vectors at a time. Aside from these highly
52 * constrained register load/store units, there is an explicit bypass
53 * network, where each unit (mul0/mul1/etc.) can access the results of the
54 * any unit from the previous two cycles directly, except for the complex
55 * unit whose result can only be accessed for one cycle (since it's expected
56 * to be used directly by the complex2 instruction in the following cycle).
57 *
58 * Because of the very restricted register file, and because only rarely are
59 * all the units in use at the same time, it can be very beneficial to use
60 * the unused units to "thread" a value from source to destination by using
61 * moves in the otherwise-unused units, without involving the register file
62 * at all. It's very difficult to fully exploit this with a traditional
63 * scheduler, so we need to do something a little un-traditional. The 512
64 * instruction limit means that for more complex shaders, we need to do as
65 * well as possible or else the app won't even work.
66 *
67 * The scheduler works by considering the bypass network as a kind of
68 * register file. It's a quite unusual register file, since registers have to
69 * be assigned "on the fly" as we schedule operations, but with some care, we
70 * can use something conceptually similar to a linear-scan allocator to
71 * successfully schedule nodes to instructions without running into
72 * conflicts.
73 *
74 * Values in the IR are separated into normal values, or "value registers",
75 * which is what normal nodes like add, mul, etc. produce, and which only
76 * live inside one basic block, and registers, which can span multiple basic
77 * blocks but have to be accessed via special load_reg/store_reg nodes. RA
78 * assigns physical registers to both value registers and normal registers,
79 * treating load_reg/store_reg as a move instruction, but these are only used
80 * directly for normal registers -- the physreg assigned to a value register
81 * is "fake," and is only used inside the scheduler. Before scheduling we
82 * insert read-after-write dependencies, even for value registers, as if
83 * we're going to use those, but then we throw them away. For example, if we
84 * had something like:
85 *
86 * (*)r2 = add (*)r1, (*)r2
87 * (*)r1 = load_reg r0
88 *
89 * we'd insert a write-after-read dependency between the add and load_reg,
90 * even though the starred registers aren't actually used by the scheduler
91 * after this step. This step is crucial since it guarantees that during any
92 * point in the schedule, the number of live registers + live value registers
93 * will never exceed the capacity of the register file and the bypass network
94 * combined. This is because each live register/value register will have a
95 * different fake number, thanks to the fake dependencies inserted before
96 * scheduling. This allows us to not have to worry about spilling to
97 * temporaries, which is only done ahead of time.
98 *
99 * The scheduler is a bottom-up scheduler. It keeps track of each live value
100 * register, and decides on-the-fly which value registers to keep in the
101 * bypass network and which to "spill" to registers. Of particular importance
102 * is the "ready list," which consists of "input nodes" (nodes that produce a
103 * value that can be consumed via the bypass network), both "partially ready"
104 * (only some of the uses have been scheduled) and "fully ready" (all uses
105 * have been scheduled), as well as other non-input nodes like register
106 * stores. Each input node on the ready list represents a live value register
107 * before the current instruction. There must be at most 11 such input nodes
108 * at all times, since there are only 11 slots in the next two instructions
109 * which can reach the current instruction.
110 *
111 * An input node is a "max node" if it has a use two cycles ago, which must be
112 * connected to a definition this cycle. Otherwise it may be a "next max node"
113 * if it will be a max node on the next instruction (i.e. it has a use at most
114 * one cycle ago), or it may be neither if all of its uses are this cycle. As
115 * we keep adding instructions to the front, input nodes graduate from
116 * neither, to next max, to max, unless we decide to insert a move to keep it
117 * alive longer, at which point any uses after the current instruction are
118 * rewritten to be uses of the move so that the original node returns to
119 * neither. The scheduler decides which nodes to try freely, but we have to
120 * reserve slots for two different reasons: (1) out of the 5 non-complex
121 * slots, we reserve a slot for each max node, so that we can connect a
122 * definition to the use 2 cycles ago. (2) Out of all 6 slots, we reserve a
123 * slot for every next-max node above 5, so that for the next instruction
124 * there are no more than 5 max nodes. When a max or next-max node gets
125 * scheduled, the corresponding reservation is reduced by one. At the end, we
126 * insert moves for every slot that was reserved. The reservation is actually
127 * managed by nir_instr, and all we have to do is tell it how many to reserve
128 * at the beginning and then tell it which nodes are max/next-max nodes. When
129 * we start scheduling an instruction, there will be at most 5 max nodes
130 * thanks to the previous instruction's next-max reservation/move insertion.
131 * Since there are at most 11 total input nodes, if there are N max nodes,
132 * there are at most 11 - N next-max nodes, and therefore at most 11 - N - 5 =
133 * 6 - N slots need to be reserved for next-max nodes, and so at most
134 * 6 - N + N = 6 slots need to be reserved in total, exactly the total number
135 * of slots. So, thanks to the total input node restriction, we will never
136 * need to reserve too many slots.
137 *
138 * It sometimes happens that scheduling a given node will violate this total
139 * input node restriction, or that a reservation will mean that we can't
140 * schedule it. We first schedule a node "speculatively" to see if this is a
141 * problem. If some of the node's sources are loads, then we can schedule
142 * the node and its dependent loads in one swoop to avoid going over the
143 * pressure limit. If that fails, we can try to spill a ready or
144 * partially-ready input node to a register by rewriting all of its uses to
145 * refer to a register load. This removes it from the list of ready and
146 * partially ready input nodes as all of its uses are now unscheduled. If
147 * successful, we can then proceed with scheduling the original node. All of
148 * this happens "speculatively," meaning that afterwards the node is removed
149 * and the entire state of the scheduler is reverted to before it was tried, to
150 * ensure that we never get into an invalid state and run out of spots for
151 * moves. In try_nodes(), we try to schedule each node speculatively on the
152 * ready list, keeping only the nodes that could be successfully scheduled, so
153 * that when we finally decide which node to actually schedule, we know it
154 * will succeed. This is how we decide on the fly which values go in
155 * registers and which go in the bypass network. Note that "unspilling" a
156 * value is simply a matter of scheduling the store_reg instruction created
157 * when we spill.
158 *
159 * The careful accounting of live value registers, reservations for moves, and
160 * speculative scheduling guarantee that we never run into a failure case
161 * while scheduling. However, we need to make sure that this scheduler will
162 * not get stuck in an infinite loop, i.e. that we'll always make forward
163 * progress by eventually scheduling a non-move node. If we run out of value
164 * registers, then we may have to spill a node to a register. If we
165 * were to schedule one of the fully-ready nodes, then we'd have 11 + N live
166 * value registers before the current instruction. But since there are at most
167 * 64+11 live registers and register values total thanks to the fake
168 * dependencies we inserted before scheduling, there are at most 64 - N live
169 * physical registers, and therefore there are at least N registers available
170 * for spilling. Not all these registers will be available immediately, since
171 * in order to spill a node to a given register we have to ensure that there
172 * are slots available to rewrite every use to a load instruction, and that
173 * may not be the case. There may also be intervening writes which prevent
174 * some registers from being used. However, these are all temporary problems,
175 * since as we create each instruction, we create additional register load
176 * slots that can be freely used for spilling, and we create more move nodes
177 * which means that the uses of the nodes we're trying to spill keep moving
178 * forward. This means that eventually, these problems will go away, at which
179 * point we'll be able to spill a node successfully, so eventually we'll be
180 * able to schedule the first node on the ready list.
181 */
182
183 typedef struct {
184 /* This is the list of ready and partially-ready nodes. A partially-ready
185 * node must have at least one input dependency already scheduled.
186 */
187 struct list_head ready_list;
188
189 /* The number of ready or partially-ready nodes with at least one input
190 * dependency already scheduled. In other words, the number of live value
191 * registers. This must be at most 11.
192 */
193 int ready_list_slots;
194
195 /* The physical registers live into the current instruction. */
196 uint64_t live_physregs;
197
198 /* The current instruction. */
199 gpir_instr *instr;
200
201 /* The current basic block. */
202 gpir_block *block;
203
204 /* True if at least one node failed to schedule due to lack of available
205 * value registers.
206 */
207 bool try_spill_all;
208
209 /* The number of max nodes needed to spill to successfully schedule the
210 * instruction.
211 */
212 int max_node_spill_needed;
213
214 /* The number of max and next-max nodes needed to spill to successfully
215 * schedule the instruction.
216 */
217 int total_spill_needed;
218
219 /* For each physical register, a linked list of loads associated with it in
220 * this block. When we spill a value to a given register, and there are
221 * existing loads associated with it that haven't been scheduled yet, we
222 * have to make sure that the corresponding unspill happens after the last
223 * original use has happened, i.e. is scheduled before.
224 */
225 struct list_head physreg_reads[GPIR_PHYSICAL_REG_NUM];
226 } sched_ctx;
227
gpir_min_dist_alu(gpir_dep * dep)228 static int gpir_min_dist_alu(gpir_dep *dep)
229 {
230 switch (dep->pred->op) {
231 case gpir_op_load_uniform:
232 case gpir_op_load_temp:
233 case gpir_op_load_reg:
234 case gpir_op_load_attribute:
235 return 0;
236
237 case gpir_op_complex1:
238 return 2;
239
240 default:
241 return 1;
242 }
243 }
244
gpir_get_min_dist(gpir_dep * dep)245 static int gpir_get_min_dist(gpir_dep *dep)
246 {
247 switch (dep->type) {
248 case GPIR_DEP_INPUT:
249 switch (dep->succ->op) {
250 case gpir_op_store_temp:
251 case gpir_op_store_reg:
252 case gpir_op_store_varying:
253 /* Stores must use an alu node as input. Also, complex1 takes two
254 * cycles, which means that its result cannot be stored to a register
255 * as part of the normal path, and therefore it must also have a move
256 * inserted.
257 */
258 if (dep->pred->type == gpir_node_type_load ||
259 dep->pred->op == gpir_op_complex1)
260 return INT_MAX >> 2;
261 else
262 return 0;
263
264 default:
265 return gpir_min_dist_alu(dep);
266 }
267
268 case GPIR_DEP_OFFSET:
269 assert(dep->succ->op == gpir_op_store_temp);
270 return gpir_min_dist_alu(dep);
271
272 case GPIR_DEP_READ_AFTER_WRITE:
273 if (dep->succ->op == gpir_op_load_temp &&
274 dep->pred->op == gpir_op_store_temp) {
275 return 4;
276 } else if (dep->succ->op == gpir_op_load_reg &&
277 dep->pred->op == gpir_op_store_reg) {
278 return 3;
279 } else if ((dep->pred->op == gpir_op_store_temp_load_off0 ||
280 dep->pred->op == gpir_op_store_temp_load_off1 ||
281 dep->pred->op == gpir_op_store_temp_load_off2) &&
282 dep->succ->op == gpir_op_load_uniform) {
283 return 4;
284 } else {
285 /* Fake dependency */
286 return 0;
287 }
288
289 case GPIR_DEP_WRITE_AFTER_READ:
290 return 0;
291 }
292
293 return 0;
294 }
295
gpir_max_dist_alu(gpir_dep * dep)296 static int gpir_max_dist_alu(gpir_dep *dep)
297 {
298 switch (dep->pred->op) {
299 case gpir_op_load_uniform:
300 case gpir_op_load_temp:
301 return 0;
302 case gpir_op_load_attribute:
303 return 1;
304 case gpir_op_load_reg:
305 if (dep->pred->sched.pos < GPIR_INSTR_SLOT_REG0_LOAD0 ||
306 dep->pred->sched.pos > GPIR_INSTR_SLOT_REG0_LOAD3)
307 return 0;
308 else
309 return 1;
310 case gpir_op_exp2_impl:
311 case gpir_op_log2_impl:
312 case gpir_op_rcp_impl:
313 case gpir_op_rsqrt_impl:
314 case gpir_op_store_temp_load_off0:
315 case gpir_op_store_temp_load_off1:
316 case gpir_op_store_temp_load_off2:
317 return 1;
318 case gpir_op_mov:
319 if (dep->pred->sched.pos == GPIR_INSTR_SLOT_COMPLEX)
320 return 1;
321 else
322 return 2;
323 default:
324 return 2;
325 }
326 }
327
gpir_get_max_dist(gpir_dep * dep)328 static int gpir_get_max_dist(gpir_dep *dep)
329 {
330 switch (dep->type) {
331 case GPIR_DEP_INPUT:
332 switch (dep->succ->op) {
333 case gpir_op_store_temp:
334 case gpir_op_store_reg:
335 case gpir_op_store_varying:
336 return 0;
337
338 default:
339 return gpir_max_dist_alu(dep);
340 }
341
342 case GPIR_DEP_OFFSET:
343 assert(dep->succ->op == gpir_op_store_temp);
344 return gpir_max_dist_alu(dep);
345
346 default:
347 return INT_MAX >> 2; /* Don't want to overflow... */
348 }
349 }
350
schedule_update_distance(gpir_node * node)351 static void schedule_update_distance(gpir_node *node)
352 {
353 if (gpir_node_is_leaf(node)) {
354 node->sched.dist = 0;
355 return;
356 }
357
358 gpir_node_foreach_pred(node, dep) {
359 gpir_node *pred = dep->pred;
360
361 if (pred->sched.dist < 0)
362 schedule_update_distance(pred);
363
364 int dist = pred->sched.dist + gpir_min_dist_alu(dep);
365 if (node->sched.dist < dist)
366 node->sched.dist = dist;
367 }
368 }
369
gpir_is_input_node(gpir_node * node)370 static bool gpir_is_input_node(gpir_node *node)
371 {
372 gpir_node_foreach_succ(node, dep) {
373 if (dep->type == GPIR_DEP_INPUT)
374 return true;
375 }
376 return false;
377 }
378
379
380 /* Get the number of slots required for a node on the ready list.
381 */
gpir_get_slots_required(gpir_node * node)382 static int gpir_get_slots_required(gpir_node *node)
383 {
384 if (!gpir_is_input_node(node))
385 return 0;
386
387 /* Note that we assume every node only consumes one slot, even dual-slot
388 * instructions. While dual-slot instructions may consume more than one
389 * slot, we can always safely insert a move if it turns out that there
390 * isn't enough space for them. There's the risk that we get stuck in an
391 * infinite loop if all the fully ready nodes are dual-slot nodes, but we
392 * rely on spilling to registers to save us here.
393 */
394 return 1;
395 }
396
verify_ready_list(sched_ctx * ctx)397 static void ASSERTED verify_ready_list(sched_ctx *ctx)
398 {
399 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
400 if (!gpir_is_input_node(node)) {
401 assert(node->sched.ready);
402 }
403
404 if (node->sched.ready) {
405 /* Every successor must have been scheduled */
406 gpir_node_foreach_succ(node, dep) {
407 assert(dep->succ->sched.instr);
408 }
409 } else {
410 /* There must be at least one successor that's not scheduled. */
411 bool unscheduled = false;
412 gpir_node_foreach_succ(node, dep) {
413 unscheduled |= !(dep->succ->sched.instr);
414 }
415
416 assert(unscheduled);
417 }
418 }
419 }
420
schedule_insert_ready_list(sched_ctx * ctx,gpir_node * insert_node)421 static void schedule_insert_ready_list(sched_ctx *ctx,
422 gpir_node *insert_node)
423 {
424 /* if this node is fully ready or partially ready
425 * fully ready: all successors have been scheduled
426 * partially ready: part of input successors have been scheduled
427 *
428 * either fully ready or partially ready node need be inserted to
429 * the ready list, but we only schedule a move node for partially
430 * ready node.
431 */
432 bool ready = true, insert = false;
433 gpir_node_foreach_succ(insert_node, dep) {
434 gpir_node *succ = dep->succ;
435 if (succ->sched.instr) {
436 if (dep->type == GPIR_DEP_INPUT)
437 insert = true;
438 }
439 else
440 ready = false;
441 }
442
443 insert_node->sched.ready = ready;
444 /* for root node */
445 insert |= ready;
446
447 if (!insert || insert_node->sched.inserted)
448 return;
449
450 struct list_head *insert_pos = &ctx->ready_list;
451 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
452 if ((insert_node->sched.dist > node->sched.dist ||
453 gpir_op_infos[insert_node->op].schedule_first) &&
454 !gpir_op_infos[node->op].schedule_first) {
455 insert_pos = &node->list;
456 break;
457 }
458 }
459
460 list_addtail(&insert_node->list, insert_pos);
461 insert_node->sched.inserted = true;
462 ctx->ready_list_slots += gpir_get_slots_required(insert_node);
463 }
464
gpir_get_max_start(gpir_node * node)465 static int gpir_get_max_start(gpir_node *node)
466 {
467 int max_start = 0;
468
469 /* find the max start instr constrainted by all successors */
470 gpir_node_foreach_succ(node, dep) {
471 gpir_node *succ = dep->succ;
472 if (!succ->sched.instr)
473 continue;
474
475 int start = succ->sched.instr->index + gpir_get_min_dist(dep);
476 if (start > max_start)
477 max_start = start;
478 }
479
480 return max_start;
481 }
482
gpir_get_min_end(gpir_node * node)483 static int gpir_get_min_end(gpir_node *node)
484 {
485 int min_end = INT_MAX;
486
487 /* find the min end instr constrainted by all successors */
488 gpir_node_foreach_succ(node, dep) {
489 gpir_node *succ = dep->succ;
490 if (!succ->sched.instr)
491 continue;
492
493 int end = succ->sched.instr->index + gpir_get_max_dist(dep);
494 if (end < min_end)
495 min_end = end;
496 }
497
498 return min_end;
499 }
500
gpir_sched_instr_has_load(gpir_instr * instr,gpir_node * node)501 static gpir_node *gpir_sched_instr_has_load(gpir_instr *instr, gpir_node *node)
502 {
503 gpir_load_node *load = gpir_node_to_load(node);
504
505 for (int i = GPIR_INSTR_SLOT_REG0_LOAD0; i <= GPIR_INSTR_SLOT_MEM_LOAD3; i++) {
506 if (!instr->slots[i])
507 continue;
508
509 gpir_load_node *iload = gpir_node_to_load(instr->slots[i]);
510 if (load->node.op == iload->node.op &&
511 load->index == iload->index &&
512 load->component == iload->component)
513 return &iload->node;
514 }
515 return NULL;
516 }
517
518 /* Simply place the node into the given instruction without trying to deal
519 * with liveness or the ready list. This will only fail if the instruction
520 * cannot be placed due to a lack of available slots. In addition to normal
521 * node placement, this is also used for placing loads when spilling to
522 * registers.
523 */
_try_place_node(sched_ctx * ctx,gpir_instr * instr,gpir_node * node)524 static bool _try_place_node(sched_ctx *ctx, gpir_instr *instr, gpir_node *node)
525 {
526 if (node->type == gpir_node_type_load) {
527 gpir_node *load = gpir_sched_instr_has_load(instr, node);
528 if (load) {
529 /* This node may have a store as a successor, in which case we have to
530 * fail it exactly like below in order to later create a move node in
531 * between.
532 */
533 if (instr->index < gpir_get_max_start(node))
534 return false;
535
536 gpir_debug("same load %d in instr %d for node %d\n",
537 load->index, instr->index, node->index);
538
539 /* not really merge two node, just fake scheduled same place */
540 node->sched.instr = load->sched.instr;
541 node->sched.pos = load->sched.pos;
542 return true;
543 }
544 }
545
546 if (node->op == gpir_op_store_reg) {
547 /* This register may be loaded in the next basic block, in which case
548 * there still needs to be a 2 instruction gap. We do what the blob
549 * seems to do and simply disable stores in the last two instructions of
550 * the basic block.
551 *
552 * TODO: We may be able to do better than this, but we have to check
553 * first if storing a register works across branches.
554 */
555 if (instr->index < 2)
556 return false;
557 }
558
559 node->sched.instr = instr;
560
561 int max_node_spill_needed = INT_MAX;
562 int total_spill_needed = INT_MAX;
563 int *slots = gpir_op_infos[node->op].slots;
564 for (int i = 0; slots[i] != GPIR_INSTR_SLOT_END; i++) {
565 node->sched.pos = slots[i];
566 if (instr->index >= gpir_get_max_start(node) &&
567 instr->index <= gpir_get_min_end(node) &&
568 gpir_instr_try_insert_node(instr, node))
569 return true;
570 if (ctx->instr->non_cplx_slot_difference ||
571 ctx->instr->slot_difference) {
572 /* If one of these fields is non-zero, then we could insert the node
573 * here after spilling. To get an accurate count of how many nodes we
574 * need to spill, we need to choose one of the positions where there
575 * were nonzero slot differences, preferably one with the smallest
576 * difference (so we don't have to spill as much).
577 */
578 if (ctx->instr->non_cplx_slot_difference < max_node_spill_needed ||
579 ctx->instr->slot_difference < total_spill_needed) {
580 max_node_spill_needed = ctx->instr->non_cplx_slot_difference;
581 total_spill_needed = ctx->instr->slot_difference;
582 }
583 }
584 }
585
586 if (max_node_spill_needed != INT_MAX) {
587 /* Indicate how many spill nodes are needed. */
588 ctx->max_node_spill_needed = MAX2(ctx->max_node_spill_needed,
589 max_node_spill_needed);
590 ctx->total_spill_needed = MAX2(ctx->total_spill_needed,
591 total_spill_needed);
592 }
593 node->sched.instr = NULL;
594 node->sched.pos = -1;
595 return false;
596 }
597
598 /* Try to place just the node given, updating the ready list. If "speculative"
599 * is true, then this is part ofthe pre-commit phase. If false, then we have
600 * committed to placing this node, so update liveness and ready list
601 * information.
602 */
603
schedule_try_place_node(sched_ctx * ctx,gpir_node * node,bool speculative)604 static bool schedule_try_place_node(sched_ctx *ctx, gpir_node *node,
605 bool speculative)
606 {
607 if (!_try_place_node(ctx, ctx->instr, node)) {
608 if (!speculative)
609 gpir_debug("failed to place %d\n", node->index);
610 return false;
611 }
612
613 ctx->ready_list_slots -= gpir_get_slots_required(node);
614
615 if (!speculative) {
616 gpir_debug("placed node %d\n", node->index);
617
618 /* We assume here that writes are placed before reads. If this changes,
619 * then this needs to be updated.
620 */
621 if (node->op == gpir_op_store_reg) {
622 gpir_store_node *store = gpir_node_to_store(node);
623 ctx->live_physregs &=
624 ~(1ull << (4 * store->index + store->component));
625 if (store->child->sched.physreg_store == store)
626 store->child->sched.physreg_store = NULL;
627 }
628
629 if (node->op == gpir_op_load_reg) {
630 gpir_load_node *load = gpir_node_to_load(node);
631 ctx->live_physregs |=
632 (1ull << (4 * load->index + load->component));
633 }
634
635 list_del(&node->list);
636 list_add(&node->list, &ctx->block->node_list);
637 gpir_node_foreach_pred(node, dep) {
638 gpir_node *pred = dep->pred;
639 schedule_insert_ready_list(ctx, pred);
640 }
641 } else {
642 gpir_node_foreach_pred(node, dep) {
643 gpir_node *pred = dep->pred;
644 if (!pred->sched.inserted && dep->type == GPIR_DEP_INPUT)
645 ctx->ready_list_slots += gpir_get_slots_required(pred);
646 }
647 }
648
649 return true;
650 }
651
652 /* Create a new node with "node" as the child, replace all uses of "node" with
653 * this new node, and replace "node" with it in the ready list.
654 */
create_replacement(sched_ctx * ctx,gpir_node * node,gpir_op op)655 static gpir_node *create_replacement(sched_ctx *ctx, gpir_node *node,
656 gpir_op op)
657 {
658
659 gpir_alu_node *new_node = gpir_node_create(node->block, op);
660 if (unlikely(!new_node))
661 return NULL;
662
663 new_node->children[0] = node;
664 new_node->num_child = 1;
665
666 new_node->node.sched.instr = NULL;
667 new_node->node.sched.pos = -1;
668 new_node->node.sched.dist = node->sched.dist;
669 new_node->node.sched.max_node = node->sched.max_node;
670 new_node->node.sched.next_max_node = node->sched.next_max_node;
671 new_node->node.sched.complex_allowed = node->sched.complex_allowed;
672
673 ctx->ready_list_slots--;
674 list_del(&node->list);
675 node->sched.max_node = false;
676 node->sched.next_max_node = false;
677 node->sched.ready = false;
678 node->sched.inserted = false;
679 gpir_node_replace_succ(&new_node->node, node);
680 gpir_node_add_dep(&new_node->node, node, GPIR_DEP_INPUT);
681 schedule_insert_ready_list(ctx, &new_node->node);
682 return &new_node->node;
683 }
684
create_move(sched_ctx * ctx,gpir_node * node)685 static gpir_node *create_move(sched_ctx *ctx, gpir_node *node)
686 {
687 gpir_node *move = create_replacement(ctx, node, gpir_op_mov);
688 gpir_debug("create move %d for %d\n", move->index, node->index);
689 return move;
690 }
691
create_postlog2(sched_ctx * ctx,gpir_node * node)692 static gpir_node *create_postlog2(sched_ctx *ctx, gpir_node *node)
693 {
694 assert(node->op == gpir_op_complex1);
695 gpir_node *postlog2 = create_replacement(ctx, node, gpir_op_postlog2);
696 gpir_debug("create postlog2 %d for %d\n", postlog2->index, node->index);
697 return postlog2;
698 }
699
700 /* Once we schedule the successor, would the predecessor be fully ready? */
pred_almost_ready(gpir_dep * dep)701 static bool pred_almost_ready(gpir_dep *dep)
702 {
703 bool fully_ready = true;
704 gpir_node_foreach_succ(dep->pred, other_dep) {
705 gpir_node *succ = other_dep->succ;
706 if (!succ->sched.instr && dep->succ != other_dep->succ) {
707 fully_ready = false;
708 break;
709 }
710 }
711
712 return fully_ready;
713 }
714
715 /* Recursively try to schedule a node and all its dependent nodes that can fit
716 * in the same instruction. There is a simple heuristic scoring system to try
717 * to group together nodes that load different components of the same input,
718 * to avoid bottlenecking for operations like matrix multiplies that are
719 * mostly input-bound.
720 */
721
_schedule_try_node(sched_ctx * ctx,gpir_node * node,bool speculative)722 static int _schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative)
723 {
724 if (!schedule_try_place_node(ctx, node, speculative))
725 return INT_MIN;
726
727 int score = 0;
728
729 gpir_node_foreach_pred(node, dep) {
730 if (dep->type != GPIR_DEP_INPUT)
731 continue;
732
733 int pred_score = INT_MIN;
734 if (pred_almost_ready(dep)) {
735 if (dep->pred->type == gpir_node_type_load ||
736 node->type == gpir_node_type_store) {
737 pred_score = _schedule_try_node(ctx, dep->pred, speculative);
738 }
739 }
740 if (dep->pred->type == gpir_node_type_load ||
741 node->type == gpir_node_type_store) {
742 if (pred_score == INT_MIN) {
743 if (node->op == gpir_op_mov) {
744 /* The only moves on the ready list are for loads that we
745 * couldn't schedule immediately, as created below. If we
746 * couldn't schedule the load, there's no point scheduling
747 * the move. The normal move threading logic will ensure
748 * that another move is created if we're about to go too far
749 * from the uses of this move.
750 */
751 assert(speculative);
752 return INT_MIN;
753 } else if (!speculative && dep->pred->type == gpir_node_type_load) {
754 /* We couldn't schedule the load right away, so it will have
755 * to happen in some earlier instruction and then be moved
756 * into a value register and threaded to the use by "node".
757 * We create the move right away, so that later we'll fail
758 * to schedule it if there isn't a slot for a move
759 * available.
760 */
761 create_move(ctx, dep->pred);
762 }
763 /* Penalize nodes whose dependent ops we couldn't schedule.
764 */
765 score--;
766 } else {
767 score += pred_score;
768 continue;
769 }
770 }
771 }
772
773 return score;
774 }
775
776 /* If we speculatively tried a node, undo everything.
777 */
778
schedule_undo_node(sched_ctx * ctx,gpir_node * node)779 static void schedule_undo_node(sched_ctx *ctx, gpir_node *node)
780 {
781 gpir_instr_remove_node(ctx->instr, node);
782
783 gpir_node_foreach_pred(node, dep) {
784 gpir_node *pred = dep->pred;
785 if (pred->sched.instr) {
786 schedule_undo_node(ctx, pred);
787 }
788 }
789 }
790
791 /* Try to schedule a node. We also try to schedule any predecessors that can
792 * be part of the same instruction. If "speculative" is true, then we don't
793 * actually change any state, only returning the score were the node to be
794 * scheduled, with INT_MIN meaning "cannot be scheduled at all".
795 */
schedule_try_node(sched_ctx * ctx,gpir_node * node,bool speculative)796 static int schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative)
797 {
798 int prev_slots = ctx->ready_list_slots;
799
800 int score = _schedule_try_node(ctx, node, speculative);
801
802 if (ctx->ready_list_slots > GPIR_VALUE_REG_NUM) {
803 assert(speculative);
804 ctx->total_spill_needed = MAX2(ctx->total_spill_needed,
805 ctx->ready_list_slots - GPIR_VALUE_REG_NUM);
806 score = INT_MIN;
807 }
808
809 if (speculative) {
810 ctx->ready_list_slots = prev_slots;
811 if (node->sched.instr)
812 schedule_undo_node(ctx, node);
813 }
814
815 return score;
816 }
817
818 /* This is called when we want to spill "node" by inserting loads at its uses.
819 * It returns all the possible registers we can use so that all the loads will
820 * successfully be inserted. Also return the first instruction we'll need to
821 * insert a load for.
822 */
823
get_available_regs(sched_ctx * ctx,gpir_node * node,int * min_index)824 static uint64_t get_available_regs(sched_ctx *ctx, gpir_node *node,
825 int *min_index)
826 {
827 uint64_t available = ~0ull;
828 gpir_node_foreach_succ(node, dep) {
829 if (dep->type != GPIR_DEP_INPUT)
830 continue;
831
832 gpir_node *use = dep->succ;
833 gpir_instr *instr = use->sched.instr;
834
835 if (!instr) {
836 /* This use isn't scheduled, so no need to spill it. */
837 continue;
838 }
839
840 if (use->type == gpir_node_type_store) {
841 /* We're trying to spill something that was recently stored... just
842 * bail out.
843 */
844 return 0;
845 }
846
847 if (use->op == gpir_op_mov && instr == ctx->instr) {
848 /* We try to spill the sources of this move, so we can free up space
849 * in the current instruction.
850 *
851 * TODO: should we go back further? It might let us schedule the
852 * write earlier in some cases, but then we might fail to spill.
853 */
854 available &= get_available_regs(ctx, use, min_index);
855 } else {
856 if (instr->index < *min_index)
857 *min_index = instr->index;
858
859 uint64_t use_available = 0;
860
861 if (instr->reg0_use_count == 0)
862 use_available = ~0ull;
863 else if (!instr->reg0_is_attr)
864 use_available = 0xfull << (4 * instr->reg0_index);
865
866 if (instr->reg1_use_count == 0)
867 use_available = ~0ull;
868 else
869 use_available |= 0xfull << (4 * instr->reg1_index);
870
871 available &= use_available;
872 }
873 }
874
875 return available;
876 }
877
878 /* Using "min_index" returned by get_available_regs(), figure out which
879 * registers are killed by a write after or during the current instruction and
880 * hence we can't use for spilling. Writes that haven't been scheduled yet
881 * should be reflected in live_physregs.
882 */
883
get_killed_regs(sched_ctx * ctx,int min_index)884 static uint64_t get_killed_regs(sched_ctx *ctx, int min_index)
885 {
886 uint64_t killed = 0;
887
888 list_for_each_entry(gpir_instr, instr, &ctx->block->instr_list, list) {
889 if (instr->index <= min_index)
890 break;
891
892 for (int slot = GPIR_INSTR_SLOT_STORE0; slot <= GPIR_INSTR_SLOT_STORE3;
893 slot++) {
894 if (!instr->slots[slot])
895 continue;
896
897 gpir_store_node *store = gpir_node_to_store(instr->slots[slot]);
898 if (store->node.op != gpir_op_store_reg)
899 continue;
900
901 killed |= 1ull << (4 * store->index + store->component);
902 }
903 }
904
905 return killed;
906 }
907
908 /* Actually spill a node so that it is no longer in the ready list. Note that
909 * this must exactly follow the logic of get_available_regs() or else the
910 * loads could fail to schedule.
911 */
912
spill_node(sched_ctx * ctx,gpir_node * node,gpir_store_node * store)913 static void spill_node(sched_ctx *ctx, gpir_node *node, gpir_store_node *store)
914 {
915 gpir_node_foreach_succ_safe(node, dep) {
916 if (dep->type != GPIR_DEP_INPUT)
917 continue;
918
919 gpir_node *use = dep->succ;
920 gpir_instr *instr = use->sched.instr;
921
922 if (!instr)
923 continue;
924
925 if (use->op == gpir_op_mov && instr == ctx->instr) {
926 spill_node(ctx, use, store);
927 } else {
928 gpir_load_node *load = gpir_node_create(ctx->block, gpir_op_load_reg);
929 load->index = store->index;
930 load->component = store->component;
931 list_add(&load->node.list, &ctx->block->node_list);
932 gpir_node_replace_child(dep->succ, dep->pred, &load->node);
933 gpir_node_replace_pred(dep, &load->node);
934 gpir_node_add_dep(&load->node, &store->node, GPIR_DEP_READ_AFTER_WRITE);
935 gpir_debug("spilling use %d of node %d to load node %d\n",
936 use->index, node->index, load->node.index);
937 ASSERTED bool result = _try_place_node(ctx, use->sched.instr, &load->node);
938 assert(result);
939 }
940 }
941
942 if (node->op == gpir_op_mov) {
943 /* We replaced all the uses of the move, so it's dead now. */
944 gpir_instr_remove_node(node->sched.instr, node);
945 gpir_node_delete(node);
946 } else {
947 /* We deleted all the uses of the node except the store, so it's not
948 * live anymore.
949 */
950 list_del(&node->list);
951 node->sched.inserted = false;
952 ctx->ready_list_slots--;
953 if (node->sched.max_node) {
954 node->sched.max_node = false;
955 ctx->instr->alu_num_slot_needed_by_max--;
956 }
957 if (node->sched.next_max_node) {
958 node->sched.next_max_node = false;
959 ctx->instr->alu_num_unscheduled_next_max--;
960 }
961 }
962 }
963
used_by_store(gpir_node * node,gpir_instr * instr)964 static bool used_by_store(gpir_node *node, gpir_instr *instr)
965 {
966 gpir_node_foreach_succ(node, dep) {
967 if (dep->type != GPIR_DEP_INPUT)
968 continue;
969
970 if (dep->succ->type == gpir_node_type_store &&
971 dep->succ->sched.instr == instr)
972 return true;
973 }
974
975 return false;
976 }
977
consuming_postlog2(gpir_node * node)978 static gpir_node *consuming_postlog2(gpir_node *node)
979 {
980 if (node->op != gpir_op_complex1)
981 return NULL;
982
983 gpir_node_foreach_succ(node, dep) {
984 if (dep->type != GPIR_DEP_INPUT)
985 continue;
986 if (dep->succ->op == gpir_op_postlog2)
987 return dep->succ;
988 else
989 return NULL;
990 }
991
992 return NULL;
993 }
994
try_spill_node(sched_ctx * ctx,gpir_node * node)995 static bool try_spill_node(sched_ctx *ctx, gpir_node *node)
996 {
997 assert(node->op != gpir_op_mov);
998
999 if (used_by_store(node, ctx->instr))
1000 return false;
1001
1002 gpir_debug("trying to spill %d\n", node->index);
1003
1004 int min_instr = INT_MAX;
1005 uint64_t available = get_available_regs(ctx, node, &min_instr);
1006 available &= ~get_killed_regs(ctx, min_instr);
1007
1008 if (node->sched.physreg_store) {
1009 gpir_store_node *store = node->sched.physreg_store;
1010 if (!(available & (1ull << (4 * store->index + store->component))))
1011 return false;
1012 } else {
1013 available &= ~ctx->live_physregs;
1014
1015 if (available == 0)
1016 return false;
1017
1018 /* Don't spill complex1 if it's used postlog2, turn the postlog2 into a
1019 * move, replace the complex1 with postlog2 and spill that instead. The
1020 * store needs a move anyways so the postlog2 is usually free.
1021 */
1022 gpir_node *postlog2 = consuming_postlog2(node);
1023 if (postlog2) {
1024 postlog2->op = gpir_op_mov;
1025 node = create_postlog2(ctx, node);
1026 }
1027
1028 /* TODO: use a better heuristic for choosing an available register? */
1029 int physreg = ffsll(available) - 1;
1030
1031 ctx->live_physregs |= (1ull << physreg);
1032
1033 gpir_store_node *store = gpir_node_create(ctx->block, gpir_op_store_reg);
1034 store->index = physreg / 4;
1035 store->component = physreg % 4;
1036 store->child = node;
1037 store->node.sched.max_node = false;
1038 store->node.sched.next_max_node = false;
1039 store->node.sched.complex_allowed = false;
1040 store->node.sched.pos = -1;
1041 store->node.sched.instr = NULL;
1042 store->node.sched.inserted = false;
1043 store->node.sched.dist = node->sched.dist;
1044 if (node->op == gpir_op_complex1) {
1045 /* Complex1 cannot be directly stored, and has a latency of 2 */
1046 store->node.sched.dist += 2;
1047 }
1048 node->sched.physreg_store = store;
1049 gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT);
1050
1051 list_for_each_entry(gpir_load_node, load,
1052 &ctx->physreg_reads[physreg], reg_link) {
1053 gpir_node_add_dep(&store->node, &load->node, GPIR_DEP_WRITE_AFTER_READ);
1054 if (load->node.sched.ready) {
1055 list_del(&load->node.list);
1056 load->node.sched.ready = false;
1057 }
1058 }
1059
1060 node->sched.ready = false;
1061 schedule_insert_ready_list(ctx, &store->node);
1062 }
1063
1064 gpir_debug("spilling %d to $%d.%c, store %d\n", node->index,
1065 node->sched.physreg_store->index,
1066 "xyzw"[node->sched.physreg_store->component],
1067 node->sched.physreg_store->node.index);
1068
1069 spill_node(ctx, node, node->sched.physreg_store);
1070
1071 return true;
1072 }
1073
try_spill_nodes(sched_ctx * ctx,gpir_node * orig_node)1074 static bool try_spill_nodes(sched_ctx *ctx, gpir_node *orig_node)
1075 {
1076 /* First, try to spill max nodes. */
1077 list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
1078 if (ctx->max_node_spill_needed <= 0)
1079 break;
1080
1081 /* orig_node is the node we're trying to schedule, so spilling it makes
1082 * no sense. Also don't try to spill any nodes in front of it, since
1083 * they might be scheduled instead.
1084 */
1085 if (node == orig_node)
1086 break;
1087
1088 if (node->op == gpir_op_mov) {
1089 /* Don't try to spill loads, since that only adds another load and
1090 * store which is likely pointless.
1091 */
1092 continue;
1093 }
1094
1095 if (!gpir_is_input_node(node) || !node->sched.max_node)
1096 continue;
1097
1098 if (try_spill_node(ctx, node)) {
1099 ctx->max_node_spill_needed--;
1100 ctx->total_spill_needed--;
1101 }
1102 }
1103
1104 /* Now, try to spill the remaining nodes. */
1105 list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
1106 if (ctx->total_spill_needed <= 0)
1107 break;
1108
1109 if (node == orig_node)
1110 break;
1111
1112 if (node->op == gpir_op_mov)
1113 continue;
1114
1115 if (!gpir_is_input_node(node) ||
1116 !(node->sched.max_node || node->sched.next_max_node))
1117 continue;
1118
1119 if (try_spill_node(ctx, node))
1120 ctx->total_spill_needed--;
1121 }
1122
1123 return ctx->total_spill_needed <= 0 && ctx->max_node_spill_needed <= 0;
1124 }
1125
gpir_get_curr_ready_list_slots(sched_ctx * ctx)1126 static int ASSERTED gpir_get_curr_ready_list_slots(sched_ctx *ctx)
1127 {
1128 int total = 0;
1129 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1130 total += gpir_get_slots_required(node);
1131 }
1132
1133 return total;
1134 }
1135
1136 /* What gpir_get_min_end() would return if node were replaced with a move
1137 * instruction not in the complex slot. Normally this is 2 + min_end, except
1138 * for some store instructions which must have the move node in the same
1139 * instruction.
1140 */
gpir_get_min_end_as_move(gpir_node * node)1141 static int gpir_get_min_end_as_move(gpir_node *node)
1142 {
1143 int min = INT_MAX;
1144 gpir_node_foreach_succ(node, dep) {
1145 gpir_node *succ = dep->succ;
1146 if (succ->sched.instr && dep->type == GPIR_DEP_INPUT) {
1147 switch (succ->op) {
1148 case gpir_op_store_temp:
1149 case gpir_op_store_reg:
1150 case gpir_op_store_varying:
1151 continue;
1152 default:
1153 break;
1154 }
1155 if (min > succ->sched.instr->index + 2)
1156 min = succ->sched.instr->index + 2;
1157 }
1158 }
1159 return min;
1160 }
1161
1162 /* The second source for add0, add1, mul0, and mul1 units cannot be complex.
1163 * The hardware overwrites the add second sources with 0 and mul second
1164 * sources with 1. This can be a problem if we need to insert more next-max
1165 * moves but we only have values that can't use the complex unit for moves.
1166 *
1167 * Fortunately, we only need to insert a next-max move if there are more than
1168 * 5 next-max nodes, but there are only 4 sources in the previous instruction
1169 * that make values not complex-capable, which means there can be at most 4
1170 * non-complex-capable values. Hence there will always be at least two values
1171 * that can be rewritten to use a move in the complex slot. However, we have
1172 * to be careful not to waste those values by putting both of them in a
1173 * non-complex slot. This is handled for us by gpir_instr, which will reject
1174 * such instructions. We just need to tell it which nodes can use complex, and
1175 * it will do the accounting to figure out what is safe.
1176 */
1177
can_use_complex(gpir_node * node)1178 static bool can_use_complex(gpir_node *node)
1179 {
1180 gpir_node_foreach_succ(node, dep) {
1181 if (dep->type != GPIR_DEP_INPUT)
1182 continue;
1183
1184 gpir_node *succ = dep->succ;
1185 if (succ->type != gpir_node_type_alu ||
1186 !succ->sched.instr)
1187 continue;
1188
1189 /* Note: this must be consistent with gpir_codegen_{mul,add}_slot{0,1}
1190 */
1191 gpir_alu_node *alu = gpir_node_to_alu(succ);
1192 switch (alu->node.op) {
1193 case gpir_op_complex1:
1194 /* complex1 puts its third source in the fourth slot */
1195 if (alu->children[1] == node || alu->children[2] == node)
1196 return false;
1197 break;
1198 case gpir_op_complex2:
1199 /* complex2 has its source duplicated, since it actually takes two
1200 * sources but we only ever use it with both sources the same. Hence
1201 * its source can never be the complex slot.
1202 */
1203 return false;
1204 case gpir_op_select:
1205 /* Select has its sources rearranged */
1206 if (alu->children[0] == node)
1207 return false;
1208 break;
1209 default:
1210 assert(alu->num_child <= 2);
1211 if (alu->num_child == 2 && alu->children[1] == node)
1212 return false;
1213 break;
1214 }
1215 }
1216
1217 return true;
1218 }
1219
1220 /* Initialize node->sched.max_node and node->sched.next_max_node for every
1221 * input node on the ready list. We should only need to do this once per
1222 * instruction, at the beginning, since we never add max nodes to the ready
1223 * list.
1224 */
1225
sched_find_max_nodes(sched_ctx * ctx)1226 static void sched_find_max_nodes(sched_ctx *ctx)
1227 {
1228 ctx->instr->alu_num_unscheduled_next_max = 0;
1229 ctx->instr->alu_num_slot_needed_by_max = 0;
1230
1231 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1232 if (!gpir_is_input_node(node))
1233 continue;
1234
1235 int min_end_move = gpir_get_min_end_as_move(node);
1236 node->sched.max_node = (min_end_move == ctx->instr->index);
1237 node->sched.next_max_node = (min_end_move == ctx->instr->index + 1);
1238 if (node->sched.next_max_node)
1239 node->sched.complex_allowed = can_use_complex(node);
1240
1241 if (node->sched.max_node)
1242 ctx->instr->alu_num_slot_needed_by_max++;
1243 if (node->sched.next_max_node)
1244 ctx->instr->alu_num_unscheduled_next_max++;
1245 }
1246 }
1247
1248 /* Verify the invariants described in gpir.h, as well as making sure the
1249 * counts are correct.
1250 */
verify_max_nodes(sched_ctx * ctx)1251 static void ASSERTED verify_max_nodes(sched_ctx *ctx)
1252 {
1253 int alu_num_slot_needed_by_max = 0;
1254 int alu_num_unscheduled_next_max = 0;
1255 int alu_num_slot_needed_by_store = 0;
1256 int alu_num_slot_needed_by_non_cplx_store = 0;
1257 ASSERTED int alu_max_allowed_next_max = 5;
1258
1259 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1260 if (!gpir_is_input_node(node))
1261 continue;
1262
1263 if (node->sched.max_node)
1264 alu_num_slot_needed_by_max++;
1265 if (node->sched.next_max_node)
1266 alu_num_unscheduled_next_max++;
1267 if (used_by_store(node, ctx->instr)) {
1268 alu_num_slot_needed_by_store++;
1269 if (node->sched.next_max_node && !node->sched.complex_allowed)
1270 alu_num_slot_needed_by_non_cplx_store++;
1271 }
1272 }
1273
1274 if (ctx->instr->slots[GPIR_INSTR_SLOT_MUL0] &&
1275 ctx->instr->slots[GPIR_INSTR_SLOT_MUL0]->op == gpir_op_complex1)
1276 alu_max_allowed_next_max = 4;
1277
1278 assert(ctx->instr->alu_num_slot_needed_by_max == alu_num_slot_needed_by_max);
1279 assert(ctx->instr->alu_num_unscheduled_next_max == alu_num_unscheduled_next_max);
1280 assert(ctx->instr->alu_max_allowed_next_max == alu_max_allowed_next_max);
1281 assert(ctx->instr->alu_num_slot_needed_by_store == alu_num_slot_needed_by_store);
1282 assert(ctx->instr->alu_num_slot_needed_by_non_cplx_store ==
1283 alu_num_slot_needed_by_non_cplx_store);
1284 assert(ctx->instr->alu_num_slot_free >= alu_num_slot_needed_by_store + alu_num_slot_needed_by_max + MAX2(alu_num_unscheduled_next_max - alu_max_allowed_next_max, 0));
1285 assert(ctx->instr->alu_non_cplx_slot_free >= alu_num_slot_needed_by_max + alu_num_slot_needed_by_non_cplx_store);
1286 }
1287
try_node(sched_ctx * ctx)1288 static bool try_node(sched_ctx *ctx)
1289 {
1290 gpir_node *best_node = NULL;
1291 int best_score = INT_MIN;
1292
1293 /* Spilling will delete arbitrary nodes after the current one in the ready
1294 * list, which means that we always need to look up the next node in the
1295 * list at the end of each iteration. While list_for_each_entry() works for
1296 * this purpose, its sanity checking assumes that you don't want to modify
1297 * the list at all. We know better here, so we have to open-code
1298 * list_for_each_entry() without the check in order to not assert.
1299 */
1300 for (gpir_node *node = LIST_ENTRY(gpir_node, ctx->ready_list.next, list);
1301 &node->list != &ctx->ready_list;
1302 node = LIST_ENTRY(gpir_node, node->list.next, list)) {
1303 if (best_score != INT_MIN) {
1304 if (node->sched.dist < best_node->sched.dist)
1305 break;
1306 }
1307
1308 if (node->sched.ready) {
1309 ctx->total_spill_needed = 0;
1310 ctx->max_node_spill_needed = 0;
1311 int score = schedule_try_node(ctx, node, true);
1312 if (score == INT_MIN && !best_node &&
1313 ctx->total_spill_needed > 0 &&
1314 try_spill_nodes(ctx, node)) {
1315 score = schedule_try_node(ctx, node, true);
1316 }
1317
1318 /* schedule_first nodes must be scheduled if possible */
1319 if (gpir_op_infos[node->op].schedule_first && score != INT_MIN) {
1320 best_node = node;
1321 best_score = score;
1322 break;
1323 }
1324
1325 if (score > best_score) {
1326 best_score = score;
1327 best_node = node;
1328 }
1329 }
1330 }
1331
1332 if (best_node) {
1333 gpir_debug("scheduling %d (score = %d)%s\n", best_node->index,
1334 best_score, best_node->sched.max_node ? " (max)" : "");
1335 ASSERTED int score = schedule_try_node(ctx, best_node, false);
1336 assert(score != INT_MIN);
1337 return true;
1338 }
1339
1340 return false;
1341 }
1342
place_move(sched_ctx * ctx,gpir_node * node)1343 static void place_move(sched_ctx *ctx, gpir_node *node)
1344 {
1345 /* For complex1 that is consumed by a postlog2, we cannot allow any moves
1346 * in between. Convert the postlog2 to a move and insert a new postlog2,
1347 * and try to schedule it again in try_node().
1348 */
1349 gpir_node *postlog2 = consuming_postlog2(node);
1350 if (postlog2) {
1351 postlog2->op = gpir_op_mov;
1352 create_postlog2(ctx, node);
1353 return;
1354 }
1355
1356 gpir_node *move = create_move(ctx, node);
1357 gpir_node_foreach_succ_safe(move, dep) {
1358 gpir_node *succ = dep->succ;
1359 if (!succ->sched.instr ||
1360 ctx->instr->index < succ->sched.instr->index + gpir_get_min_dist(dep)) {
1361 gpir_node_replace_pred(dep, node);
1362 if (dep->type == GPIR_DEP_INPUT)
1363 gpir_node_replace_child(succ, move, node);
1364 }
1365 }
1366 ASSERTED int score = schedule_try_node(ctx, move, false);
1367 assert(score != INT_MIN);
1368 }
1369
1370 /* For next-max nodes, not every node can be offloaded to a move in the
1371 * complex slot. If we run out of non-complex slots, then such nodes cannot
1372 * have moves placed for them. There should always be sufficient
1373 * complex-capable nodes so that this isn't a problem. We also disallow moves
1374 * for schedule_first nodes here.
1375 */
can_place_move(sched_ctx * ctx,gpir_node * node)1376 static bool can_place_move(sched_ctx *ctx, gpir_node *node)
1377 {
1378 if (gpir_op_infos[node->op].schedule_first)
1379 return false;
1380
1381 if (!node->sched.next_max_node)
1382 return true;
1383
1384 if (node->sched.complex_allowed)
1385 return true;
1386
1387 return ctx->instr->alu_non_cplx_slot_free > 0;
1388 }
1389
sched_move(sched_ctx * ctx)1390 static bool sched_move(sched_ctx *ctx)
1391 {
1392 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1393 if (node->sched.max_node) {
1394 place_move(ctx, node);
1395 return true;
1396 }
1397 }
1398
1399 if (ctx->instr->alu_num_slot_needed_by_store > 0) {
1400 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1401 if (used_by_store(node, ctx->instr)) {
1402 place_move(ctx, node);
1403 /* If we have a store of a load, then we need to make sure that we
1404 * immediately schedule the dependent load, or create a move
1405 * instruction for it, like we would with a normal instruction.
1406 * The rest of the code isn't set up to handle load nodes in the
1407 * ready list -- see the comments in _schedule_try_node().
1408 */
1409 if (node->type == gpir_node_type_load) {
1410 if (!schedule_try_place_node(ctx, node, false)) {
1411 create_move(ctx, node);
1412 }
1413 }
1414 return true;
1415 }
1416 }
1417 }
1418
1419 /* complex1 is a bit a special case, since it has a latency of 2 cycles.
1420 * Once it is fully ready, we need to group all its uses in the same
1421 * instruction, and then we need to avoid creating any moves in the next
1422 * cycle in order to get it scheduled. Failing to do any of these things
1423 * could result in a cycle penalty, or even worse, an infinite loop of
1424 * inserting moves. If it is a next-max node and ready, then it has a use
1425 * in the previous cycle. If it has a use in the current cycle as well,
1426 * then we want to insert a move node to make it ready in two cycles -- if
1427 * we don't, then there will be at least a one cycle penalty. Otherwise, it
1428 * will be ready next cycle, and we shouldn't insert a move node, or else
1429 * we'll also have a one cycle penalty.
1430 */
1431 if (ctx->instr->alu_num_slot_free > 0) {
1432 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1433 if (!can_place_move(ctx, node))
1434 continue;
1435
1436 if (node->sched.next_max_node && node->op == gpir_op_complex1 &&
1437 node->sched.ready) {
1438 bool skip = true;
1439 gpir_node_foreach_succ(node, dep) {
1440 if (dep->type != GPIR_DEP_INPUT)
1441 continue;
1442
1443 gpir_node *succ = dep->succ;
1444
1445 if (!succ->sched.instr ||
1446 succ->sched.instr->index != ctx->instr->index - 1) {
1447 skip = false;
1448 break;
1449 }
1450 }
1451
1452 if (skip)
1453 continue;
1454
1455 place_move(ctx, node);
1456 return true;
1457 }
1458 }
1459 }
1460
1461 /* Once we've made all the required moves, we're free to use any extra
1462 * slots to schedule more moves for next max nodes. Besides sometimes being
1463 * necessary, this can free up extra space in the next instruction. We walk
1464 * from back to front so that we pick nodes less likely to be scheduled
1465 * next first -- an extra move would be unnecessary there. But make sure
1466 * not to handle the complex1 case handled above.
1467 */
1468 if (ctx->instr->alu_num_slot_free > 0) {
1469 list_for_each_entry_rev(gpir_node, node, &ctx->ready_list, list) {
1470 if (!can_place_move(ctx, node))
1471 continue;
1472
1473 if (node->sched.next_max_node &&
1474 !(node->op == gpir_op_complex1 && node->sched.ready)) {
1475 place_move(ctx, node);
1476 return true;
1477 }
1478 }
1479 }
1480
1481 /* We may have skipped complex1 above, but if we run out of space, we still
1482 * need to insert the move.
1483 */
1484
1485 if (ctx->instr->alu_num_unscheduled_next_max >
1486 ctx->instr->alu_max_allowed_next_max) {
1487 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1488 if (!can_place_move(ctx, node))
1489 continue;
1490
1491 if (node->sched.next_max_node) {
1492 place_move(ctx, node);
1493 return true;
1494 }
1495 }
1496 }
1497
1498
1499 return false;
1500 }
1501
gpir_sched_instr_pass(sched_ctx * ctx)1502 static bool gpir_sched_instr_pass(sched_ctx *ctx)
1503 {
1504 if (try_node(ctx))
1505 return true;
1506
1507 if (sched_move(ctx))
1508 return true;
1509
1510 return false;
1511 }
1512
schedule_print_pre_one_instr(sched_ctx * ctx)1513 static void schedule_print_pre_one_instr(sched_ctx *ctx)
1514 {
1515 if (!(lima_debug & LIMA_DEBUG_GP))
1516 return;
1517
1518 printf("instr %d for ready list:", ctx->instr->index);
1519 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1520 printf(" %d/%c (%d, %d, %s)", node->index, node->sched.ready ? 'r' : 'p',
1521 node->sched.dist, gpir_get_slots_required(node),
1522 node->sched.max_node ? "max" : (node->sched.next_max_node ? "next" : "none"));
1523 }
1524 printf("\nlive physregs: ");
1525 for (unsigned i = 0; i < 16; i++) {
1526 if (ctx->live_physregs & (0xfull << (4 * i))) {
1527 printf("$%d.", i);
1528 for (unsigned j = 0; j < 4; j++) {
1529 if (ctx->live_physregs & (1ull << (4 * i + j)))
1530 printf("%c", "xyzw"[j]);
1531 }
1532 printf(" ");
1533 }
1534 }
1535 printf("\n");
1536 }
1537
schedule_print_post_one_instr(gpir_instr * instr)1538 static void schedule_print_post_one_instr(gpir_instr *instr)
1539 {
1540 if (!(lima_debug & LIMA_DEBUG_GP))
1541 return;
1542
1543 printf("post schedule instr");
1544 for (int i = 0; i < GPIR_INSTR_SLOT_NUM; i++) {
1545 if (instr->slots[i])
1546 printf(" %d/%d", i, instr->slots[i]->index);
1547 }
1548 printf("\n");
1549 }
1550
1551
schedule_one_instr(sched_ctx * ctx)1552 static bool schedule_one_instr(sched_ctx *ctx)
1553 {
1554 gpir_instr *instr = gpir_instr_create(ctx->block);
1555 if (unlikely(!instr))
1556 return false;
1557
1558 ctx->instr = instr;
1559
1560 sched_find_max_nodes(ctx);
1561 schedule_print_pre_one_instr(ctx);
1562
1563 while (gpir_sched_instr_pass(ctx)) {
1564 assert(ctx->ready_list_slots == gpir_get_curr_ready_list_slots(ctx));
1565 #ifndef NDEBUG
1566 verify_max_nodes(ctx);
1567 verify_ready_list(ctx);
1568 #endif
1569 }
1570
1571 schedule_print_post_one_instr(instr);
1572 return true;
1573 }
1574
schedule_block(gpir_block * block)1575 static bool schedule_block(gpir_block *block)
1576 {
1577 /* calculate distance */
1578 list_for_each_entry(gpir_node, node, &block->node_list, list) {
1579 if (gpir_node_is_root(node))
1580 schedule_update_distance(node);
1581 }
1582
1583 sched_ctx ctx;
1584 list_inithead(&ctx.ready_list);
1585 ctx.block = block;
1586 ctx.ready_list_slots = 0;
1587 ctx.live_physregs = block->live_out_phys;
1588
1589 for (unsigned i = 0; i < GPIR_PHYSICAL_REG_NUM; i++) {
1590 list_inithead(&ctx.physreg_reads[i]);
1591 }
1592
1593 /* construct the ready list from root nodes */
1594 list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
1595 /* Add to physreg_reads */
1596 if (node->op == gpir_op_load_reg) {
1597 gpir_load_node *load = gpir_node_to_load(node);
1598 unsigned index = 4 * load->index + load->component;
1599 list_addtail(&load->reg_link, &ctx.physreg_reads[index]);
1600 }
1601
1602 if (gpir_node_is_root(node))
1603 schedule_insert_ready_list(&ctx, node);
1604 }
1605
1606 list_inithead(&block->node_list);
1607 while (!list_is_empty(&ctx.ready_list)) {
1608 if (!schedule_one_instr(&ctx))
1609 return false;
1610 }
1611
1612 return true;
1613 }
1614
add_fake_dep(gpir_node * node,gpir_node * dep_node,gpir_node * last_written[])1615 static void add_fake_dep(gpir_node *node, gpir_node *dep_node,
1616 gpir_node *last_written[])
1617 {
1618 gpir_node_foreach_pred(node, dep) {
1619 if (dep->type == GPIR_DEP_INPUT) {
1620 int index = dep->pred->value_reg;
1621 if (index >= 0 && last_written[index]) {
1622 gpir_node_add_dep(last_written[index], dep_node,
1623 GPIR_DEP_WRITE_AFTER_READ);
1624 }
1625 if (gpir_op_infos[dep->pred->op].schedule_first) {
1626 /* Insert fake dependencies for any schedule_first children on
1627 * this node as well. This guarantees that as soon as
1628 * "dep_node" is ready to schedule, all of its schedule_first
1629 * children, grandchildren, etc. are ready so that they can be
1630 * scheduled as soon as possible.
1631 */
1632 add_fake_dep(dep->pred, dep_node, last_written);
1633 }
1634 }
1635 }
1636 }
1637
schedule_build_dependency(gpir_block * block)1638 static void schedule_build_dependency(gpir_block *block)
1639 {
1640 gpir_node *last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM] = {0};
1641
1642 /* merge dummy_f/m to the node created from */
1643 list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
1644 if (node->op == gpir_op_dummy_m) {
1645 gpir_alu_node *alu = gpir_node_to_alu(node);
1646 gpir_node *origin = alu->children[0];
1647 gpir_node *dummy_f = alu->children[1];
1648
1649 gpir_node_foreach_succ(node, dep) {
1650 gpir_node *succ = dep->succ;
1651 /* origin and node may have same succ (by VREG/INPUT or
1652 * VREG/VREG dep), so use gpir_node_add_dep() instead of
1653 * gpir_node_replace_pred() */
1654 gpir_node_add_dep(succ, origin, dep->type);
1655 gpir_node_replace_child(succ, node, origin);
1656 }
1657 gpir_node_delete(dummy_f);
1658 gpir_node_delete(node);
1659 }
1660 }
1661
1662 memset(last_written, 0, sizeof(last_written));
1663
1664 /* False dependencies. For value registers, these exist only to make sure
1665 * that the maximum pressure isn't exceeded and are hence "fake".
1666 */
1667 list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
1668 if (node->op == gpir_op_load_reg) {
1669 gpir_load_node *load = gpir_node_to_load(node);
1670 unsigned index = 4 * load->index + load->component;
1671 if (last_written[index]) {
1672 gpir_node_add_dep(last_written[index], node, GPIR_DEP_WRITE_AFTER_READ);
1673 }
1674 } else if (node->op == gpir_op_store_reg) {
1675 gpir_store_node *store = gpir_node_to_store(node);
1676 unsigned index = 4 * store->index + store->component;
1677 last_written[index] = node;
1678 } else {
1679 add_fake_dep(node, node, last_written);
1680 }
1681
1682 if (node->value_reg >= 0)
1683 last_written[node->value_reg] = node;
1684 }
1685 }
1686
print_statistic(gpir_compiler * comp,int save_index)1687 static void print_statistic(gpir_compiler *comp, int save_index)
1688 {
1689 int num_nodes[gpir_op_num] = {0};
1690 int num_created_nodes[gpir_op_num] = {0};
1691
1692 list_for_each_entry(gpir_block, block, &comp->block_list, list) {
1693 list_for_each_entry(gpir_node, node, &block->node_list, list) {
1694 num_nodes[node->op]++;
1695 if (node->index >= save_index)
1696 num_created_nodes[node->op]++;
1697 }
1698 }
1699
1700 printf("====== gpir scheduler statistic ======\n");
1701 printf("---- how many nodes are scheduled ----\n");
1702 int n = 0, l = 0;
1703 for (int i = 0; i < gpir_op_num; i++) {
1704 if (num_nodes[i]) {
1705 printf("%10s:%-6d", gpir_op_infos[i].name, num_nodes[i]);
1706 n += num_nodes[i];
1707 if (!(++l % 4))
1708 printf("\n");
1709 }
1710 }
1711 if (l % 4)
1712 printf("\n");
1713 printf("\ntotal: %d\n", n);
1714
1715 printf("---- how many nodes are created ----\n");
1716 n = l = 0;
1717 for (int i = 0; i < gpir_op_num; i++) {
1718 if (num_created_nodes[i]) {
1719 printf("%10s:%-6d", gpir_op_infos[i].name, num_created_nodes[i]);
1720 n += num_created_nodes[i];
1721 if (!(++l % 4))
1722 printf("\n");
1723 }
1724 }
1725 if (l % 4)
1726 printf("\n");
1727 printf("\ntotal: %d\n", n);
1728 printf("------------------------------------\n");
1729 }
1730
gpir_schedule_prog(gpir_compiler * comp)1731 bool gpir_schedule_prog(gpir_compiler *comp)
1732 {
1733 int save_index = comp->cur_index;
1734
1735 /* init schedule info */
1736 int index = 0;
1737 list_for_each_entry(gpir_block, block, &comp->block_list, list) {
1738 block->sched.instr_index = 0;
1739 list_for_each_entry(gpir_node, node, &block->node_list, list) {
1740 node->sched.instr = NULL;
1741 node->sched.pos = -1;
1742 node->sched.index = index++;
1743 node->sched.dist = -1;
1744 /* TODO when we support multiple basic blocks, we need a way to keep
1745 * track of this for physregs allocated before the scheduler.
1746 */
1747 node->sched.physreg_store = NULL;
1748 node->sched.ready = false;
1749 node->sched.inserted = false;
1750 node->sched.complex_allowed = false;
1751 node->sched.max_node = false;
1752 node->sched.next_max_node = false;
1753 }
1754 }
1755
1756 /* build dependency */
1757 list_for_each_entry(gpir_block, block, &comp->block_list, list) {
1758 schedule_build_dependency(block);
1759 }
1760
1761 //gpir_debug("after scheduler build reg dependency\n");
1762 //gpir_node_print_prog_dep(comp);
1763
1764 list_for_each_entry(gpir_block, block, &comp->block_list, list) {
1765 if (!schedule_block(block)) {
1766 gpir_error("fail schedule block\n");
1767 return false;
1768 }
1769 }
1770
1771 if (lima_debug & LIMA_DEBUG_GP) {
1772 print_statistic(comp, save_index);
1773 gpir_instr_print_prog(comp);
1774 }
1775
1776 return true;
1777 }
1778