1 /*
2 * Copyright © 2014 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "util/ralloc.h"
25 #include "util/register_allocate.h"
26 #include "common/v3d_device_info.h"
27 #include "v3d_compiler.h"
28
29 #define ACC_INDEX 0
30 #define ACC_COUNT 6
31
32 /* RA nodes used to track RF registers with implicit writes */
33 #define IMPLICIT_RF_COUNT 1
34
35 #define PHYS_COUNT 64
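/* Register numbering in the RA set (see get_phys_index and vir_init_reg_sets
 * below): on devices with accumulators, indices [ACC_INDEX, ACC_INDEX +
 * ACC_COUNT) hold r0-r5 and the physical register file starts right after
 * them; without accumulators the rf file starts at index 0.
 */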
36
37 static uint8_t
38 get_phys_index(const struct v3d_device_info *devinfo)
39 {
40 if (devinfo->has_accumulators)
41 return ACC_INDEX + ACC_COUNT;
42 else
43 return 0;
44 }
45
46 /* ACC as accumulator */
47 #define CLASS_BITS_PHYS (1 << 0)
48 #define CLASS_BITS_ACC (1 << 1)
49 #define CLASS_BITS_R5 (1 << 4)
50
51 static inline bool
52 stage_has_payload(struct v3d_compile *c)
53 {
54 return c->s->info.stage == MESA_SHADER_FRAGMENT ||
55 c->s->info.stage == MESA_SHADER_COMPUTE;
56 }
57
58 static uint8_t
59 get_class_bit_any(const struct v3d_device_info *devinfo)
60 {
61 if (devinfo->has_accumulators)
62 return (CLASS_BITS_PHYS | CLASS_BITS_ACC | CLASS_BITS_R5);
63 else
64 return CLASS_BITS_PHYS;
65 }
66
67 static uint8_t
68 filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits)
69 {
70 if (!devinfo->has_accumulators) {
71 assert(class_bits & CLASS_BITS_PHYS);
72 class_bits = CLASS_BITS_PHYS;
73 }
74 return class_bits;
75 }
76
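/* RA node numbering: the fixed nodes for the accumulators (or the single
 * implicit-rf node on devices without accumulators) come first, followed by
 * one node per temp. For example, with accumulators temp 0 is node
 * ACC_COUNT (6); without them it is node IMPLICIT_RF_COUNT (1).
 */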
77 static inline uint32_t
78 temp_to_node(struct v3d_compile *c, uint32_t temp)
79 {
80 return temp + (c->devinfo->has_accumulators ? ACC_COUNT :
81 IMPLICIT_RF_COUNT);
82 }
83
84 static inline uint32_t
85 node_to_temp(struct v3d_compile *c, uint32_t node)
86 {
87 assert((c->devinfo->has_accumulators && node >= ACC_COUNT) ||
88 (!c->devinfo->has_accumulators && node >= IMPLICIT_RF_COUNT));
89 return node - (c->devinfo->has_accumulators ? ACC_COUNT :
90 IMPLICIT_RF_COUNT);
91 }
92
93 static inline uint8_t
94 get_temp_class_bits(struct v3d_compile *c,
95 uint32_t temp)
96 {
97 return c->nodes.info[temp_to_node(c, temp)].class_bits;
98 }
99
100 static inline void
101 set_temp_class_bits(struct v3d_compile *c,
102 uint32_t temp, uint8_t class_bits)
103 {
104 c->nodes.info[temp_to_node(c, temp)].class_bits = class_bits;
105 }
106
107 static struct ra_class *
108 choose_reg_class(struct v3d_compile *c, uint8_t class_bits)
109 {
110 if (class_bits == CLASS_BITS_PHYS) {
111 return c->compiler->reg_class_phys[c->thread_index];
112 } else if (class_bits == (CLASS_BITS_R5)) {
113 assert(c->devinfo->has_accumulators);
114 return c->compiler->reg_class_r5[c->thread_index];
115 } else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) {
116 assert(c->devinfo->has_accumulators);
117 return c->compiler->reg_class_phys_or_acc[c->thread_index];
118 } else {
119 assert(class_bits == get_class_bit_any(c->devinfo));
120 return c->compiler->reg_class_any[c->thread_index];
121 }
122 }
123
124 static inline struct ra_class *
125 choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp)
126 {
127 assert(temp < c->num_temps && temp < c->nodes.alloc_count);
128 return choose_reg_class(c, get_temp_class_bits(c, temp));
129 }
130
131 static inline bool
132 qinst_writes_tmu(const struct v3d_device_info *devinfo,
133 struct qinst *inst)
134 {
135 return (inst->dst.file == QFILE_MAGIC &&
136 v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index)) ||
137 inst->qpu.sig.wrtmuc;
138 }
139
140 static bool
141 is_end_of_tmu_sequence(const struct v3d_device_info *devinfo,
142 struct qinst *inst, struct qblock *block)
143 {
144 /* Only tmuwt and ldtmu can finish TMU sequences */
145 bool is_tmuwt = inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
146 inst->qpu.alu.add.op == V3D_QPU_A_TMUWT;
147 bool is_ldtmu = inst->qpu.sig.ldtmu;
148 if (!is_tmuwt && !is_ldtmu)
149 return false;
150
151 /* Check if this is the last tmuwt or ldtmu in the sequence */
152 list_for_each_entry_from(struct qinst, scan_inst, inst->link.next,
153 &block->instructions, link) {
154 is_tmuwt = scan_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
155 scan_inst->qpu.alu.add.op == V3D_QPU_A_TMUWT;
156 is_ldtmu = scan_inst->qpu.sig.ldtmu;
157
158 if (is_tmuwt || is_ldtmu)
159 return false;
160
161 if (qinst_writes_tmu(devinfo, scan_inst))
162 return true;
163 }
164
165 return true;
166 }
167
168 static bool
169 vir_is_mov_uniform(struct v3d_compile *c, int temp)
170 {
171 struct qinst *def = c->defs[temp];
172
173 return def && def->qpu.sig.ldunif;
174 }
175
176 static bool
177 can_reconstruct_inst(struct qinst *inst)
178 {
179 assert(inst);
180
181 if (vir_is_add(inst)) {
182 switch (inst->qpu.alu.add.op) {
183 case V3D_QPU_A_FXCD:
184 case V3D_QPU_A_FYCD:
185 case V3D_QPU_A_XCD:
186 case V3D_QPU_A_YCD:
187 case V3D_QPU_A_IID:
188 case V3D_QPU_A_EIDX:
189 case V3D_QPU_A_TIDX:
190 case V3D_QPU_A_SAMPID:
191 /* No need to check input unpacks because none of these
192 * opcodes read sources. FXCD, FYCD have pack variants.
193 */
194 return inst->qpu.flags.ac == V3D_QPU_COND_NONE &&
195 inst->qpu.flags.auf == V3D_QPU_UF_NONE &&
196 inst->qpu.flags.apf == V3D_QPU_PF_NONE &&
197 inst->qpu.alu.add.output_pack == V3D_QPU_PACK_NONE;
198 default:
199 return false;
200 }
201 }
202
203 return false;
204 }
205
206 static bool
207 can_reconstruct_temp(struct v3d_compile *c, int temp)
208 {
209 struct qinst *def = c->defs[temp];
210 return def && can_reconstruct_inst(def);
211 }
212
213 static struct qreg
214 reconstruct_temp(struct v3d_compile *c, enum v3d_qpu_add_op op)
215 {
216 struct qreg dest;
217 switch (op) {
218 case V3D_QPU_A_FXCD:
219 dest = vir_FXCD(c);
220 break;
221 case V3D_QPU_A_FYCD:
222 dest = vir_FYCD(c);
223 break;
224 case V3D_QPU_A_XCD:
225 dest = vir_XCD(c);
226 break;
227 case V3D_QPU_A_YCD:
228 dest = vir_YCD(c);
229 break;
230 case V3D_QPU_A_IID:
231 dest = vir_IID(c);
232 break;
233 case V3D_QPU_A_EIDX:
234 dest = vir_EIDX(c);
235 break;
236 case V3D_QPU_A_TIDX:
237 dest = vir_TIDX(c);
238 break;
239 case V3D_QPU_A_SAMPID:
240 dest = vir_SAMPID(c);
241 break;
242 default:
243 unreachable("Unexpected opcode for reconstruction");
244 }
245
246 return dest;
247 }
248
249 enum temp_spill_type {
250 SPILL_TYPE_UNIFORM,
251 SPILL_TYPE_RECONSTRUCT,
252 SPILL_TYPE_TMU
253 };
254
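/* Picks the cheapest way to "spill" a temp: uniforms are simply re-loaded
 * with another ldunif, reconstructable temps are re-emitted from their
 * side-effect free defining ALU op, and everything else has to go through a
 * TMU store/load to scratch memory.
 */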
255 static enum temp_spill_type
256 get_spill_type_for_temp(struct v3d_compile *c, int temp)
257 {
258 if (vir_is_mov_uniform(c, temp))
259 return SPILL_TYPE_UNIFORM;
260
261 if (can_reconstruct_temp(c, temp))
262 return SPILL_TYPE_RECONSTRUCT;
263
264 return SPILL_TYPE_TMU;
265 }
266
267 static int
268 v3d_choose_spill_node(struct v3d_compile *c)
269 {
270 assert(c->num_temps > 1);
271
272 const float tmu_scale = 10;
273 float block_scale = 1.0;
274 float spill_costs[c->num_temps];
275 bool in_tmu_operation = false;
276 bool rtop_hazard = false;
277 bool started_last_seg = false;
278
279 for (unsigned i = 0; i < c->num_temps; i++)
280 spill_costs[i] = 0.0;
281
282 /* XXX: Scale the cost up when inside of a loop. */
283 vir_for_each_block(block, c) {
284 vir_for_each_inst(inst, block) {
285 /* RTOP is not preserved across thread switches, so
286 * we can't spill in the middle of multop + umul24.
287 */
288 bool is_multop = false;
289 bool is_umul24 = false;
290 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
291 if (inst->qpu.alu.mul.op == V3D_QPU_M_MULTOP) {
292 is_multop = true;
293 rtop_hazard = true;
294 } else if (inst->qpu.alu.mul.op == V3D_QPU_M_UMUL24) {
295 is_umul24 = true;
296 }
297 }
298
299 /* We can't insert new thread switches after
300 * starting output writes.
301 */
302 bool no_spilling =
303 (c->threads > 1 && started_last_seg) ||
304 (c->max_tmu_spills == 0);
305
306 /* Discourage spilling of TMU operations */
307 for (int i = 0; i < vir_get_nsrc(inst); i++) {
308 if (inst->src[i].file != QFILE_TEMP)
309 continue;
310
311 int temp = inst->src[i].index;
312 enum temp_spill_type spill_type =
313 get_spill_type_for_temp(c, temp);
314
315 if (spill_type != SPILL_TYPE_TMU) {
316 spill_costs[temp] += block_scale;
317 } else if (!no_spilling && (!rtop_hazard || is_multop)) {
318 float tmu_op_scale = in_tmu_operation ?
319 3.0 : 1.0;
320 spill_costs[temp] += (block_scale *
321 tmu_scale *
322 tmu_op_scale);
323 } else {
324 BITSET_CLEAR(c->spillable, temp);
325 }
326 }
327
328 if (inst->dst.file == QFILE_TEMP) {
329 int temp = inst->dst.index;
330 enum temp_spill_type spill_type =
331 get_spill_type_for_temp(c, temp);
332
333 if (spill_type != SPILL_TYPE_TMU) {
334 /* We just rematerialize it later */
335 } else if (!no_spilling && (!rtop_hazard || is_umul24)) {
336 spill_costs[temp] += (block_scale *
337 tmu_scale);
338 } else {
339 BITSET_CLEAR(c->spillable, temp);
340 }
341 }
342
343 /* Refuse to spill a ldvary's dst, because that means
344 * that ldvary's r5 would end up being used across a
345 * thrsw.
346 */
347 if (inst->qpu.sig.ldvary) {
348 assert(inst->dst.file == QFILE_TEMP);
349 BITSET_CLEAR(c->spillable, inst->dst.index);
350 }
351
352 if (inst->is_last_thrsw)
353 started_last_seg = true;
354
355 /* Track when we're in between a TMU setup and the
356 * final LDTMU or TMUWT from that TMU setup. We
357 * penalize spills during that time.
358 */
359 if (is_end_of_tmu_sequence(c->devinfo, inst, block))
360 in_tmu_operation = false;
361
362 if (qinst_writes_tmu(c->devinfo, inst))
363 in_tmu_operation = true;
364
365 if (is_umul24)
366 rtop_hazard = false;
367 }
368 }
369
370 /* We always emit a "last thrsw" to ensure all our spilling occurs
371 * before the last thread section. See vir_emit_last_thrsw.
372 */
373 assert(started_last_seg);
374
375 for (unsigned i = 0; i < c->num_temps; i++) {
376 if (BITSET_TEST(c->spillable, i)) {
377 ra_set_node_spill_cost(c->g, temp_to_node(c, i),
378 spill_costs[i]);
379 }
380 }
381
382 return ra_get_best_spill_node(c->g);
383 }
384
385 static void
386 ensure_nodes(struct v3d_compile *c)
387 {
388 if (c->num_temps < c->nodes.alloc_count)
389 return;
390
391 c->nodes.alloc_count *= 2;
392 c->nodes.info = reralloc_array_size(c,
393 c->nodes.info,
394 sizeof(c->nodes.info[0]),
395 c->nodes.alloc_count +
396 MAX2(ACC_COUNT, IMPLICIT_RF_COUNT));
397 }
398
399 /* Creates the interference node for a new temp. We use this to keep the node
400 * list updated during the spilling process, which generates new temps/nodes.
401 */
402 static int
403 add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
404 {
405 ensure_nodes(c);
406
407 int node = ra_add_node(c->g, choose_reg_class(c, class_bits));
408 assert(c->devinfo->has_accumulators ? node == temp + ACC_COUNT :
409 node == temp + IMPLICIT_RF_COUNT);
410
411 /* We fill the node priority after we are done inserting spills */
412 c->nodes.info[node].class_bits = class_bits;
413 c->nodes.info[node].priority = 0;
414 c->nodes.info[node].try_rf0 = false;
415 c->nodes.info[node].is_program_end = false;
416 c->nodes.info[node].unused = false;
417 c->nodes.info[node].payload_conflict = false;
418
419 return node;
420 }
421
422 /* The spill offset for this thread takes a bit of setup, so do it once at
423 * program start.
424 */
425 void
426 v3d_setup_spill_base(struct v3d_compile *c)
427 {
428 /* Setting up the spill base is done in the entry block, so change
429 * both the current block to emit into and the cursor.
430 */
431 struct qblock *current_block = c->cur_block;
432 c->cur_block = vir_entry_block(c);
433 c->cursor = vir_before_block(c->cur_block);
434
435 int start_num_temps = c->num_temps;
436
437 /* Each thread wants to be in a separate region of the scratch space
438 * so that the QPUs aren't fighting over cache lines. We have the
439 * driver keep a single global spill BO rather than
440 * per-spilling-program BOs, so we need a uniform from the driver for
441 * what the per-thread scale is.
442 */
443 struct qreg thread_offset =
444 vir_UMUL(c,
445 vir_TIDX(c),
446 vir_uniform(c, QUNIFORM_SPILL_SIZE_PER_THREAD, 0));
447
448 /* Each channel in a reg is 4 bytes, so scale them up by that. */
449 struct qreg element_offset = vir_SHL(c, vir_EIDX(c),
450 vir_uniform_ui(c, 2));
451
452 c->spill_base = vir_ADD(c,
453 vir_ADD(c, thread_offset, element_offset),
454 vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0));
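/* The per-channel address computed above is effectively:
 *
 *   spill_base = tidx * spill_size_per_thread + eidx * 4 + spill_offset
 *
 * so every thread gets its own region of the scratch BO and every channel a
 * distinct 32-bit word within it.
 */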
455
456 /* Make sure that we don't spill the spilling setup instructions. */
457 for (int i = start_num_temps; i < c->num_temps; i++) {
458 BITSET_CLEAR(c->spillable, i);
459
460 /* If we are spilling, update the RA map with the temps added
461 * by the spill setup. Our spill_base register can never be an
462 * accumulator because it is used for TMU spill/fill and thus
463 * needs to persist across thread switches.
464 */
465 if (c->spilling) {
466 int temp_class = CLASS_BITS_PHYS;
467 if (c->devinfo->has_accumulators &&
468 i != c->spill_base.index) {
469 temp_class |= CLASS_BITS_ACC;
470 }
471 int node = add_node(c, i, temp_class);
472 c->nodes.info[node].payload_conflict =
473 stage_has_payload(c);
474 }
475 }
476
477 /* Restore the current block. */
478 c->cur_block = current_block;
479 c->cursor = vir_after_block(c->cur_block);
480 }
481
482 /**
483 * Computes the address for a spill/fill sequence and completes the spill/fill
484 * sequence by emitting the following code:
485 *
486 * ldunif.spill_offset
487 * add tmua spill_base spill_offset
488 * thrsw
489 *
490 * If the sequence is for a spill, then it will emit a tmuwt after the thrsw,
491 * otherwise it will emit an ldtmu to load the fill result into 'fill_dst'.
492 *
493 * The parameter 'ip' represents the ip at which the spill/fill is happening.
494 * This is used to disallow accumulators on temps that cross this ip boundary
495 * due to the new thrsw introduced in the sequence above.
496 */
497 static void
498 v3d_emit_spill_tmua(struct v3d_compile *c,
499 uint32_t spill_offset,
500 enum v3d_qpu_cond cond,
501 int32_t ip,
502 struct qreg *fill_dst)
503 {
504 assert(ip >= 0);
505
506 /* Load a uniform with the spill offset and add it to the spill base
507 * to obtain the TMUA address. It can be of class ANY because we know
508 * we are consuming it immediately without thrsw in between.
509 */
510 assert(c->disable_ldunif_opt);
511 struct qreg offset = vir_uniform_ui(c, spill_offset);
512 add_node(c, offset.index, get_class_bit_any(c->devinfo));
513
514 /* We always enable per-quad on spills/fills to ensure we spill
515 * any channels involved with helper invocations, but only if
516 * the spill is not conditional, since otherwise we may be spilling
517 * invalid lanes and overwriting valid data from a previous spill
518 * to the same address.
519 */
520 struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
521 struct qinst *inst = vir_ADD_dest(c, tmua, c->spill_base, offset);
522 inst->qpu.flags.ac = cond;
523 inst->ldtmu_count = 1;
524 inst->uniform =
525 vir_get_uniform_index(c, QUNIFORM_CONSTANT,
526 cond != V3D_QPU_COND_NONE ?
527 0xffffffff : 0xffffff7f /* per-quad */);
528
529 vir_emit_thrsw(c);
530
531 /* If this is for a spill, emit a TMUWT, otherwise an LDTMU to load the
532 * result of the fill. The TMUWT temp is not really read, the ldtmu
533 * temp will be used immediately so just like the uniform above we
534 * can allow accumulators.
535 */
536 int temp_class =
537 filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC);
538 if (!fill_dst) {
539 struct qreg dst = vir_TMUWT(c);
540 assert(dst.file == QFILE_TEMP);
541 add_node(c, dst.index, temp_class);
542 } else {
543 *fill_dst = vir_LDTMU(c);
544 assert(fill_dst->file == QFILE_TEMP);
545 add_node(c, fill_dst->index, temp_class);
546 }
547
548 /* Temps across the thread switch we injected can't be assigned to
549 * accumulators.
550 *
551 * Fills inject code before ip, so anything that starts at ip or later
552 * is not affected by the thrsw. Something that ends at ip will be
553 * affected though.
554 *
555 * Spills inject code after ip, so anything that starts strictly later
556 * than ip is not affected (the temp starting at ip is usually the
557 * spilled temp except for postponed spills). Something that ends at ip
558 * won't be affected either.
559 */
560 for (int i = 0; i < c->spill_start_num_temps; i++) {
561 bool thrsw_cross = fill_dst ?
562 c->temp_start[i] < ip && c->temp_end[i] >= ip :
563 c->temp_start[i] <= ip && c->temp_end[i] > ip;
564 if (thrsw_cross) {
565 ra_set_node_class(c->g, temp_to_node(c, i),
566 choose_reg_class(c, CLASS_BITS_PHYS));
567 }
568 }
569 }
570
571 static void
572 v3d_emit_tmu_spill(struct v3d_compile *c,
573 struct qinst *inst,
574 struct qreg spill_temp,
575 struct qinst *position,
576 uint32_t ip,
577 uint32_t spill_offset)
578 {
579 assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
580 assert(inst->dst.file == QFILE_TEMP);
581
582 c->cursor = vir_after_inst(position);
583
584 enum v3d_qpu_cond cond = vir_get_cond(inst);
585
586 /* If inst and position don't match, this is a postponed spill,
587 * in which case we have already allocated the temp for the spill
588 * and we should use that, otherwise create a new temp with the
589 * same register class bits as the original.
590 */
591 if (inst == position) {
592 uint8_t class_bits = get_temp_class_bits(c, inst->dst.index);
593 inst->dst = vir_get_temp(c);
594 add_node(c, inst->dst.index, class_bits);
595 } else {
596 inst->dst = spill_temp;
597
598 /* If this is a postponed spill, the register being spilled may
599 * have been written more than once, including conditional
600 * writes, so ignore predication on the spill instruction and
601 * always spill the full register.
602 */
603 cond = V3D_QPU_COND_NONE;
604 }
605
606 struct qinst *tmp =
607 vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
608 inst->dst);
609 tmp->qpu.flags.mc = cond;
610
611 v3d_emit_spill_tmua(c, spill_offset, cond, ip, NULL);
612
613 c->spills++;
614 c->tmu_dirty_rcl = true;
615 }
616
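/* Returns whether two live ranges overlap. Ranges that only touch at an
 * endpoint do not interfere (e.g. [0, 4) and [4, 8)), while any real overlap
 * does (e.g. [0, 5) and [4, 8)).
 */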
617 static inline bool
618 interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end)
619 {
620 return !(t0_start >= t1_end || t1_start >= t0_end);
621 }
622
623 static void
624 v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int *implicit_rf_nodes,
625 int spill_temp)
626 {
627 c->spill_start_num_temps = c->num_temps;
628 c->spilling = true;
629
630 enum temp_spill_type spill_type = get_spill_type_for_temp(c, spill_temp);
631
632 uint32_t spill_offset = 0;
633 if (spill_type == SPILL_TYPE_TMU) {
634 spill_offset = c->spill_size;
635 c->spill_size += V3D_CHANNELS * sizeof(uint32_t);
636
637 if (spill_offset == 0) {
638 v3d_setup_spill_base(c);
639
640 /* Don't allocate our spill base to rf0 to avoid
641 * conflicts with instructions doing implicit writes
642 * to that register.
643 */
644 if (!c->devinfo->has_accumulators) {
645 ra_add_node_interference(
646 c->g,
647 temp_to_node(c, c->spill_base.index),
648 implicit_rf_nodes[0]);
649 }
650 }
651 }
652
653 struct qinst *last_thrsw = c->last_thrsw;
654 assert(last_thrsw && last_thrsw->is_last_thrsw);
655
656 int uniform_index = ~0;
657 if (spill_type == SPILL_TYPE_UNIFORM) {
658 struct qinst *orig_unif = c->defs[spill_temp];
659 uniform_index = orig_unif->uniform;
660 }
661
662 enum v3d_qpu_add_op reconstruct_op = V3D_QPU_A_NOP;
663 if (spill_type == SPILL_TYPE_RECONSTRUCT) {
664 struct qinst *orig_def = c->defs[spill_temp];
665 assert(vir_is_add(orig_def));
666 reconstruct_op = orig_def->qpu.alu.add.op;
667 }
668
669 uint32_t spill_node = temp_to_node(c, spill_temp);
670
671 /* We must disable the ldunif optimization if we are spilling uniforms */
672 bool had_disable_ldunif_opt = c->disable_ldunif_opt;
673 c->disable_ldunif_opt = true;
674
675 struct qinst *start_of_tmu_sequence = NULL;
676 struct qinst *postponed_spill = NULL;
677 struct qreg postponed_spill_temp = { 0 };
678 vir_for_each_block(block, c) {
679 vir_for_each_inst_safe(inst, block) {
680 int32_t ip = inst->ip;
681
682 /* Track when we're in between a TMU setup and the final
683 * LDTMU or TMUWT from that TMU setup. We can't spill/fill any
684 * temps during that time, because that involves inserting a
685 * new TMU setup/LDTMU sequence, so we postpone the spill or
686 * move the fill up to not intrude in the middle of the TMU
687 * sequence.
688 */
689 if (is_end_of_tmu_sequence(c->devinfo, inst, block)) {
690 if (postponed_spill) {
691 v3d_emit_tmu_spill(c, postponed_spill,
692 postponed_spill_temp,
693 inst, ip, spill_offset);
694 }
695
696 start_of_tmu_sequence = NULL;
697 postponed_spill = NULL;
698 }
699
700 if (!start_of_tmu_sequence &&
701 qinst_writes_tmu(c->devinfo, inst)) {
702 start_of_tmu_sequence = inst;
703 }
704
705 /* fills */
706 int filled_src = -1;
707 for (int i = 0; i < vir_get_nsrc(inst); i++) {
708 if (inst->src[i].file != QFILE_TEMP ||
709 inst->src[i].index != spill_temp) {
710 continue;
711 }
712
713 if (filled_src >= 0) {
714 inst->src[i] = inst->src[filled_src];
715 continue;
716 }
717
718 c->cursor = vir_before_inst(inst);
719
720 if (spill_type == SPILL_TYPE_UNIFORM) {
721 struct qreg unif =
722 vir_uniform(c,
723 c->uniform_contents[uniform_index],
724 c->uniform_data[uniform_index]);
725 inst->src[i] = unif;
726 /* We are using the uniform in the
727 * instruction immediately after, so
728 * we can use any register class for it.
729 */
730 add_node(c, unif.index,
731 get_class_bit_any(c->devinfo));
732 } else if (spill_type == SPILL_TYPE_RECONSTRUCT) {
733 struct qreg temp =
734 reconstruct_temp(c, reconstruct_op);
735 inst->src[i] = temp;
736 /* We are using the temp in the
737 * instruction immediately after so we
738 * can use ACC.
739 */
740 int temp_class =
741 filter_class_bits(c->devinfo, CLASS_BITS_PHYS |
742 CLASS_BITS_ACC);
743 add_node(c, temp.index, temp_class);
744 } else {
745 /* If we have a postponed spill, we
746 * don't need a fill as the temp would
747 * not have been spilled yet, however,
748 * we need to update the temp index.
749 */
750 if (postponed_spill) {
751 inst->src[i] =
752 postponed_spill_temp;
753 } else {
754 int32_t fill_ip = ip;
755 if (start_of_tmu_sequence) {
756 c->cursor = vir_before_inst(start_of_tmu_sequence);
757 fill_ip = start_of_tmu_sequence->ip;
758 }
759
760 v3d_emit_spill_tmua(c, spill_offset,
761 V3D_QPU_COND_NONE,
762 fill_ip, &inst->src[i]);
763 c->fills++;
764 }
765 }
766
767 filled_src = i;
768 }
769
770 /* spills */
771 if (inst->dst.file == QFILE_TEMP &&
772 inst->dst.index == spill_temp) {
773 if (spill_type != SPILL_TYPE_TMU) {
774 c->cursor.link = NULL;
775 vir_remove_instruction(c, inst);
776 } else {
777 /* If we are in the middle of a TMU
778 * sequence, we postpone the actual
779 * spill until we have finished it. We
780 * still need to replace the spill temp
781 * with a new temp, though.
782 */
783 if (start_of_tmu_sequence) {
784 if (postponed_spill) {
785 postponed_spill->dst =
786 postponed_spill_temp;
787 }
788 if (!postponed_spill ||
789 vir_get_cond(inst) == V3D_QPU_COND_NONE) {
790 postponed_spill_temp =
791 vir_get_temp(c);
792 add_node(c,
793 postponed_spill_temp.index,
794 c->nodes.info[spill_node].class_bits);
795 }
796 postponed_spill = inst;
797 } else {
798 v3d_emit_tmu_spill(c, inst,
799 postponed_spill_temp,
800 inst, ip,
801 spill_offset);
802 }
803 }
804 }
805 }
806 }
807
808 /* Make sure c->last_thrsw is the actual last thrsw, not just one we
809 * inserted in our most recent unspill.
810 */
811 c->last_thrsw = last_thrsw;
812
813 /* Don't allow spilling of our spilling instructions. There's no way
814 * they can help get things colored.
815 */
816 for (int i = c->spill_start_num_temps; i < c->num_temps; i++)
817 BITSET_CLEAR(c->spillable, i);
818
819 /* Reset interference for spilled node */
820 ra_set_node_spill_cost(c->g, spill_node, 0);
821 ra_reset_node_interference(c->g, spill_node);
822 BITSET_CLEAR(c->spillable, spill_temp);
823
824 /* Rebuild program ips */
825 int32_t ip = 0;
826 vir_for_each_inst_inorder(inst, c)
827 inst->ip = ip++;
828
829 /* Rebuild liveness */
830 vir_calculate_live_intervals(c);
831
832 /* Add interferences for the new spilled temps and update interferences
833 * for c->spill_base (since we may have modified its liveness). Also,
834 * update node priorities based on new liveness data.
835 */
836 uint32_t sb_temp = c->spill_base.index;
837 uint32_t sb_node = temp_to_node(c, sb_temp);
838 for (uint32_t i = 0; i < c->num_temps; i++) {
839 if (c->temp_end[i] == -1)
840 continue;
841
842 uint32_t node_i = temp_to_node(c, i);
843 c->nodes.info[node_i].priority =
844 c->temp_end[i] - c->temp_start[i];
845
846 for (uint32_t j = MAX2(i + 1, c->spill_start_num_temps);
847 j < c->num_temps; j++) {
848 if (interferes(c->temp_start[i], c->temp_end[i],
849 c->temp_start[j], c->temp_end[j])) {
850 uint32_t node_j = temp_to_node(c, j);
851 ra_add_node_interference(c->g, node_i, node_j);
852 }
853 }
854
855 if (spill_type == SPILL_TYPE_TMU) {
856 if (i != sb_temp &&
857 interferes(c->temp_start[i], c->temp_end[i],
858 c->temp_start[sb_temp], c->temp_end[sb_temp])) {
859 ra_add_node_interference(c->g, node_i, sb_node);
860 }
861 }
862 }
863
864 c->disable_ldunif_opt = had_disable_ldunif_opt;
865 c->spilling = false;
866 }
867
868 struct v3d_ra_select_callback_data {
869 uint32_t phys_index;
870 uint32_t next_acc;
871 uint32_t next_phys;
872 struct v3d_ra_node_info *nodes;
873 const struct v3d_device_info *devinfo;
874 };
875
876 /* Choosing accumulators improves chances of merging QPU instructions
877 * because those merges require that the add and mul instructions use
878 * at most 2 rf registers between them.
879 */
880 static bool
881 v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
882 BITSET_WORD *regs,
883 int priority)
884 {
885 if (!v3d_ra->devinfo->has_accumulators)
886 return false;
887
888 * Favor accumulators if we have fewer than this number of physical
889 * registers. Accumulators have more restrictions (like being
890 * invalidated through thrsw), so running out of physical registers
891 * even if we have accumulators available can lead to register
892 * allocation failures.
893 */
894 static const int available_rf_threshold = 5;
895 int available_rf = 0;
896 for (int i = 0; i < PHYS_COUNT; i++) {
897 if (BITSET_TEST(regs, v3d_ra->phys_index + i))
898 available_rf++;
899 if (available_rf >= available_rf_threshold)
900 break;
901 }
902 if (available_rf < available_rf_threshold)
903 return true;
904
905 /* Favor accumulators for short-lived temps (our priority represents
906 * liveness), to prevent long-lived temps from grabbing accumulators
907 * and preventing follow-up instructions from using them, potentially
908 * leading to large portions of the shader being unable to use
909 * accumulators and therefore merge instructions successfully.
910 */
911 static const int priority_threshold = 20;
912 if (priority <= priority_threshold)
913 return true;
914
915 return false;
916 }
917
918 static bool
919 v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
920 BITSET_WORD *regs,
921 unsigned int *out)
922 {
923 if (!v3d_ra->devinfo->has_accumulators)
924 return false;
925
926 /* Choose r5 for our ldunifs if possible (nobody else can load to that
927 * reg, and it keeps the QPU cond field free from being occupied by
928 * ldunifrf).
929 */
930 int r5 = ACC_INDEX + 5;
931 if (BITSET_TEST(regs, r5)) {
932 *out = r5;
933 return true;
934 }
935
936 /* Round-robin through our accumulators to give post-RA instruction
937 * selection more options.
938 */
939 for (int i = 0; i < ACC_COUNT; i++) {
940 int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT;
941 int acc = ACC_INDEX + acc_off;
942
943 if (BITSET_TEST(regs, acc)) {
944 v3d_ra->next_acc = acc_off + 1;
945 *out = acc;
946 return true;
947 }
948 }
949
950 return false;
951 }
952
953 static bool
954 v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
955 unsigned int node,
956 BITSET_WORD *regs,
957 unsigned int *out)
958 {
959 /* If this node is for an unused temp, ignore. */
960 if (v3d_ra->nodes->info[node].unused) {
961 *out = 0;
962 return true;
963 }
964
965 /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst
966 * so we can avoid turning them into ldunifrf (which uses the
967 * cond field to encode the dst and would prevent merge with
968 * instructions that use cond flags).
969 */
970 if (v3d_ra->nodes->info[node].try_rf0 &&
971 BITSET_TEST(regs, v3d_ra->phys_index)) {
972 assert(v3d_ra->devinfo->ver >= 71);
973 *out = v3d_ra->phys_index;
974 return true;
975 }
976
977 /* The last 3 instructions in a shader can't use some specific registers
978 * (usually early rf registers, depending on the V3D version), so try to
979 * avoid allocating those registers to temps used by the last
980 * instructions in the shader. Do the same for spilling setup
981 * instructions that may conflict with payload registers.
982 */
983 const uint32_t safe_rf_start = v3d_ra->devinfo->ver == 42 ? 3 : 4;
984 if ((v3d_ra->nodes->info[node].is_program_end ||
985 v3d_ra->nodes->info[node].payload_conflict) &&
986 v3d_ra->next_phys < safe_rf_start) {
987 v3d_ra->next_phys = safe_rf_start;
988 }
989
990 for (int i = 0; i < PHYS_COUNT; i++) {
991 int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
992
993 /* Try to keep rf0 available for ldunif in 7.x (see above). */
994 if (v3d_ra->devinfo->ver >= 71 && phys_off == 0)
995 continue;
996
997 int phys = v3d_ra->phys_index + phys_off;
998
999 if (BITSET_TEST(regs, phys)) {
1000 v3d_ra->next_phys = phys_off + 1;
1001 *out = phys;
1002 return true;
1003 }
1004 }
1005
1006 /* If we couldn't allocate, do try to assign rf0 if it is available. */
1007 if (v3d_ra->devinfo->ver >= 71 &&
1008 BITSET_TEST(regs, v3d_ra->phys_index)) {
1009 v3d_ra->next_phys = 1;
1010 *out = v3d_ra->phys_index;
1011 return true;
1012 }
1013
1014 return false;
1015 }
1016
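/* Register selection callback: favor an accumulator for short-lived temps
 * (or when we are running low on rf registers), otherwise round-robin
 * through the physical register file, and finally fall back to an
 * accumulator if no rf register is available.
 */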
1017 static unsigned int
1018 v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
1019 {
1020 struct v3d_ra_select_callback_data *v3d_ra = data;
1021
1022 unsigned int reg;
1023 if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->nodes->info[n].priority) &&
1024 v3d_ra_select_accum(v3d_ra, regs, ®)) {
1025 return reg;
1026 }
1027
1028 if (v3d_ra_select_rf(v3d_ra, n, regs, ®))
1029 return reg;
1030
1031 /* If we ran out of physical registers try to assign an accumulator
1032 * if we didn't favor that option earlier.
1033 */
1034 if (v3d_ra_select_accum(v3d_ra, regs, ®))
1035 return reg;
1036
1037 unreachable("RA must pass us at least one possible reg.");
1038 }
1039
1040 bool
1041 vir_init_reg_sets(struct v3d_compiler *compiler)
1042 {
1043 /* Allocate up to 3 regfile classes, for the ways the physical
1044 * register file can be divided up for fragment shader threading.
1045 */
1046 int max_thread_index = 2;
1047 uint8_t phys_index = get_phys_index(compiler->devinfo);
1048
1049 compiler->regs = ra_alloc_reg_set(compiler, phys_index + PHYS_COUNT,
1050 false);
1051 if (!compiler->regs)
1052 return false;
1053
1054 for (int threads = 0; threads < max_thread_index; threads++) {
1055 compiler->reg_class_any[threads] =
1056 ra_alloc_contig_reg_class(compiler->regs, 1);
1057 if (compiler->devinfo->has_accumulators) {
1058 compiler->reg_class_r5[threads] =
1059 ra_alloc_contig_reg_class(compiler->regs, 1);
1060 compiler->reg_class_phys_or_acc[threads] =
1061 ra_alloc_contig_reg_class(compiler->regs, 1);
1062 }
1063 compiler->reg_class_phys[threads] =
1064 ra_alloc_contig_reg_class(compiler->regs, 1);
1065
1066 /* Init physical regs */
1067 for (int i = phys_index;
1068 i < phys_index + (PHYS_COUNT >> threads); i++) {
1069 if (compiler->devinfo->has_accumulators)
1070 ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
1071 ra_class_add_reg(compiler->reg_class_phys[threads], i);
1072 ra_class_add_reg(compiler->reg_class_any[threads], i);
1073 }
1074
1075 /* Init accumulator regs */
1076 if (compiler->devinfo->has_accumulators) {
1077 for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
1078 ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
1079 ra_class_add_reg(compiler->reg_class_any[threads], i);
1080 }
1081 /* r5 can only store a single 32-bit value, so not much can
1082 * use it.
1083 */
1084 ra_class_add_reg(compiler->reg_class_r5[threads],
1085 ACC_INDEX + 5);
1086 ra_class_add_reg(compiler->reg_class_any[threads],
1087 ACC_INDEX + 5);
1088 }
1089 }
1090
1091 ra_set_finalize(compiler->regs, NULL);
1092
1093 return true;
1094 }
1095
1096 static inline bool
1097 tmu_spilling_allowed(struct v3d_compile *c)
1098 {
1099 return c->spills + c->fills < c->max_tmu_spills;
1100 }
1101
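/* Returns whether 'reg' is one of the registers treated as holding the
 * thread payload: rf1-rf3 (fragment) or rf2-rf3 (compute) on V3D 7.x, and
 * rf0-rf2 (fragment) or rf0/rf2 (compute) on V3D 4.2.
 */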
1102 static bool
1103 reg_is_payload(struct v3d_compile *c, struct qreg reg)
1104 {
1105 if (reg.file != QFILE_REG)
1106 return false;
1107
1108 if (c->devinfo->ver >= 71) {
1109 if (c->s->info.stage == MESA_SHADER_FRAGMENT)
1110 return reg.index >= 1 && reg.index <= 3;
1111 if (c->s->info.stage == MESA_SHADER_COMPUTE)
1112 return reg.index == 2 || reg.index == 3;
1113 return false;
1114 }
1115
1116 assert(c->devinfo->ver == 42);
1117 if (c->s->info.stage == MESA_SHADER_FRAGMENT)
1118 return reg.index <= 2;
1119 if (c->s->info.stage == MESA_SHADER_COMPUTE)
1120 return reg.index == 0 || reg.index == 2;
1121 return false;
1122 }
1123
1124 static bool
1125 inst_reads_payload(struct v3d_compile *c, struct qinst *inst)
1126 {
1127 if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
1128 return false;
1129
1130 if (reg_is_payload(c, inst->dst))
1131 return true;
1132
1133 if (reg_is_payload(c, inst->src[0]))
1134 return true;
1135
1136 if (vir_get_nsrc(inst) > 1 && reg_is_payload(c, inst->src[1]))
1137 return true;
1138
1139 return false;
1140 }
1141
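/* Walks a single instruction adding the interferences and register class
 * restrictions it implies: implicit r4/rf0 writes, SFU and LDVPM results
 * that must go to the phys file, payload setup moves pinned to their source
 * register, rf0 handling around ldvary/ldunif, and dropping accumulators for
 * temps that live across a thread switch.
 */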
1142 static void
1143 update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
1144 int *acc_nodes,
1145 int *implicit_rf_nodes,
1146 int last_ldvary_ip,
1147 bool has_payload,
1148 struct qinst *inst)
1149 {
1150 int32_t ip = inst->ip;
1151 assert(ip >= 0);
1152
1153 /* If the instruction writes r4 (and optionally moves its
1154 * result to a temp), nothing else can be stored in r4 across
1155 * it.
1156 */
1157 if (vir_writes_r4_implicitly(c->devinfo, inst)) {
1158 for (int i = 0; i < c->num_temps; i++) {
1159 if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
1160 ra_add_node_interference(c->g,
1161 temp_to_node(c, i),
1162 acc_nodes[4]);
1163 }
1164 }
1165 }
1166
1167 /* If any instruction writes to a physical register implicitly,
1168 * nothing else can be stored in that register across it.
1169 */
1170 if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) {
1171 for (int i = 0; i < c->num_temps; i++) {
1172 if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
1173 ra_add_node_interference(c->g,
1174 temp_to_node(c, i),
1175 implicit_rf_nodes[0]);
1176 }
1177 }
1178 }
1179
1180 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
1181 switch (inst->qpu.alu.add.op) {
1182 case V3D_QPU_A_LDVPMV_IN:
1183 case V3D_QPU_A_LDVPMV_OUT:
1184 case V3D_QPU_A_LDVPMD_IN:
1185 case V3D_QPU_A_LDVPMD_OUT:
1186 case V3D_QPU_A_LDVPMP:
1187 case V3D_QPU_A_LDVPMG_IN:
1188 case V3D_QPU_A_LDVPMG_OUT: {
1189 /* LDVPMs only store to temps (the MA flag
1190 * decides whether the LDVPM is in or out)
1191 */
1192 assert(inst->dst.file == QFILE_TEMP);
1193 set_temp_class_bits(c, inst->dst.index,
1194 CLASS_BITS_PHYS);
1195 break;
1196 }
1197
1198 case V3D_QPU_A_RECIP:
1199 case V3D_QPU_A_RSQRT:
1200 case V3D_QPU_A_EXP:
1201 case V3D_QPU_A_LOG:
1202 case V3D_QPU_A_SIN:
1203 case V3D_QPU_A_RSQRT2: {
1204 /* The SFU instructions write directly to the
1205 * phys regfile.
1206 */
1207 assert(inst->dst.file == QFILE_TEMP);
1208 set_temp_class_bits(c, inst->dst.index,
1209 CLASS_BITS_PHYS);
1210 break;
1211 }
1212
1213 default:
1214 break;
1215 }
1216 }
1217
1218 if (inst->src[0].file == QFILE_REG) {
1219 switch (inst->src[0].index) {
1220 case 0:
1221 /* V3D 7.x doesn't use rf0 for thread payload */
1222 if (c->devinfo->ver >= 71)
1223 break;
1224 else
1225 FALLTHROUGH;
1226 case 1:
1227 case 2:
1228 case 3: {
1229 /* Payload setup instructions: Force allocate
1230 * the dst to the given register (so the MOV
1231 * will disappear).
1232 */
1233 assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
1234 assert(inst->dst.file == QFILE_TEMP);
1235 uint32_t node = temp_to_node(c, inst->dst.index);
1236 ra_set_node_reg(c->g, node,
1237 get_phys_index(c->devinfo) +
1238 inst->src[0].index);
1239 break;
1240 }
1241 }
1242 }
1243
1244 /* Don't allocate rf0 to temps that cross ranges where we have
1245 * live implicit rf0 writes from ldvary. We can identify these
1246 * by tracking the last ldvary instruction and explicit reads
1247 * of rf0.
1248 */
1249 if (c->devinfo->ver >= 71 &&
1250 ((inst->src[0].file == QFILE_REG && inst->src[0].index == 0) ||
1251 (vir_get_nsrc(inst) > 1 &&
1252 inst->src[1].file == QFILE_REG && inst->src[1].index == 0))) {
1253 for (int i = 0; i < c->num_temps; i++) {
1254 if (c->temp_start[i] < ip &&
1255 c->temp_end[i] > last_ldvary_ip) {
1256 ra_add_node_interference(c->g,
1257 temp_to_node(c, i),
1258 implicit_rf_nodes[0]);
1259 }
1260 }
1261 }
1262
1263 /* Spill setup instructions are the only ones that we emit before
1264 * reading payload registers, so we want to flag their temps so we
1265 * don't assign them to payload registers and stomp them before we
1266 * can read them. For the case where we may have emitted spill setup
1267 * before RA (i.e. for scratch), we need to do this now.
1268 */
1269 if (c->spill_size > 0 && has_payload && inst_reads_payload(c, inst)) {
1270 struct qblock *first_block = vir_entry_block(c);
1271 list_for_each_entry_from_rev(struct qinst, _i, inst->link.prev,
1272 &first_block->instructions, link) {
1273 if (_i->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
1274 continue;
1275 if (_i->dst.file == QFILE_TEMP) {
1276 int node = temp_to_node(c, _i->dst.index);
1277 c->nodes.info[node].payload_conflict = true;
1278 }
1279 if (_i->src[0].file == QFILE_TEMP) {
1280 int node = temp_to_node(c, _i->src[0].index);
1281 c->nodes.info[node].payload_conflict = true;
1282 }
1283 if (vir_get_nsrc(_i) > 1 && _i->src[1].file == QFILE_TEMP) {
1284 int node = temp_to_node(c, _i->src[1].index);
1285 c->nodes.info[node].payload_conflict = true;
1286 }
1287 }
1288 }
1289
1290 if (inst->dst.file == QFILE_TEMP) {
1291 /* Only a ldunif gets to write to R5, which only has a single
1292 * 32-bit channel of storage. Disallow R5 if we are around
1293 * ldvary sequences, since ldvary writes that register too and
1294 * that would disallow pairing.
1295 *
1296 * NOTE: ldunifa is subject to the same restriction; however, going by
1297 * shader-db it is best to keep r5 exclusive to ldunif, probably
1298 * because ldunif usually has a shorter lifespan, allowing for
1299 * more accumulator reuse and QPU merges.
1300 */
1301 if (c->devinfo->has_accumulators) {
1302 if (!inst->qpu.sig.ldunif ||
1303 (c->s->info.stage == MESA_SHADER_FRAGMENT &&
1304 ip <= last_ldvary_ip + 4)) {
1305 uint8_t class_bits =
1306 get_temp_class_bits(c, inst->dst.index) &
1307 ~CLASS_BITS_R5;
1308 set_temp_class_bits(c, inst->dst.index,
1309 class_bits);
1310
1311 }
1312 } else {
1313 /* Make sure we don't allocate the ldvary's
1314 * destination to rf0, since it would clash
1315 * with its implicit write to that register.
1316 */
1317 if (inst->qpu.sig.ldvary) {
1318 ra_add_node_interference(c->g,
1319 temp_to_node(c, inst->dst.index),
1320 implicit_rf_nodes[0]);
1321 }
1322 /* Flag dst temps from ldunif(a) instructions so we can
1323 * try to assign rf0 to them and avoid converting these
1324 * to ldunif(a)rf, however, we don't want to do this
1325 * when these instructions are nearby ldvary since these
1326 * have implicit writes to rf0 and that would hurt
1327 * pairing.
1328 */
1329 if ((inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) &&
1330 (c->s->info.stage != MESA_SHADER_FRAGMENT ||
1331 ip > last_ldvary_ip + 4)) {
1332 const uint32_t dst_n =
1333 temp_to_node(c, inst->dst.index);
1334 c->nodes.info[dst_n].try_rf0 = true;
1335 }
1336 }
1337 }
1338
1339 /* All accumulators are invalidated across a thread switch. */
1340 if (inst->qpu.sig.thrsw && c->devinfo->has_accumulators) {
1341 for (int i = 0; i < c->num_temps; i++) {
1342 if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
1343 set_temp_class_bits(c, i,
1344 CLASS_BITS_PHYS);
1345 }
1346 }
1347 }
1348 }
1349
1350 static void
1351 flag_program_end_nodes(struct v3d_compile *c)
1352 {
1353 /* Only look for registers used in this many instructions */
1354 uint32_t last_set_count = 6;
1355
1356 struct qblock *last_block = vir_exit_block(c);
1357 list_for_each_entry_rev(struct qinst, inst, &last_block->instructions, link) {
1358 if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
1359 continue;
1360
1361 int num_src = v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
1362 for (int i = 0; i < num_src; i++) {
1363 if (inst->src[i].file == QFILE_TEMP) {
1364 int node = temp_to_node(c, inst->src[i].index);
1365 c->nodes.info[node].is_program_end = true;
1366 }
1367 }
1368
1369 num_src = v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
1370 for (int i = 0; i < num_src; i++) {
1371 if (inst->src[i].file == QFILE_TEMP) {
1372 int node = temp_to_node(c, inst->src[i].index);
1373 c->nodes.info[node].is_program_end = true;
1374
1375 }
1376 }
1377
1378 if (inst->dst.file == QFILE_TEMP) {
1379 int node = temp_to_node(c, inst->dst.index);
1380 c->nodes.info[node].is_program_end = true;
1381 }
1382
1383 if (--last_set_count == 0)
1384 break;
1385 }
1386 }
1387
1388 /**
1389 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
1390 *
1391 * The return value should be freed by the caller.
1392 */
1393 struct qpu_reg *
1394 v3d_register_allocate(struct v3d_compile *c)
1395 {
1396 int acc_nodes[ACC_COUNT];
1397 int implicit_rf_nodes[IMPLICIT_RF_COUNT];
1398
1399 unsigned num_ra_nodes = c->num_temps;
1400 if (c->devinfo->has_accumulators)
1401 num_ra_nodes += ARRAY_SIZE(acc_nodes);
1402 else
1403 num_ra_nodes += ARRAY_SIZE(implicit_rf_nodes);
1404
1405 c->nodes = (struct v3d_ra_node_info) {
1406 .alloc_count = c->num_temps,
1407 .info = ralloc_array_size(c, sizeof(c->nodes.info[0]),
1408 num_ra_nodes),
1409 };
1410
1411 uint32_t phys_index = get_phys_index(c->devinfo);
1412
1413 struct v3d_ra_select_callback_data callback_data = {
1414 .phys_index = phys_index,
1415 .next_acc = 0,
1416 /* Start at RF3, to try to keep the TLB writes from using
1417 * RF0-2. Start at RF4 in 7.x to prevent TLB writes from
1418 * using RF2-3.
1419 */
1420 .next_phys = c->devinfo->ver == 42 ? 3 : 4,
1421 .nodes = &c->nodes,
1422 .devinfo = c->devinfo,
1423 };
1424
1425 vir_calculate_live_intervals(c);
1426
1427 /* Convert 1, 2, 4 threads to 0, 1, 2 index.
1428 *
1429 * V3D 4.x has double the physical register space, so 64 physical regs
1430 * are available at both 1x and 2x threading, and 4x has 32.
1431 */
1432 c->thread_index = ffs(c->threads) - 1;
1433 if (c->thread_index >= 1)
1434 c->thread_index--;
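/* Net effect: 1 or 2 threads use class index 0 (all 64 rf registers), while
 * 4 threads use index 1 (32 rf registers per class).
 */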
1435
1436 c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes);
1437 ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data);
1438
1439 /* Make some fixed nodes for the accumulators, which we will need to
1440 * interfere with when ops have implied r3/r4 writes or for the thread
1441 * switches. We could represent these as classes for the nodes to
1442 * live in, but the classes take up a lot of memory to set up, so we
1443 * don't want to make too many. We use the same mechanism on platforms
1444 * without accumulators that can have implicit writes to phys regs.
1445 */
1446 for (uint32_t i = 0; i < num_ra_nodes; i++) {
1447 c->nodes.info[i].try_rf0 = false;
1448 c->nodes.info[i].is_program_end = false;
1449 c->nodes.info[i].unused = false;
1450 c->nodes.info[i].priority = 0;
1451 c->nodes.info[i].class_bits = 0;
1452 c->nodes.info[i].payload_conflict = false;
1453 if (c->devinfo->has_accumulators && i < ACC_COUNT) {
1454 acc_nodes[i] = i;
1455 ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
1456 } else if (!c->devinfo->has_accumulators &&
1457 i < ARRAY_SIZE(implicit_rf_nodes)) {
1458 implicit_rf_nodes[i] = i;
1459 ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i);
1460 } else {
1461 uint32_t t = node_to_temp(c, i);
1462 c->nodes.info[i].priority =
1463 c->temp_end[t] - c->temp_start[t];
1464 c->nodes.info[i].class_bits =
1465 get_class_bit_any(c->devinfo);
1466 }
1467 }
1468
1469 /* Walk the instructions adding register class restrictions and
1470 * interferences.
1471 */
1472 int ip = 0;
1473 int last_ldvary_ip = -1;
1474 bool has_payload = stage_has_payload(c);
1475 vir_for_each_inst_inorder(inst, c) {
1476 inst->ip = ip++;
1477
1478 /* ldunif(a) always write to a temporary, so we have
1479 * liveness info available to decide if rf0 is
1480 * available for them, however, ldvary is different:
1481 * it always writes to rf0 directly so we don't have
1482 * liveness information for its implicit rf0 write.
1483 *
1484 * That means the allocator may assign rf0 to a temp
1485 * that is defined while an implicit rf0 write from
1486 * ldvary is still live. We fix that by manually
1487 * tracking rf0 live ranges from ldvary instructions.
1488 */
1489 if (inst->qpu.sig.ldvary)
1490 last_ldvary_ip = ip;
1491
1492 update_graph_and_reg_classes_for_inst(c, acc_nodes,
1493 implicit_rf_nodes,
1494 last_ldvary_ip,
1495 has_payload,
1496 inst);
1497 }
1498
1499 /* Flag the nodes that are used in the last instructions of the program
1500 * (there are some registers that cannot be used in the last 3
1501 * instructions). We only do this for fragment shaders, because the idea
1502 * is that by avoiding this conflict we may be able to emit the last
1503 * thread switch earlier in some cases, however, in non-fragment shaders
1504 * this won't happen because the last instructions are always VPM stores
1505 * with a small immediate, which conflicts with other signals,
1506 * preventing us from ever moving the thrsw earlier.
1507 */
1508 if (c->s->info.stage == MESA_SHADER_FRAGMENT)
1509 flag_program_end_nodes(c);
1510
1511 /* Set the register classes for all our temporaries in the graph */
1512 for (uint32_t i = 0; i < c->num_temps; i++) {
1513 ra_set_node_class(c->g, temp_to_node(c, i),
1514 choose_reg_class_for_temp(c, i));
1515 }
1516
1517 /* Add register interferences based on liveness data */
1518 for (uint32_t i = 0; i < c->num_temps; i++) {
1519 /* And while we are here, let's also flag nodes for
1520 * unused temps.
1521 */
1522 if (c->temp_start[i] > c->temp_end[i])
1523 c->nodes.info[temp_to_node(c, i)].unused = true;
1524
1525 for (uint32_t j = i + 1; j < c->num_temps; j++) {
1526 if (interferes(c->temp_start[i], c->temp_end[i],
1527 c->temp_start[j], c->temp_end[j])) {
1528 ra_add_node_interference(c->g,
1529 temp_to_node(c, i),
1530 temp_to_node(c, j));
1531 }
1532 }
1533 }
1534
1535 /* Debug option to force a bit of TMU spilling, for running
1536 * across conformance tests to make sure that spilling works.
1537 */
1538 const int force_register_spills = 0;
1539 if (force_register_spills > 0)
1540 c->max_tmu_spills = UINT32_MAX;
1541
1542 struct qpu_reg *temp_registers = NULL;
1543 while (true) {
1544 if (c->spill_size <
1545 V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
1546 int node = v3d_choose_spill_node(c);
1547 uint32_t temp = node_to_temp(c, node);
1548 if (node != -1) {
1549 v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
1550 continue;
1551 }
1552 }
1553
1554 if (ra_allocate(c->g))
1555 break;
1556
1557 /* Failed allocation, try to spill */
1558 int node = v3d_choose_spill_node(c);
1559 if (node == -1)
1560 goto spill_fail;
1561
1562 uint32_t temp = node_to_temp(c, node);
1563 enum temp_spill_type spill_type =
1564 get_spill_type_for_temp(c, temp);
1565 if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) {
1566 v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
1567 if (c->spills + c->fills > c->max_tmu_spills)
1568 goto spill_fail;
1569 } else {
1570 goto spill_fail;
1571 }
1572 }
1573
1574 /* Allocation was successful, build the 'temp -> reg' map */
1575 temp_registers = calloc(c->num_temps, sizeof(*temp_registers));
1576 for (uint32_t i = 0; i < c->num_temps; i++) {
1577 int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i));
1578 if (ra_reg < phys_index) {
1579 temp_registers[i].magic = true;
1580 temp_registers[i].index = (V3D_QPU_WADDR_R0 +
1581 ra_reg - ACC_INDEX);
1582 } else {
1583 temp_registers[i].magic = false;
1584 temp_registers[i].index = ra_reg - phys_index;
1585 }
1586 }
1587
1588 spill_fail:
1589 ralloc_free(c->nodes.info);
1590 c->nodes.info = NULL;
1591 c->nodes.alloc_count = 0;
1592 ralloc_free(c->g);
1593 c->g = NULL;
1594 return temp_registers;
1595 }
1596