/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/ralloc.h"
#include "util/register_allocate.h"
#include "common/v3d_device_info.h"
#include "v3d_compiler.h"

#define QPU_R(i) { .magic = false, .index = i }

#define ACC_INDEX     0
#define ACC_COUNT     6
#define PHYS_INDEX    (ACC_INDEX + ACC_COUNT)
#define PHYS_COUNT    64

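/* Returns true if the instruction writes the TMU, either through one of the
 * TMU magic register addresses or via the WRTMUC signal.
 */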
static inline bool
qinst_writes_tmu(struct qinst *inst)
{
        return (inst->dst.file == QFILE_MAGIC &&
                v3d_qpu_magic_waddr_is_tmu(inst->dst.index)) ||
                inst->qpu.sig.wrtmuc;
}

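/* Returns true if this instruction ends a TMU sequence: either it is a
 * TMUWT, or it is the last LDTMU of the sequence (no further LDTMU shows up
 * before the next TMU write in the block).
 */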
static bool
is_end_of_tmu_sequence(struct qinst *inst, struct qblock *block)
{
        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
            inst->qpu.alu.add.op == V3D_QPU_A_TMUWT)
                return true;

        if (!inst->qpu.sig.ldtmu)
                return false;

        list_for_each_entry_from(struct qinst, scan_inst, inst->link.next,
                                 &block->instructions, link) {
                if (scan_inst->qpu.sig.ldtmu)
                        return false;
                if (qinst_writes_tmu(scan_inst))
                        return true;
        }

        return true;
}

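/* Returns true if the temp is defined by a plain ldunif, in which case we
 * can rematerialize the uniform load instead of spilling it to scratch.
 */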
static bool
vir_is_mov_uniform(struct v3d_compile *c, int temp)
{
        struct qinst *def = c->defs[temp];

        return def && def->qpu.sig.ldunif;
}

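/* Assigns a spill cost to each spillable temp: rematerializable uniforms
 * are cheap, anything that would need a real TMU spill/fill is scaled up,
 * and temps that can't be spilled at all (e.g. past the last thread switch)
 * are dropped from the spillable set.  Returns the best node to spill
 * according to the RA core, or -1 if nothing can be spilled.
 */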
static int
v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
                      uint32_t *temp_to_node)
{
        const float tmu_scale = 5;
        float block_scale = 1.0;
        float spill_costs[c->num_temps];
        bool in_tmu_operation = false;
        bool started_last_seg = false;

        for (unsigned i = 0; i < c->num_temps; i++)
                spill_costs[i] = 0.0;

        /* XXX: Scale the cost up when inside of a loop. */
        vir_for_each_block(block, c) {
                vir_for_each_inst(inst, block) {
                        /* We can't insert new thread switches after
                         * starting output writes.
                         */
                        bool no_spilling =
                                c->threads > 1 && started_last_seg;

                        /* Discourage spilling of TMU operations */
                        for (int i = 0; i < vir_get_nsrc(inst); i++) {
                                if (inst->src[i].file != QFILE_TEMP)
                                        continue;

                                int temp = inst->src[i].index;
                                if (vir_is_mov_uniform(c, temp)) {
                                        spill_costs[temp] += block_scale;
                                } else if (!no_spilling) {
                                        float tmu_op_scale = in_tmu_operation ?
                                                3.0 : 1.0;
                                        spill_costs[temp] += (block_scale *
                                                              tmu_scale *
                                                              tmu_op_scale);
                                } else {
                                        BITSET_CLEAR(c->spillable, temp);
                                }
                        }

                        if (inst->dst.file == QFILE_TEMP) {
                                int temp = inst->dst.index;

                                if (vir_is_mov_uniform(c, temp)) {
                                        /* We just rematerialize the uniform
                                         * later.
                                         */
                                } else if (!no_spilling) {
                                        spill_costs[temp] += (block_scale *
                                                              tmu_scale);
                                } else {
                                        BITSET_CLEAR(c->spillable, temp);
                                }
                        }

                        /* Refuse to spill a ldvary's dst, because that means
                         * that ldvary's r5 would end up being used across a
                         * thrsw.
                         */
                        if (inst->qpu.sig.ldvary) {
                                assert(inst->dst.file == QFILE_TEMP);
                                BITSET_CLEAR(c->spillable, inst->dst.index);
                        }

                        if (inst->is_last_thrsw)
                                started_last_seg = true;

                        if (v3d_qpu_writes_vpm(&inst->qpu) ||
                            v3d_qpu_uses_tlb(&inst->qpu))
                                started_last_seg = true;

                        /* Track when we're in between a TMU setup and the
                         * final LDTMU or TMUWT from that TMU setup.  We
                         * penalize spills during that time.
                         */
                        if (is_end_of_tmu_sequence(inst, block))
                                in_tmu_operation = false;

                        if (qinst_writes_tmu(inst))
                                in_tmu_operation = true;
                }
        }

        for (unsigned i = 0; i < c->num_temps; i++) {
                int node = temp_to_node[i];

                if (BITSET_TEST(c->spillable, i))
                        ra_set_node_spill_cost(g, node, spill_costs[i]);
        }

        return ra_get_best_spill_node(g);
}

/* The spill offset for this thread takes a bit of setup, so do it once at
 * program start.
 */
void
v3d_setup_spill_base(struct v3d_compile *c)
{
        c->cursor = vir_before_block(vir_entry_block(c));

        int start_num_temps = c->num_temps;

        /* Each thread wants to be in a separate region of the scratch space
         * so that the QPUs aren't fighting over cache lines.  We have the
         * driver keep a single global spill BO rather than
         * per-spilling-program BOs, so we need a uniform from the driver for
         * what the per-thread scale is.
         */
        struct qreg thread_offset =
                vir_UMUL(c,
                         vir_TIDX(c),
                         vir_uniform(c, QUNIFORM_SPILL_SIZE_PER_THREAD, 0));

        /* Each channel in a reg is 4 bytes, so scale them up by that. */
        struct qreg element_offset = vir_SHL(c, vir_EIDX(c),
                                             vir_uniform_ui(c, 2));

        c->spill_base = vir_ADD(c,
                                vir_ADD(c, thread_offset, element_offset),
                                vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0));

        /* Make sure that we don't spill the spilling setup instructions. */
        for (int i = start_num_temps; i < c->num_temps; i++)
                BITSET_CLEAR(c->spillable, i);

        c->cursor = vir_after_block(c->cur_block);
}

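/* Emits the TMUA write that addresses this spill slot: spill_base plus the
 * temp's byte offset into the per-thread scratch space.
 */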
static void
v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset)
{
        vir_ADD_dest(c, vir_reg(QFILE_MAGIC,
                                V3D_QPU_WADDR_TMUA),
                     c->spill_base,
                     vir_uniform_ui(c, spill_offset));
}


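/* Emits a TMU store of inst's (renamed) destination to the scratch buffer
 * right after "position": a TMUD data write, the TMUA address write, a
 * thread switch, and a TMUWT to wait for the write to land.
 */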
static void
v3d_emit_tmu_spill(struct v3d_compile *c, struct qinst *inst,
                   struct qinst *position, uint32_t spill_offset)
{
        c->cursor = vir_after_inst(position);
        inst->dst.index = c->num_temps++;
        vir_MOV_dest(c, vir_reg(QFILE_MAGIC,
                                V3D_QPU_WADDR_TMUD),
                     inst->dst);
        v3d_emit_spill_tmua(c, spill_offset);
        vir_emit_thrsw(c);
        vir_TMUWT(c);
        c->spills++;
        c->tmu_dirty_rcl = true;
}

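/* Spills a single temp.  Rematerializable uniforms simply reload the
 * uniform at each use; anything else is written to the scratch buffer
 * through the TMU after each def and filled back with an LDTMU before each
 * use, taking care not to break up existing TMU sequences.
 */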
static void
v3d_spill_reg(struct v3d_compile *c, int spill_temp)
{
        c->spill_count++;

        bool is_uniform = vir_is_mov_uniform(c, spill_temp);

        uint32_t spill_offset = 0;

        if (!is_uniform) {
                spill_offset = c->spill_size;
                c->spill_size += V3D_CHANNELS * sizeof(uint32_t);

                if (spill_offset == 0)
                        v3d_setup_spill_base(c);
        }

        struct qinst *last_thrsw = c->last_thrsw;
        assert(!last_thrsw || last_thrsw->is_last_thrsw);

        int start_num_temps = c->num_temps;

        int uniform_index = ~0;
        if (is_uniform) {
                struct qinst *orig_unif = c->defs[spill_temp];
                uniform_index = orig_unif->uniform;
        }

        struct qinst *start_of_tmu_sequence = NULL;
        struct qinst *postponed_spill = NULL;
        vir_for_each_block(block, c) {
                vir_for_each_inst_safe(inst, block) {
                        /* Track when we're in between a TMU setup and the final
                         * LDTMU or TMUWT from that TMU setup. We can't spill/fill any
                         * temps during that time, because that involves inserting a
                         * new TMU setup/LDTMU sequence, so we postpone the spill or
                         * move the fill up to not intrude in the middle of the TMU
                         * sequence.
                         */
                        if (is_end_of_tmu_sequence(inst, block)) {
                                if (postponed_spill) {
                                        v3d_emit_tmu_spill(c, postponed_spill,
                                                           inst, spill_offset);
                                }

                                start_of_tmu_sequence = NULL;
                                postponed_spill = NULL;
                        }

                        if (!start_of_tmu_sequence && qinst_writes_tmu(inst))
                                start_of_tmu_sequence = inst;

                        /* fills */
                        for (int i = 0; i < vir_get_nsrc(inst); i++) {
                                if (inst->src[i].file != QFILE_TEMP ||
                                    inst->src[i].index != spill_temp) {
                                        continue;
                                }

                                c->cursor = vir_before_inst(inst);

                                if (is_uniform) {
                                        struct qreg unif =
                                                vir_uniform(c,
                                                            c->uniform_contents[uniform_index],
                                                            c->uniform_data[uniform_index]);
                                        inst->src[i] = unif;
                                } else {
                                        /* If we have a postponed spill, we don't need
                                         * a fill as the temp would not have been
                                         * spilled yet.
                                         */
                                        if (postponed_spill)
                                                continue;
                                        if (start_of_tmu_sequence)
                                                c->cursor = vir_before_inst(start_of_tmu_sequence);

                                        v3d_emit_spill_tmua(c, spill_offset);
                                        vir_emit_thrsw(c);
                                        inst->src[i] = vir_LDTMU(c);
                                        c->fills++;
                                }
                        }

                        /* spills */
                        if (inst->dst.file == QFILE_TEMP &&
                            inst->dst.index == spill_temp) {
                                if (is_uniform) {
                                        c->cursor.link = NULL;
                                        vir_remove_instruction(c, inst);
                                } else {
                                        if (start_of_tmu_sequence)
                                                postponed_spill = inst;
                                        else
                                                v3d_emit_tmu_spill(c, inst, inst,
                                                                   spill_offset);
                                }
                        }

                        /* If we didn't have a last-thrsw inserted by nir_to_vir and
                         * we've been inserting thrsws, then insert a new last_thrsw
                         * right before we start the vpm/tlb sequence for the last
                         * thread segment.
                         */
                        if (!is_uniform && !last_thrsw && c->last_thrsw &&
                            (v3d_qpu_writes_vpm(&inst->qpu) ||
                             v3d_qpu_uses_tlb(&inst->qpu))) {
                                c->cursor = vir_before_inst(inst);
                                vir_emit_thrsw(c);

                                last_thrsw = c->last_thrsw;
                                last_thrsw->is_last_thrsw = true;
                        }
                }
        }

        /* Make sure c->last_thrsw is the actual last thrsw, not just one we
         * inserted in our most recent unspill.
         */
        if (last_thrsw)
                c->last_thrsw = last_thrsw;

        /* Don't allow spilling of our spilling instructions.  There's no way
         * they can help get things colored.
         */
        for (int i = start_num_temps; i < c->num_temps; i++)
                BITSET_CLEAR(c->spillable, i);
}

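/* Round-robin state for v3d_ra_select_callback(), so consecutive
 * allocations don't all land on the same accumulator or phys register.
 */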
struct v3d_ra_select_callback_data {
        uint32_t next_acc;
        uint32_t next_phys;
};

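/* Register selection callback for the RA core: prefer r5, then the other
 * accumulators (round-robin), then the phys register file (also
 * round-robin).
 */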
static unsigned int
v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
{
        struct v3d_ra_select_callback_data *v3d_ra = data;
        int r5 = ACC_INDEX + 5;

        /* Choose r5 for our ldunifs if possible (nobody else can load to that
         * reg, and it keeps the QPU cond field free from being occupied by
         * ldunifrf).
         */
        if (BITSET_TEST(regs, r5))
                return r5;

        /* Choose an accumulator if possible (I think it's lower power than
         * phys regs), but round-robin through them to give post-RA
         * instruction selection more options.
         */
        for (int i = 0; i < ACC_COUNT; i++) {
                int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT;
                int acc = ACC_INDEX + acc_off;

                if (BITSET_TEST(regs, acc)) {
                        v3d_ra->next_acc = acc_off + 1;
                        return acc;
                }
        }

        for (int i = 0; i < PHYS_COUNT; i++) {
                int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
                int phys = PHYS_INDEX + phys_off;

                if (BITSET_TEST(regs, phys)) {
                        v3d_ra->next_phys = phys_off + 1;
                        return phys;
                }
        }

        unreachable("RA must pass us at least one possible reg.");
}

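/* Builds the per-thread-count register classes (any, r5, phys-or-acc and
 * phys-only) that the allocator chooses from.  Higher thread counts get a
 * smaller slice of the phys register file.
 */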
bool
vir_init_reg_sets(struct v3d_compiler *compiler)
{
        /* Allocate up to 3 regfile classes, for the ways the physical
         * register file can be divided up for fragment shader threading.
         */
        int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3);

        compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
                                          true);
        if (!compiler->regs)
                return false;

        for (int threads = 0; threads < max_thread_index; threads++) {
                compiler->reg_class_any[threads] =
                        ra_alloc_reg_class(compiler->regs);
                compiler->reg_class_r5[threads] =
                        ra_alloc_reg_class(compiler->regs);
                compiler->reg_class_phys_or_acc[threads] =
                        ra_alloc_reg_class(compiler->regs);
                compiler->reg_class_phys[threads] =
                        ra_alloc_reg_class(compiler->regs);

                for (int i = PHYS_INDEX;
                     i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
                        ra_class_add_reg(compiler->regs,
                                         compiler->reg_class_phys_or_acc[threads], i);
                        ra_class_add_reg(compiler->regs,
                                         compiler->reg_class_phys[threads], i);
                        ra_class_add_reg(compiler->regs,
                                         compiler->reg_class_any[threads], i);
                }

                for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
                        ra_class_add_reg(compiler->regs,
                                         compiler->reg_class_phys_or_acc[threads], i);
                        ra_class_add_reg(compiler->regs,
                                         compiler->reg_class_any[threads], i);
                }
                /* r5 can only store a single 32-bit value, so not much can
                 * use it.
                 */
                ra_class_add_reg(compiler->regs,
                                 compiler->reg_class_r5[threads],
                                 ACC_INDEX + 5);
                ra_class_add_reg(compiler->regs,
                                 compiler->reg_class_any[threads],
                                 ACC_INDEX + 5);
        }

        ra_set_finalize(compiler->regs, NULL);

        return true;
}

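/* Pairs a temp with its allocation priority (its live-range length).  The
 * map is sorted by priority so that node numbers are handed out to temps in
 * order of increasing live-range length.
 */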
struct node_to_temp_map {
        uint32_t temp;
        uint32_t priority;
};

static int
node_to_temp_priority(const void *in_a, const void *in_b)
{
        const struct node_to_temp_map *a = in_a;
        const struct node_to_temp_map *b = in_b;

        return a->priority - b->priority;
}

/**
 * Computes the number of registers to spill in a batch after a register
 * allocation failure.
 */
static uint32_t
get_spill_batch_size(struct v3d_compile *c)
{
   /* Allow up to 10 spills in batches of 1 in any case to avoid any chance of
    * over-spilling if the program requires few spills to compile.
    */
   if (c->spill_count < 10)
           return 1;

   /* If we have to spill more than that we assume performance is not going to
    * be great and we shift focus to batching spills to cut down compile
    * time at the expense of over-spilling.
    */
   return 20;
}

#define CLASS_BIT_PHYS			(1 << 0)
#define CLASS_BIT_ACC			(1 << 1)
#define CLASS_BIT_R5			(1 << 4)
#define CLASS_BITS_ANY			(CLASS_BIT_PHYS | \
                                         CLASS_BIT_ACC | \
                                         CLASS_BIT_R5)

/**
 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
 *
 * The return value should be freed by the caller.
 */
struct qpu_reg *
v3d_register_allocate(struct v3d_compile *c, bool *spilled)
{
        struct node_to_temp_map map[c->num_temps];
        uint32_t temp_to_node[c->num_temps];
        uint8_t class_bits[c->num_temps];
        int acc_nodes[ACC_COUNT];
        struct v3d_ra_select_callback_data callback_data = {
                .next_acc = 0,
                /* Start at RF3, to try to keep the TLB writes from using
                 * RF0-2.
                 */
                .next_phys = 3,
        };

        *spilled = false;

        vir_calculate_live_intervals(c);

        /* Convert 1, 2, 4 threads to 0, 1, 2 index.
         *
         * V3D 4.x has double the physical register space, so 64 physical regs
         * are available at both 1x and 2x threading, and 4x has 32.
         */
        int thread_index = ffs(c->threads) - 1;
        if (c->devinfo->ver >= 40) {
                if (thread_index >= 1)
                        thread_index--;
        }

        struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
                                                         c->num_temps +
                                                         ARRAY_SIZE(acc_nodes));
        ra_set_select_reg_callback(g, v3d_ra_select_callback, &callback_data);

        /* Make some fixed nodes for the accumulators, which we will need to
         * interfere with when ops have implied r3/r4 writes or for the thread
         * switches.  We could represent these as classes for the nodes to
         * live in, but the classes take up a lot of memory to set up, so we
         * don't want to make too many.
         */
        for (int i = 0; i < ARRAY_SIZE(acc_nodes); i++) {
                acc_nodes[i] = c->num_temps + i;
                ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i);
        }

        for (uint32_t i = 0; i < c->num_temps; i++) {
                map[i].temp = i;
                map[i].priority = c->temp_end[i] - c->temp_start[i];
        }
        qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
        for (uint32_t i = 0; i < c->num_temps; i++) {
                temp_to_node[map[i].temp] = i;
        }

        /* Figure out our register classes and preallocated registers.  We
         * start with any temp being able to be in any file, then instructions
         * incrementally remove bits that the temp definitely can't be in.
         */
        memset(class_bits, CLASS_BITS_ANY, sizeof(class_bits));

        int ip = 0;
        vir_for_each_inst_inorder(inst, c) {
                /* If the instruction writes r3/r4 (and optionally moves its
                 * result to a temp), nothing else can be stored in r3/r4 across
                 * it.
                 */
                if (vir_writes_r3(c->devinfo, inst)) {
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip &&
                                    c->temp_end[i] > ip) {
                                        ra_add_node_interference(g,
                                                                 temp_to_node[i],
                                                                 acc_nodes[3]);
                                }
                        }
                }
                if (vir_writes_r4(c->devinfo, inst)) {
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip &&
                                    c->temp_end[i] > ip) {
                                        ra_add_node_interference(g,
                                                                 temp_to_node[i],
                                                                 acc_nodes[4]);
                                }
                        }
                }

                if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
                        switch (inst->qpu.alu.add.op) {
                        case V3D_QPU_A_LDVPMV_IN:
                        case V3D_QPU_A_LDVPMV_OUT:
                        case V3D_QPU_A_LDVPMD_IN:
                        case V3D_QPU_A_LDVPMD_OUT:
                        case V3D_QPU_A_LDVPMP:
                        case V3D_QPU_A_LDVPMG_IN:
                        case V3D_QPU_A_LDVPMG_OUT:
                                /* LDVPMs only store to temps (the MA flag
                                 * decides whether the LDVPM is in or out)
                                 */
                                assert(inst->dst.file == QFILE_TEMP);
                                class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
                                break;

                        case V3D_QPU_A_RECIP:
                        case V3D_QPU_A_RSQRT:
                        case V3D_QPU_A_EXP:
                        case V3D_QPU_A_LOG:
                        case V3D_QPU_A_SIN:
                        case V3D_QPU_A_RSQRT2:
                                /* The SFU instructions write directly to the
                                 * phys regfile.
                                 */
                                assert(inst->dst.file == QFILE_TEMP);
                                class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
                                break;

                        default:
                                break;
                        }
                }

                if (inst->src[0].file == QFILE_REG) {
                        switch (inst->src[0].index) {
                        case 0:
                        case 1:
                        case 2:
                        case 3:
                                /* Payload setup instructions: Force allocate
                                 * the dst to the given register (so the MOV
                                 * will disappear).
                                 */
                                assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
                                assert(inst->dst.file == QFILE_TEMP);
                                ra_set_node_reg(g,
                                                temp_to_node[inst->dst.index],
                                                PHYS_INDEX +
                                                inst->src[0].index);
                                break;
                        }
                }

                if (inst->dst.file == QFILE_TEMP) {
                        /* Only a ldunif gets to write to R5, which only has a
                         * single 32-bit channel of storage.
                         */
                        if (!inst->qpu.sig.ldunif) {
                                class_bits[inst->dst.index] &= ~CLASS_BIT_R5;
                        } else {
                                /* Until V3D 4.x, we could only load a uniform
                                 * to r5, so we'll need to spill if uniform
                                 * loads interfere with each other.
                                 */
                                if (c->devinfo->ver < 40) {
                                        class_bits[inst->dst.index] &=
                                                CLASS_BIT_R5;
                                }
                        }
                }

                if (inst->qpu.sig.thrsw) {
                        /* All accumulators are invalidated across a thread
                         * switch.
                         */
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip && c->temp_end[i] > ip)
                                        class_bits[i] &= CLASS_BIT_PHYS;
                        }
                }

                ip++;
        }

        for (uint32_t i = 0; i < c->num_temps; i++) {
                if (class_bits[i] == CLASS_BIT_PHYS) {
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_phys[thread_index]);
                } else if (class_bits[i] == (CLASS_BIT_R5)) {
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_r5[thread_index]);
                } else if (class_bits[i] == (CLASS_BIT_PHYS | CLASS_BIT_ACC)) {
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_phys_or_acc[thread_index]);
                } else {
                        assert(class_bits[i] == CLASS_BITS_ANY);
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_any[thread_index]);
                }
        }

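        /* Add interference between any two temps whose live ranges overlap.
         */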
        for (uint32_t i = 0; i < c->num_temps; i++) {
                for (uint32_t j = i + 1; j < c->num_temps; j++) {
                        if (!(c->temp_start[i] >= c->temp_end[j] ||
                              c->temp_start[j] >= c->temp_end[i])) {
                                ra_add_node_interference(g,
                                                         temp_to_node[i],
                                                         temp_to_node[j]);
                        }
                }
        }

        /* Debug code to force a bit of register spilling, for running across
         * conformance tests to make sure that spilling works.
         */
        int force_register_spills = 0;
        if (c->spill_size <
            V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
                int node = v3d_choose_spill_node(c, g, temp_to_node);
                if (node != -1) {
                        v3d_spill_reg(c, map[node].temp);
                        ralloc_free(g);
                        *spilled = true;
                        return NULL;
                }
        }

        bool ok = ra_allocate(g);
        if (!ok) {
                const uint32_t spill_batch_size = get_spill_batch_size(c);

                for (uint32_t i = 0; i < spill_batch_size; i++) {
                        int node = v3d_choose_spill_node(c, g, temp_to_node);
                        if (node == -1)
                           break;

                        /* TMU spills inject thrsw signals that invalidate
                         * accumulators, so we can't batch them.
                         */
                        bool is_uniform = vir_is_mov_uniform(c, map[node].temp);
                        if (i > 0 && !is_uniform)
                                break;

                        /* Don't emit spills using the TMU until we've dropped
                         * thread count first.
                         */
                        if (is_uniform || thread_index == 0) {
                                v3d_spill_reg(c, map[node].temp);

                                /* Ask the outer loop to call back in. */
                                *spilled = true;

                                /* See comment above about batching TMU spills.
                                 */
                                if (!is_uniform) {
                                        assert(i == 0);
                                        break;
                                }
                        } else {
                                break;
                        }
                }

                ralloc_free(g);
                return NULL;
        }

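        /* Translate the RA node assignments back into QPU registers:
         * accumulator indices become the r0-r5 magic waddrs, everything else
         * maps onto the phys register file.
         */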
        struct qpu_reg *temp_registers = calloc(c->num_temps,
                                                sizeof(*temp_registers));

        for (uint32_t i = 0; i < c->num_temps; i++) {
                int ra_reg = ra_get_node_reg(g, temp_to_node[i]);
                if (ra_reg < PHYS_INDEX) {
                        temp_registers[i].magic = true;
                        temp_registers[i].index = (V3D_QPU_WADDR_R0 +
                                                   ra_reg - ACC_INDEX);
                } else {
                        temp_registers[i].magic = false;
                        temp_registers[i].index = ra_reg - PHYS_INDEX;
                }
        }

        ralloc_free(g);

        return temp_registers;
}