1 /*
2  * Copyright © 2014 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "util/ralloc.h"
25 #include "util/register_allocate.h"
26 #include "common/v3d_device_info.h"
27 #include "v3d_compiler.h"
28 
29 #define ACC_INDEX     0
30 #define ACC_COUNT     6
31 
32 /* RA nodes used to track RF registers with implicit writes */
33 #define IMPLICIT_RF_COUNT 1
34 
35 #define PHYS_COUNT 64
36 
37 static uint8_t
38 get_phys_index(const struct v3d_device_info *devinfo)
39 {
40         if (devinfo->has_accumulators)
41                 return ACC_INDEX + ACC_COUNT;
42         else
43                 return 0;
44 }
45 
46 /* ACC as accumulator */
47 #define CLASS_BITS_PHYS   (1 << 0)
48 #define CLASS_BITS_ACC    (1 << 1)
49 #define CLASS_BITS_R5     (1 << 4)
50 
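/* Fragment and compute are the stages that receive a payload in registers
 * at the start of the program (see reg_is_payload() below).
 */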
51 static inline bool
52 stage_has_payload(struct v3d_compile *c)
53 {
54         return c->s->info.stage == MESA_SHADER_FRAGMENT ||
55                c->s->info.stage == MESA_SHADER_COMPUTE;
56 }
57 
58 static uint8_t
59 get_class_bit_any(const struct v3d_device_info *devinfo)
60 {
61         if (devinfo->has_accumulators)
62                 return (CLASS_BITS_PHYS | CLASS_BITS_ACC | CLASS_BITS_R5);
63         else
64                 return CLASS_BITS_PHYS;
65 }
66 
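/* Restricts a class bit mask to the physical register file on devices
 * without accumulators.
 */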
67 static uint8_t
68 filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits)
69 {
70    if (!devinfo->has_accumulators) {
71       assert(class_bits & CLASS_BITS_PHYS);
72       class_bits = CLASS_BITS_PHYS;
73    }
74    return class_bits;
75 }
76 
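/* The first ACC_COUNT (or IMPLICIT_RF_COUNT on devices without accumulators)
 * RA nodes are reserved for the fixed accumulator / implicitly-written RF
 * registers, so temps are offset by that amount when mapped to nodes.
 */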
77 static inline uint32_t
78 temp_to_node(struct v3d_compile *c, uint32_t temp)
79 {
80         return temp + (c->devinfo->has_accumulators ? ACC_COUNT :
81                                                       IMPLICIT_RF_COUNT);
82 }
83 
84 static inline uint32_t
85 node_to_temp(struct v3d_compile *c, uint32_t node)
86 {
87         assert((c->devinfo->has_accumulators && node >= ACC_COUNT) ||
88                (!c->devinfo->has_accumulators && node >= IMPLICIT_RF_COUNT));
89         return node - (c->devinfo->has_accumulators ? ACC_COUNT :
90                                                       IMPLICIT_RF_COUNT);
91 }
92 
93 static inline uint8_t
94 get_temp_class_bits(struct v3d_compile *c,
95                     uint32_t temp)
96 {
97         return c->nodes.info[temp_to_node(c, temp)].class_bits;
98 }
99 
100 static inline void
101 set_temp_class_bits(struct v3d_compile *c,
102                     uint32_t temp, uint8_t class_bits)
103 {
104         c->nodes.info[temp_to_node(c, temp)].class_bits = class_bits;
105 }
106 
107 static struct ra_class *
108 choose_reg_class(struct v3d_compile *c, uint8_t class_bits)
109 {
110         if (class_bits == CLASS_BITS_PHYS) {
111                 return c->compiler->reg_class_phys[c->thread_index];
112         } else if (class_bits == (CLASS_BITS_R5)) {
113                 assert(c->devinfo->has_accumulators);
114                 return c->compiler->reg_class_r5[c->thread_index];
115         } else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) {
116                 assert(c->devinfo->has_accumulators);
117                 return c->compiler->reg_class_phys_or_acc[c->thread_index];
118         } else {
119                 assert(class_bits == get_class_bit_any(c->devinfo));
120                 return c->compiler->reg_class_any[c->thread_index];
121         }
122 }
123 
124 static inline struct ra_class *
125 choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp)
126 {
127         assert(temp < c->num_temps && temp < c->nodes.alloc_count);
128         return choose_reg_class(c, get_temp_class_bits(c, temp));
129 }
130 
131 static inline bool
132 qinst_writes_tmu(const struct v3d_device_info *devinfo,
133                  struct qinst *inst)
134 {
135         return (inst->dst.file == QFILE_MAGIC &&
136                 v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index)) ||
137                 inst->qpu.sig.wrtmuc;
138 }
139 
140 static bool
141 is_end_of_tmu_sequence(const struct v3d_device_info *devinfo,
142                        struct qinst *inst, struct qblock *block)
143 {
144         /* Only tmuwt and ldtmu can finish TMU sequences */
145         bool is_tmuwt = inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
146                         inst->qpu.alu.add.op == V3D_QPU_A_TMUWT;
147         bool is_ldtmu = inst->qpu.sig.ldtmu;
148         if (!is_tmuwt && !is_ldtmu)
149                 return false;
150 
151         /* Check if this is the last tmuwt or ldtmu in the sequence */
152         list_for_each_entry_from(struct qinst, scan_inst, inst->link.next,
153                                  &block->instructions, link) {
154                 is_tmuwt = scan_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
155                            scan_inst->qpu.alu.add.op == V3D_QPU_A_TMUWT;
156                 is_ldtmu = scan_inst->qpu.sig.ldtmu;
157 
158                 if (is_tmuwt || is_ldtmu)
159                         return false;
160 
161                 if (qinst_writes_tmu(devinfo, scan_inst))
162                         return true;
163         }
164 
165         return true;
166 }
167 
168 static bool
169 vir_is_mov_uniform(struct v3d_compile *c, int temp)
170 {
171         struct qinst *def = c->defs[temp];
172 
173         return def && def->qpu.sig.ldunif;
174 }
175 
176 static bool
177 can_reconstruct_inst(struct qinst *inst)
178 {
179         assert(inst);
180 
181         if (vir_is_add(inst)) {
182                 switch (inst->qpu.alu.add.op) {
183                 case V3D_QPU_A_FXCD:
184                 case V3D_QPU_A_FYCD:
185                 case V3D_QPU_A_XCD:
186                 case V3D_QPU_A_YCD:
187                 case V3D_QPU_A_IID:
188                 case V3D_QPU_A_EIDX:
189                 case V3D_QPU_A_TIDX:
190                 case V3D_QPU_A_SAMPID:
191                         /* No need to check input unpacks because none of these
192                          * opcodes read sources. FXCD and FYCD have pack variants.
193                          */
194                         return inst->qpu.flags.ac == V3D_QPU_COND_NONE &&
195                                inst->qpu.flags.auf == V3D_QPU_UF_NONE &&
196                                inst->qpu.flags.apf == V3D_QPU_PF_NONE &&
197                                inst->qpu.alu.add.output_pack == V3D_QPU_PACK_NONE;
198                 default:
199                         return false;
200                 }
201         }
202 
203         return false;
204 }
205 
206 static bool
207 can_reconstruct_temp(struct v3d_compile *c, int temp)
208 {
209         struct qinst *def = c->defs[temp];
210         return def && can_reconstruct_inst(def);
211 }
212 
213 static struct qreg
214 reconstruct_temp(struct v3d_compile *c, enum v3d_qpu_add_op op)
215 {
216         struct qreg dest;
217         switch (op) {
218         case V3D_QPU_A_FXCD:
219                 dest = vir_FXCD(c);
220                 break;
221         case V3D_QPU_A_FYCD:
222                 dest = vir_FYCD(c);
223                 break;
224         case V3D_QPU_A_XCD:
225                 dest = vir_XCD(c);
226                 break;
227         case V3D_QPU_A_YCD:
228                 dest = vir_YCD(c);
229                 break;
230         case V3D_QPU_A_IID:
231                 dest = vir_IID(c);
232                 break;
233         case V3D_QPU_A_EIDX:
234                 dest = vir_EIDX(c);
235                 break;
236         case V3D_QPU_A_TIDX:
237                 dest = vir_TIDX(c);
238                 break;
239         case V3D_QPU_A_SAMPID:
240                 dest = vir_SAMPID(c);
241                 break;
242         default:
243                 unreachable("Unexpected opcode for reconstruction");
244         }
245 
246         return dest;
247 }
248 
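/* Spill strategies, cheapest first: a uniform can be reloaded with another
 * ldunif, some instruction results can simply be recomputed, and everything
 * else has to go through TMU scratch memory.
 */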
249 enum temp_spill_type {
250         SPILL_TYPE_UNIFORM,
251         SPILL_TYPE_RECONSTRUCT,
252         SPILL_TYPE_TMU
253 };
254 
255 static enum temp_spill_type
256 get_spill_type_for_temp(struct v3d_compile *c, int temp)
257 {
258    if (vir_is_mov_uniform(c, temp))
259       return SPILL_TYPE_UNIFORM;
260 
261    if (can_reconstruct_temp(c, temp))
262       return SPILL_TYPE_RECONSTRUCT;
263 
264    return SPILL_TYPE_TMU;
265 }
266 
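/* Assigns a spill cost to every spillable temp based on the heuristics below
 * and asks the RA core for the best candidate node to spill.
 */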
267 static int
268 v3d_choose_spill_node(struct v3d_compile *c)
269 {
270         assert(c->num_temps > 1);
271 
272         const float tmu_scale = 10;
273         float block_scale = 1.0;
274         float spill_costs[c->num_temps];
275         bool in_tmu_operation = false;
276         bool rtop_hazard = false;
277         bool started_last_seg = false;
278 
279         for (unsigned i = 0; i < c->num_temps; i++)
280                 spill_costs[i] = 0.0;
281 
282         /* XXX: Scale the cost up when inside of a loop. */
283         vir_for_each_block(block, c) {
284                 vir_for_each_inst(inst, block) {
285                         /* RTOP is not preserved across thread switches, so
286                          * we can't spill in the middle of multop + umul24.
287                          */
288                         bool is_multop = false;
289                         bool is_umul24 = false;
290                         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
291                                 if (inst->qpu.alu.mul.op == V3D_QPU_M_MULTOP) {
292                                     is_multop = true;
293                                     rtop_hazard = true;
294                                 } else if (inst->qpu.alu.mul.op == V3D_QPU_M_UMUL24) {
295                                     is_umul24 = true;
296                                 }
297                         }
298 
299                         /* We can't insert new thread switches after
300                          * starting output writes.
301                          */
302                         bool no_spilling =
303                                 (c->threads > 1 && started_last_seg) ||
304                                 (c->max_tmu_spills == 0);
305 
306                         /* Discourage spilling of TMU operations */
307                         for (int i = 0; i < vir_get_nsrc(inst); i++) {
308                                 if (inst->src[i].file != QFILE_TEMP)
309                                         continue;
310 
311                                 int temp = inst->src[i].index;
312                                 enum temp_spill_type spill_type =
313                                         get_spill_type_for_temp(c, temp);
314 
315                                 if (spill_type != SPILL_TYPE_TMU) {
316                                         spill_costs[temp] += block_scale;
317                                 } else if (!no_spilling && (!rtop_hazard || is_multop)) {
318                                         float tmu_op_scale = in_tmu_operation ?
319                                                 3.0 : 1.0;
320                                         spill_costs[temp] += (block_scale *
321                                                               tmu_scale *
322                                                               tmu_op_scale);
323                                 } else {
324                                         BITSET_CLEAR(c->spillable, temp);
325                                 }
326                         }
327 
328                         if (inst->dst.file == QFILE_TEMP) {
329                                 int temp = inst->dst.index;
330                                 enum temp_spill_type spill_type =
331                                         get_spill_type_for_temp(c, temp);
332 
333                                 if (spill_type != SPILL_TYPE_TMU) {
334                                         /* We just rematerialize it later */
335                                 } else if (!no_spilling && (!rtop_hazard || is_umul24)) {
336                                         spill_costs[temp] += (block_scale *
337                                                               tmu_scale);
338                                 } else {
339                                         BITSET_CLEAR(c->spillable, temp);
340                                 }
341                         }
342 
343                         /* Refuse to spill a ldvary's dst, because that means
344                          * that ldvary's r5 would end up being used across a
345                          * thrsw.
346                          */
347                         if (inst->qpu.sig.ldvary) {
348                                 assert(inst->dst.file == QFILE_TEMP);
349                                 BITSET_CLEAR(c->spillable, inst->dst.index);
350                         }
351 
352                         if (inst->is_last_thrsw)
353                                 started_last_seg = true;
354 
355                         /* Track when we're in between a TMU setup and the
356                          * final LDTMU or TMUWT from that TMU setup.  We
357                          * penalize spills during that time.
358                          */
359                         if (is_end_of_tmu_sequence(c->devinfo, inst, block))
360                                 in_tmu_operation = false;
361 
362                         if (qinst_writes_tmu(c->devinfo, inst))
363                                 in_tmu_operation = true;
364 
365                         if (is_umul24)
366                                 rtop_hazard = false;
367                 }
368         }
369 
370         /* We always emit a "last thrsw" to ensure all our spilling occurs
371          * before the last thread section. See vir_emit_last_thrsw.
372          */
373         assert(started_last_seg);
374 
375         for (unsigned i = 0; i < c->num_temps; i++) {
376                 if (BITSET_TEST(c->spillable, i)) {
377                         ra_set_node_spill_cost(c->g, temp_to_node(c, i),
378                                                spill_costs[i]);
379                 }
380         }
381 
382         return ra_get_best_spill_node(c->g);
383 }
384 
385 static void
386 ensure_nodes(struct v3d_compile *c)
387 {
388         if (c->num_temps < c->nodes.alloc_count)
389                 return;
390 
391         c->nodes.alloc_count *= 2;
392         c->nodes.info = reralloc_array_size(c,
393                                             c->nodes.info,
394                                             sizeof(c->nodes.info[0]),
395                                             c->nodes.alloc_count +
396                                             MAX2(ACC_COUNT, IMPLICIT_RF_COUNT));
397 }
398 
399 /* Creates the interference node for a new temp. We use this to keep the node
400  * list updated during the spilling process, which generates new temps/nodes.
401  */
402 static int
403 add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
404 {
405         ensure_nodes(c);
406 
407         int node = ra_add_node(c->g, choose_reg_class(c, class_bits));
408         assert(c->devinfo->has_accumulators ? node == temp + ACC_COUNT :
409                                               node == temp + IMPLICIT_RF_COUNT);
410 
411         /* We fill the node priority after we are done inserting spills */
412         c->nodes.info[node].class_bits = class_bits;
413         c->nodes.info[node].priority = 0;
414         c->nodes.info[node].try_rf0 = false;
415         c->nodes.info[node].is_program_end = false;
416         c->nodes.info[node].unused = false;
417         c->nodes.info[node].payload_conflict = false;
418 
419         return node;
420 }
421 
422 /* The spill offset for this thread takes a bit of setup, so do it once at
423  * program start.
424  */
425 void
426 v3d_setup_spill_base(struct v3d_compile *c)
427 {
428         /* Setting up the spill base is done in the entry block, so change
429          * both the current block to emit and the cursor.
430          */
431         struct qblock *current_block = c->cur_block;
432         c->cur_block = vir_entry_block(c);
433         c->cursor = vir_before_block(c->cur_block);
434 
435         int start_num_temps = c->num_temps;
436 
437         /* Each thread wants to be in a separate region of the scratch space
438          * so that the QPUs aren't fighting over cache lines.  We have the
439          * driver keep a single global spill BO rather than
440          * per-spilling-program BOs, so we need a uniform from the driver for
441          * what the per-thread scale is.
442          */
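        /* i.e. spill_base = TIDX * QUNIFORM_SPILL_SIZE_PER_THREAD +
         *                   EIDX * 4 + QUNIFORM_SPILL_OFFSET
         */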
443         struct qreg thread_offset =
444                 vir_UMUL(c,
445                          vir_TIDX(c),
446                          vir_uniform(c, QUNIFORM_SPILL_SIZE_PER_THREAD, 0));
447 
448         /* Each channel in a reg is 4 bytes, so scale them up by that. */
449         struct qreg element_offset = vir_SHL(c, vir_EIDX(c),
450                                              vir_uniform_ui(c, 2));
451 
452         c->spill_base = vir_ADD(c,
453                                 vir_ADD(c, thread_offset, element_offset),
454                                 vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0));
455 
456         /* Make sure that we don't spill the spilling setup instructions. */
457         for (int i = start_num_temps; i < c->num_temps; i++) {
458                 BITSET_CLEAR(c->spillable, i);
459 
460                 /* If we are spilling, update the RA map with the temps added
461                  * by the spill setup. Our spill_base register can never be an
462                  * accumulator because it is used for TMU spill/fill and thus
463                  * needs to persist across thread switches.
464                  */
465                 if (c->spilling) {
466                         int temp_class = CLASS_BITS_PHYS;
467                         if (c->devinfo->has_accumulators &&
468                             i != c->spill_base.index) {
469                                 temp_class |= CLASS_BITS_ACC;
470                         }
471                         int node = add_node(c, i, temp_class);
472                         c->nodes.info[node].payload_conflict =
473                                 stage_has_payload(c);
474                 }
475         }
476 
477         /* Restore the current block. */
478         c->cur_block = current_block;
479         c->cursor = vir_after_block(c->cur_block);
480 }
481 
482 /**
483  * Computes the address for a spill/fill sequence and completes the spill/fill
484  * sequence by emitting the following code:
485  *
486  * ldunif.spill_offset
487  * add tmua spill_base spill_offset
488  * thrsw
489  *
490  * If the sequence is for a spill, then it will emit a tmuwt after the thrsw,
491  * otherwise it will emit an ldtmu to load the fill result into 'fill_dst'.
492  *
493  * The parameter 'ip' represents the ip at which the spill/fill is happening.
494  * This is used to disallow accumulators on temps that cross this ip boundary
495  * due to the new thrsw introduced in the sequence above.
496  */
497 static void
498 v3d_emit_spill_tmua(struct v3d_compile *c,
499                     uint32_t spill_offset,
500                     enum v3d_qpu_cond cond,
501                     int32_t ip,
502                     struct qreg *fill_dst)
503 {
504         assert(ip >= 0);
505 
506         /* Load a uniform with the spill offset and add it to the spill base
507          * to obtain the TMUA address. It can be of class ANY because we know
508          * we are consuming it immediately without thrsw in between.
509          */
510         assert(c->disable_ldunif_opt);
511         struct qreg offset = vir_uniform_ui(c, spill_offset);
512         add_node(c, offset.index, get_class_bit_any(c->devinfo));
513 
514         /* We always enable per-quad on spills/fills to ensure we spill
515          * any channels involved with helper invocations, but only if
516          * the spill is not conditional, since otherwise we may be spilling
517          * invalid lanes and overwriting valid data from a previous spill
518          * to the same address.
519          */
520         struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
521         struct qinst *inst = vir_ADD_dest(c, tmua, c->spill_base, offset);
522         inst->qpu.flags.ac = cond;
523         inst->ldtmu_count = 1;
524         inst->uniform =
525                 vir_get_uniform_index(c, QUNIFORM_CONSTANT,
526                                       cond != V3D_QPU_COND_NONE ?
527                                               0xffffffff : 0xffffff7f /* per-quad*/);
528 
529         vir_emit_thrsw(c);
530 
531         /* If this is for a spill, emit a TMUWT otherwise a LDTMU to load the
532          * result of the fill. The TMUWT temp is not really read, the ldtmu
533          * temp will be used immediately so just like the uniform above we
534          * can allow accumulators.
535          */
536         int temp_class =
537                 filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC);
538         if (!fill_dst) {
539                 struct qreg dst = vir_TMUWT(c);
540                 assert(dst.file == QFILE_TEMP);
541                 add_node(c, dst.index, temp_class);
542         } else {
543                 *fill_dst = vir_LDTMU(c);
544                 assert(fill_dst->file == QFILE_TEMP);
545                 add_node(c, fill_dst->index, temp_class);
546         }
547 
548         /* Temps across the thread switch we injected can't be assigned to
549          * accumulators.
550          *
551          * Fills inject code before ip, so anything that starts at ip or later
552          * is not affected by the thrsw. Something that ends at ip will be
553          * affected though.
554          *
555          * Spills inject code after ip, so anything that starts strictly later
556          * than ip is not affected (the temp starting at ip is usually the
557          * spilled temp except for postponed spills). Something that ends at ip
558          * won't be affected either.
559          */
560         for (int i = 0; i < c->spill_start_num_temps; i++) {
561                 bool thrsw_cross = fill_dst ?
562                         c->temp_start[i] < ip && c->temp_end[i] >= ip :
563                         c->temp_start[i] <= ip && c->temp_end[i] > ip;
564                 if (thrsw_cross) {
565                         ra_set_node_class(c->g, temp_to_node(c, i),
566                                           choose_reg_class(c, CLASS_BITS_PHYS));
567                 }
568         }
569 }
570 
571 static void
572 v3d_emit_tmu_spill(struct v3d_compile *c,
573                    struct qinst *inst,
574                    struct qreg spill_temp,
575                    struct qinst *position,
576                    uint32_t ip,
577                    uint32_t spill_offset)
578 {
579         assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
580         assert(inst->dst.file == QFILE_TEMP);
581 
582         c->cursor = vir_after_inst(position);
583 
584         enum v3d_qpu_cond cond = vir_get_cond(inst);
585 
586         /* If inst and position don't match, this is a postponed spill,
587          * in which case we have already allocated the temp for the spill
588          * and we should use that, otherwise create a new temp with the
589          * same register class bits as the original.
590          */
591         if (inst == position) {
592                 uint8_t class_bits = get_temp_class_bits(c, inst->dst.index);
593                 inst->dst = vir_get_temp(c);
594                 add_node(c, inst->dst.index, class_bits);
595         } else {
596                 inst->dst = spill_temp;
597 
598                 /* If this is a postponed spill the register being spilled may
599                  * have been written more than once including conditional
600                  * writes, so ignore predication on the spill instruction and
601                  * always spill the full register.
602                  */
603                 cond = V3D_QPU_COND_NONE;
604         }
605 
606         struct qinst *tmp =
607                 vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
608                              inst->dst);
609         tmp->qpu.flags.mc = cond;
610 
611         v3d_emit_spill_tmua(c, spill_offset, cond, ip, NULL);
612 
613         c->spills++;
614         c->tmu_dirty_rcl = true;
615 }
616 
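/* Two live ranges interfere unless one ends at or before the other starts
 * (ranges are treated as half-open [start, end) intervals).
 */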
617 static inline bool
618 interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end)
619 {
620         return !(t0_start >= t1_end || t1_start >= t0_end);
621 }
622 
623 static void
624 v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int *implicit_rf_nodes,
625               int spill_temp)
626 {
627         c->spill_start_num_temps = c->num_temps;
628         c->spilling = true;
629 
630         enum temp_spill_type spill_type = get_spill_type_for_temp(c, spill_temp);
631 
632         uint32_t spill_offset = 0;
633         if (spill_type == SPILL_TYPE_TMU) {
634                 spill_offset = c->spill_size;
635                 c->spill_size += V3D_CHANNELS * sizeof(uint32_t);
636 
637                 if (spill_offset == 0) {
638                         v3d_setup_spill_base(c);
639 
640                         /* Don't allocate our spill base to rf0 to avoid
641                          * conflicts with instructions doing implicit writes
642                          * to that register.
643                          */
644                         if (!c->devinfo->has_accumulators) {
645                                 ra_add_node_interference(
646                                         c->g,
647                                         temp_to_node(c, c->spill_base.index),
648                                         implicit_rf_nodes[0]);
649                         }
650                 }
651         }
652 
653         struct qinst *last_thrsw = c->last_thrsw;
654         assert(last_thrsw && last_thrsw->is_last_thrsw);
655 
656         int uniform_index = ~0;
657         if (spill_type == SPILL_TYPE_UNIFORM) {
658                 struct qinst *orig_unif = c->defs[spill_temp];
659                 uniform_index = orig_unif->uniform;
660         }
661 
662         enum v3d_qpu_add_op reconstruct_op = V3D_QPU_A_NOP;
663         if (spill_type == SPILL_TYPE_RECONSTRUCT) {
664                 struct qinst *orig_def = c->defs[spill_temp];
665                 assert(vir_is_add(orig_def));
666                 reconstruct_op = orig_def->qpu.alu.add.op;
667         }
668 
669         uint32_t spill_node = temp_to_node(c, spill_temp);
670 
671         /* We must disable the ldunif optimization if we are spilling uniforms */
672         bool had_disable_ldunif_opt = c->disable_ldunif_opt;
673         c->disable_ldunif_opt = true;
674 
675         struct qinst *start_of_tmu_sequence = NULL;
676         struct qinst *postponed_spill = NULL;
677         struct qreg postponed_spill_temp = { 0 };
678         vir_for_each_block(block, c) {
679                 vir_for_each_inst_safe(inst, block) {
680                         int32_t ip = inst->ip;
681 
682                         /* Track when we're in between a TMU setup and the final
683                          * LDTMU or TMUWT from that TMU setup. We can't spill/fill any
684                          * temps during that time, because that involves inserting a
685                          * new TMU setup/LDTMU sequence, so we postpone the spill or
686                          * move the fill up to not intrude in the middle of the TMU
687                          * sequence.
688                          */
689                         if (is_end_of_tmu_sequence(c->devinfo, inst, block)) {
690                                 if (postponed_spill) {
691                                         v3d_emit_tmu_spill(c, postponed_spill,
692                                                            postponed_spill_temp,
693                                                            inst, ip, spill_offset);
694                                 }
695 
696                                 start_of_tmu_sequence = NULL;
697                                 postponed_spill = NULL;
698                         }
699 
700                         if (!start_of_tmu_sequence &&
701                             qinst_writes_tmu(c->devinfo, inst)) {
702                                 start_of_tmu_sequence = inst;
703                         }
704 
705                         /* fills */
706                         int filled_src = -1;
707                         for (int i = 0; i < vir_get_nsrc(inst); i++) {
708                                 if (inst->src[i].file != QFILE_TEMP ||
709                                     inst->src[i].index != spill_temp) {
710                                         continue;
711                                 }
712 
713                                 if (filled_src >= 0) {
714                                         inst->src[i] = inst->src[filled_src];
715                                         continue;
716                                 }
717 
718                                 c->cursor = vir_before_inst(inst);
719 
720                                 if (spill_type == SPILL_TYPE_UNIFORM) {
721                                         struct qreg unif =
722                                                 vir_uniform(c,
723                                                             c->uniform_contents[uniform_index],
724                                                             c->uniform_data[uniform_index]);
725                                         inst->src[i] = unif;
726                                         /* We are using the uniform in the
727                                          * instruction immediately after, so
728                                          * we can use any register class for it.
729                                          */
730                                         add_node(c, unif.index,
731                                                  get_class_bit_any(c->devinfo));
732                                 } else if (spill_type == SPILL_TYPE_RECONSTRUCT) {
733                                         struct qreg temp =
734                                                 reconstruct_temp(c, reconstruct_op);
735                                         inst->src[i] = temp;
736                                         /* We are using the temp in the
737                                          * instruction immediately after so we
738                                          * can use ACC.
739                                          */
740                                         int temp_class =
741                                                 filter_class_bits(c->devinfo, CLASS_BITS_PHYS |
742                                                                               CLASS_BITS_ACC);
743                                         add_node(c, temp.index, temp_class);
744                                 } else {
745                                         /* If we have a postponed spill, we
746                                          * don't need a fill as the temp would
747                                          * not have been spilled yet, however,
748                                          * we need to update the temp index.
749                                          */
750                                         if (postponed_spill) {
751                                                 inst->src[i] =
752                                                         postponed_spill_temp;
753                                         } else {
754                                                 int32_t fill_ip = ip;
755                                                 if (start_of_tmu_sequence) {
756                                                         c->cursor = vir_before_inst(start_of_tmu_sequence);
757                                                         fill_ip = start_of_tmu_sequence->ip;
758                                                 }
759 
760                                                 v3d_emit_spill_tmua(c,  spill_offset,
761                                                                     V3D_QPU_COND_NONE,
762                                                                     fill_ip, &inst->src[i]);
763                                                 c->fills++;
764                                         }
765                                 }
766 
767                                 filled_src = i;
768                         }
769 
770                         /* spills */
771                         if (inst->dst.file == QFILE_TEMP &&
772                             inst->dst.index == spill_temp) {
773                                 if (spill_type != SPILL_TYPE_TMU) {
774                                         c->cursor.link = NULL;
775                                         vir_remove_instruction(c, inst);
776                                 } else {
777                                         /* If we are in the middle of a TMU
778                                          * sequence, we postpone the actual
779                                          * spill until we have finished it. We
780                                          * still need to replace the spill temp
781                                          * with a new temp though.
782                                          */
783                                         if (start_of_tmu_sequence) {
784                                                 if (postponed_spill) {
785                                                         postponed_spill->dst =
786                                                                 postponed_spill_temp;
787                                                 }
788                                                 if (!postponed_spill ||
789                                                     vir_get_cond(inst) == V3D_QPU_COND_NONE) {
790                                                         postponed_spill_temp =
791                                                                 vir_get_temp(c);
792                                                         add_node(c,
793                                                                  postponed_spill_temp.index,
794                                                                  c->nodes.info[spill_node].class_bits);
795                                                 }
796                                                 postponed_spill = inst;
797                                         } else {
798                                                 v3d_emit_tmu_spill(c, inst,
799                                                                    postponed_spill_temp,
800                                                                    inst, ip,
801                                                                    spill_offset);
802                                         }
803                                 }
804                         }
805                 }
806         }
807 
808         /* Make sure c->last_thrsw is the actual last thrsw, not just one we
809          * inserted in our most recent unspill.
810          */
811         c->last_thrsw = last_thrsw;
812 
813         /* Don't allow spilling of our spilling instructions.  There's no way
814          * they can help get things colored.
815          */
816         for (int i = c->spill_start_num_temps; i < c->num_temps; i++)
817                 BITSET_CLEAR(c->spillable, i);
818 
819         /* Reset interference for spilled node */
820         ra_set_node_spill_cost(c->g, spill_node, 0);
821         ra_reset_node_interference(c->g, spill_node);
822         BITSET_CLEAR(c->spillable, spill_temp);
823 
824         /* Rebuild program ips */
825         int32_t ip = 0;
826         vir_for_each_inst_inorder(inst, c)
827                 inst->ip = ip++;
828 
829         /* Rebuild liveness */
830         vir_calculate_live_intervals(c);
831 
832         /* Add interferences for the new spilled temps and update interferences
833          * for c->spill_base (since we may have modified its liveness). Also,
834          * update node priorities based on new liveness data.
835          */
836         uint32_t sb_temp = c->spill_base.index;
837         uint32_t sb_node = temp_to_node(c, sb_temp);
838         for (uint32_t i = 0; i < c->num_temps; i++) {
839                 if (c->temp_end[i] == -1)
840                         continue;
841 
842                 uint32_t node_i = temp_to_node(c, i);
843                 c->nodes.info[node_i].priority =
844                         c->temp_end[i] - c->temp_start[i];
845 
846                 for (uint32_t j = MAX2(i + 1, c->spill_start_num_temps);
847                      j < c->num_temps; j++) {
848                         if (interferes(c->temp_start[i], c->temp_end[i],
849                                        c->temp_start[j], c->temp_end[j])) {
850                                 uint32_t node_j = temp_to_node(c, j);
851                                 ra_add_node_interference(c->g, node_i, node_j);
852                         }
853                 }
854 
855                 if (spill_type == SPILL_TYPE_TMU) {
856                         if (i != sb_temp &&
857                             interferes(c->temp_start[i], c->temp_end[i],
858                                        c->temp_start[sb_temp], c->temp_end[sb_temp])) {
859                                 ra_add_node_interference(c->g, node_i, sb_node);
860                         }
861                 }
862         }
863 
864         c->disable_ldunif_opt = had_disable_ldunif_opt;
865         c->spilling = false;
866 }
867 
868 struct v3d_ra_select_callback_data {
869         uint32_t phys_index;
870         uint32_t next_acc;
871         uint32_t next_phys;
872         struct v3d_ra_node_info *nodes;
873         const struct v3d_device_info *devinfo;
874 };
875 
876 /* Choosing accumulators improves the chances of merging QPU instructions,
877  * since a merge requires that the add and mul instructions use at most
878  * 2 rf registers between them.
879  */
880 static bool
881 v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
882                    BITSET_WORD *regs,
883                    int priority)
884 {
885         if (!v3d_ra->devinfo->has_accumulators)
886                 return false;
887 
888         /* Favor accumulators if we have fewer than this number of physical
889          * registers. Accumulators have more restrictions (like being
890          * invalidated through thrsw), so running out of physical registers
891          * even if we have accumulators available can lead to register
892          * allocation failures.
893          */
894         static const int available_rf_threshold = 5;
895         int available_rf = 0;
896         for (int i = 0; i < PHYS_COUNT; i++) {
897                 if (BITSET_TEST(regs, v3d_ra->phys_index + i))
898                         available_rf++;
899                 if (available_rf >= available_rf_threshold)
900                         break;
901         }
902         if (available_rf < available_rf_threshold)
903                 return true;
904 
905         /* Favor accumulators for short-lived temps (our priority represents
906          * liveness), to prevent long-lived temps from grabbing accumulators
907          * and preventing follow-up instructions from using them, potentially
908          * leading to large portions of the shader being unable to use
909          * accumulators and therefore merge instructions successfully.
910          */
911         static const int priority_threshold = 20;
912         if (priority <= priority_threshold)
913                 return true;
914 
915         return false;
916 }
917 
918 static bool
919 v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
920                     BITSET_WORD *regs,
921                     unsigned int *out)
922 {
923         if (!v3d_ra->devinfo->has_accumulators)
924                 return false;
925 
926         /* Choose r5 for our ldunifs if possible (nobody else can load to that
927          * reg, and it keeps the QPU cond field free from being occupied by
928          * ldunifrf).
929          */
930         int r5 = ACC_INDEX + 5;
931         if (BITSET_TEST(regs, r5)) {
932                 *out = r5;
933                 return true;
934         }
935 
936         /* Round-robin through our accumulators to give post-RA instruction
937          * selection more options.
938          */
939         for (int i = 0; i < ACC_COUNT; i++) {
940                 int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT;
941                 int acc = ACC_INDEX + acc_off;
942 
943                 if (BITSET_TEST(regs, acc)) {
944                         v3d_ra->next_acc = acc_off + 1;
945                         *out = acc;
946                         return true;
947                 }
948         }
949 
950         return false;
951 }
952 
953 static bool
954 v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
955                  unsigned int node,
956                  BITSET_WORD *regs,
957                  unsigned int *out)
958 {
959         /* If this node is for an unused temp, ignore. */
960         if (v3d_ra->nodes->info[node].unused) {
961                 *out = 0;
962                 return true;
963         }
964 
965         /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst
966          * so we can avoid turning them into ldunifrf (which uses the
967          * cond field to encode the dst and would prevent merge with
968          * instructions that use cond flags).
969          */
970         if (v3d_ra->nodes->info[node].try_rf0 &&
971             BITSET_TEST(regs, v3d_ra->phys_index)) {
972                 assert(v3d_ra->devinfo->ver >= 71);
973                 *out = v3d_ra->phys_index;
974                 return true;
975         }
976 
977         /* The last 3 instructions in a shader can't use some specific registers
978          * (usually early rf registers, depending on the V3D version), so try
979          * to avoid allocating those registers to temps used by the last
980          * instructions in the shader. Do the same for spill setup instructions
981          * that may conflict with payload registers.
982          */
983         const uint32_t safe_rf_start = v3d_ra->devinfo->ver == 42 ? 3 : 4;
984         if ((v3d_ra->nodes->info[node].is_program_end ||
985              v3d_ra->nodes->info[node].payload_conflict) &&
986             v3d_ra->next_phys < safe_rf_start) {
987                 v3d_ra->next_phys = safe_rf_start;
988         }
989 
990         for (int i = 0; i < PHYS_COUNT; i++) {
991                 int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
992 
993                 /* Try to keep rf0 available for ldunif in 7.x (see above). */
994                 if (v3d_ra->devinfo->ver >= 71 && phys_off == 0)
995                         continue;
996 
997                 int phys = v3d_ra->phys_index + phys_off;
998 
999                 if (BITSET_TEST(regs, phys)) {
1000                         v3d_ra->next_phys = phys_off + 1;
1001                         *out = phys;
1002                         return true;
1003                 }
1004         }
1005 
1006         /* If we couldn't allocate, do try to assign rf0 if it is available. */
1007         if (v3d_ra->devinfo->ver >= 71 &&
1008             BITSET_TEST(regs, v3d_ra->phys_index)) {
1009                 v3d_ra->next_phys = 1;
1010                 *out = v3d_ra->phys_index;
1011                 return true;
1012         }
1013 
1014         return false;
1015 }
1016 
1017 static unsigned int
1018 v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
1019 {
1020         struct v3d_ra_select_callback_data *v3d_ra = data;
1021 
1022         unsigned int reg;
1023         if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->nodes->info[n].priority) &&
1024             v3d_ra_select_accum(v3d_ra, regs, &reg)) {
1025                 return reg;
1026         }
1027 
1028         if (v3d_ra_select_rf(v3d_ra, n, regs, &reg))
1029                 return reg;
1030 
1031         /* If we ran out of physical registers try to assign an accumulator
1032          * if we didn't favor that option earlier.
1033          */
1034         if (v3d_ra_select_accum(v3d_ra, regs, &reg))
1035                 return reg;
1036 
1037         unreachable("RA must pass us at least one possible reg.");
1038 }
1039 
1040 bool
1041 vir_init_reg_sets(struct v3d_compiler *compiler)
1042 {
1043         /* Allocate up to 3 regfile classes, for the ways the physical
1044          * register file can be divided up for fragment shader threading.
1045          */
1046         int max_thread_index = 2;
1047         uint8_t phys_index = get_phys_index(compiler->devinfo);
1048 
1049         compiler->regs = ra_alloc_reg_set(compiler, phys_index + PHYS_COUNT,
1050                                           false);
1051         if (!compiler->regs)
1052                 return false;
1053 
1054         for (int threads = 0; threads < max_thread_index; threads++) {
1055                 compiler->reg_class_any[threads] =
1056                         ra_alloc_contig_reg_class(compiler->regs, 1);
1057                 if (compiler->devinfo->has_accumulators) {
1058                         compiler->reg_class_r5[threads] =
1059                                 ra_alloc_contig_reg_class(compiler->regs, 1);
1060                         compiler->reg_class_phys_or_acc[threads] =
1061                                 ra_alloc_contig_reg_class(compiler->regs, 1);
1062                 }
1063                 compiler->reg_class_phys[threads] =
1064                         ra_alloc_contig_reg_class(compiler->regs, 1);
1065 
1066                 /* Init physical regs */
1067                 for (int i = phys_index;
1068                      i < phys_index + (PHYS_COUNT >> threads); i++) {
1069                         if (compiler->devinfo->has_accumulators)
1070                                 ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
1071                         ra_class_add_reg(compiler->reg_class_phys[threads], i);
1072                         ra_class_add_reg(compiler->reg_class_any[threads], i);
1073                 }
1074 
1075                 /* Init accumulator regs */
1076                 if (compiler->devinfo->has_accumulators) {
1077                         for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
1078                                 ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
1079                                 ra_class_add_reg(compiler->reg_class_any[threads], i);
1080                         }
1081                         /* r5 can only store a single 32-bit value, so not much can
1082                          * use it.
1083                          */
1084                         ra_class_add_reg(compiler->reg_class_r5[threads],
1085                                          ACC_INDEX + 5);
1086                         ra_class_add_reg(compiler->reg_class_any[threads],
1087                                          ACC_INDEX + 5);
1088                 }
1089         }
1090 
1091         ra_set_finalize(compiler->regs, NULL);
1092 
1093         return true;
1094 }
1095 
1096 static inline bool
1097 tmu_spilling_allowed(struct v3d_compile *c)
1098 {
1099         return c->spills + c->fills < c->max_tmu_spills;
1100 }
1101 
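/* Fixed rf registers that the hardware uses to pass in the stage payload at
 * program start; which registers these are depends on the stage and the V3D
 * version.
 */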
1102 static bool
1103 reg_is_payload(struct v3d_compile *c, struct qreg reg)
1104 {
1105    if (reg.file != QFILE_REG)
1106       return false;
1107 
1108    if (c->devinfo->ver >= 71) {
1109            if (c->s->info.stage == MESA_SHADER_FRAGMENT)
1110                    return reg.index >= 1 && reg.index <= 3;
1111            if (c->s->info.stage == MESA_SHADER_COMPUTE)
1112                    return reg.index == 2 || reg.index == 3;
1113            return false;
1114    }
1115 
1116    assert(c->devinfo->ver == 42);
1117    if (c->s->info.stage == MESA_SHADER_FRAGMENT)
1118            return reg.index <= 2;
1119    if (c->s->info.stage == MESA_SHADER_COMPUTE)
1120            return reg.index == 0 || reg.index == 2;
1121    return false;
1122 }
1123 
1124 static bool
1125 inst_reads_payload(struct v3d_compile *c, struct qinst *inst)
1126 {
1127         if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
1128                 return false;
1129 
1130         if (reg_is_payload(c, inst->dst))
1131                 return true;
1132 
1133         if (reg_is_payload(c, inst->src[0]))
1134                 return true;
1135 
1136         if (vir_get_nsrc(inst) > 1 && reg_is_payload(c, inst->src[1]))
1137                 return true;
1138 
1139         return false;
1140 }
1141 
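/* Walks a single instruction, adding interferences against the fixed
 * accumulator / implicit-rf nodes and restricting register classes based on
 * what the instruction reads and writes.
 */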
1142 static void
1143 update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
1144                                       int *acc_nodes,
1145                                       int *implicit_rf_nodes,
1146                                       int last_ldvary_ip,
1147                                       bool has_payload,
1148                                       struct qinst *inst)
1149 {
1150         int32_t ip = inst->ip;
1151         assert(ip >= 0);
1152 
1153         /* If the instruction writes r4 (and optionally moves its
1154          * result to a temp), nothing else can be stored in r4 across
1155          * it.
1156          */
1157         if (vir_writes_r4_implicitly(c->devinfo, inst)) {
1158                 for (int i = 0; i < c->num_temps; i++) {
1159                         if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
1160                                 ra_add_node_interference(c->g,
1161                                                          temp_to_node(c, i),
1162                                                          acc_nodes[4]);
1163                         }
1164                 }
1165         }
1166 
1167         /* If any instruction writes to a physical register implicitly
1168          * nothing else can write the same register across it.
1169          */
1170         if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) {
1171                 for (int i = 0; i < c->num_temps; i++) {
1172                         if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
1173                                 ra_add_node_interference(c->g,
1174                                                          temp_to_node(c, i),
1175                                                          implicit_rf_nodes[0]);
1176                         }
1177                 }
1178         }
1179 
1180         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
1181                 switch (inst->qpu.alu.add.op) {
1182                 case V3D_QPU_A_LDVPMV_IN:
1183                 case V3D_QPU_A_LDVPMV_OUT:
1184                 case V3D_QPU_A_LDVPMD_IN:
1185                 case V3D_QPU_A_LDVPMD_OUT:
1186                 case V3D_QPU_A_LDVPMP:
1187                 case V3D_QPU_A_LDVPMG_IN:
1188                 case V3D_QPU_A_LDVPMG_OUT: {
1189                         /* LDVPMs only store to temps (the MA flag
1190                          * decides whether the LDVPM is in or out)
1191                          */
1192                         assert(inst->dst.file == QFILE_TEMP);
1193                         set_temp_class_bits(c, inst->dst.index,
1194                                             CLASS_BITS_PHYS);
1195                         break;
1196                 }
1197 
1198                 case V3D_QPU_A_RECIP:
1199                 case V3D_QPU_A_RSQRT:
1200                 case V3D_QPU_A_EXP:
1201                 case V3D_QPU_A_LOG:
1202                 case V3D_QPU_A_SIN:
1203                 case V3D_QPU_A_RSQRT2: {
1204                         /* The SFU instructions write directly to the
1205                          * phys regfile.
1206                          */
1207                         assert(inst->dst.file == QFILE_TEMP);
1208                         set_temp_class_bits(c, inst->dst.index,
1209                                             CLASS_BITS_PHYS);
1210                         break;
1211                 }
1212 
1213                 default:
1214                         break;
1215                 }
1216         }
1217 
1218         if (inst->src[0].file == QFILE_REG) {
1219                 switch (inst->src[0].index) {
1220                 case 0:
1221                         /* V3D 7.x doesn't use rf0 for thread payload */
1222                         if (c->devinfo->ver >= 71)
1223                                 break;
1224                         else
1225                                 FALLTHROUGH;
1226                 case 1:
1227                 case 2:
1228                 case 3: {
1229                         /* Payload setup instructions: Force allocate
1230                          * the dst to the given register (so the MOV
1231                          * will disappear).
1232                          */
1233                         assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
1234                         assert(inst->dst.file == QFILE_TEMP);
1235                         uint32_t node = temp_to_node(c, inst->dst.index);
1236                         ra_set_node_reg(c->g, node,
1237                                         get_phys_index(c->devinfo) +
1238                                         inst->src[0].index);
1239                         break;
1240                 }
1241                 }
1242         }
1243 
1244         /* Don't allocate rf0 to temps that cross ranges where we have
1245          * live implicit rf0 writes from ldvary. We can identify these
1246          * by tracking the last ldvary instruction and explicit reads
1247          * of rf0.
1248          */
1249         if (c->devinfo->ver >= 71 &&
1250             ((inst->src[0].file == QFILE_REG && inst->src[0].index == 0) ||
1251               (vir_get_nsrc(inst) > 1 &&
1252                inst->src[1].file == QFILE_REG && inst->src[1].index == 0))) {
1253                 for (int i = 0; i < c->num_temps; i++) {
1254                         if (c->temp_start[i] < ip &&
1255                             c->temp_end[i] > last_ldvary_ip) {
1256                                 ra_add_node_interference(c->g,
1257                                                          temp_to_node(c, i),
1258                                                          implicit_rf_nodes[0]);
1259                         }
1260                 }
1261         }
1262 
1263         /* Spill setup instructions are the only ones we emit before
1264          * reading payload registers, so flag their temps to keep them from
1265          * being assigned to payload registers and stomped before we can
1266          * read them. For the case where we may have emitted spill setup
1267          * before RA (i.e. for scratch), we need to do this now.
1268          */
1269         if (c->spill_size > 0 && has_payload && inst_reads_payload(c, inst)) {
1270                 struct qblock *first_block = vir_entry_block(c);
1271                 list_for_each_entry_from_rev(struct qinst, _i, inst->link.prev,
1272                                              &first_block->instructions, link) {
1273                         if (_i->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
1274                                 continue;
1275                         if (_i->dst.file == QFILE_TEMP) {
1276                                 int node = temp_to_node(c, _i->dst.index);
1277                                 c->nodes.info[node].payload_conflict = true;
1278                         }
1279                         if (_i->src[0].file == QFILE_TEMP) {
1280                                 int node = temp_to_node(c, _i->src[0].index);
1281                                 c->nodes.info[node].payload_conflict = true;
1282                         }
1283                         if (vir_get_nsrc(_i) > 1 && _i->src[1].file == QFILE_TEMP) {
1284                                 int node = temp_to_node(c, _i->src[1].index);
1285                                 c->nodes.info[node].payload_conflict = true;
1286                         }
1287                 }
1288         }
1289 
1290         if (inst->dst.file == QFILE_TEMP) {
1291                 /* Only a ldunif gets to write to R5, which only has a single
1292                  * 32-bit channel of storage. Disallow R5 when we are around
1293                  * ldvary sequences, since ldvary writes that register too and
1294                  * that would prevent pairing.
1295                  *
1296                  * NOTE: ldunifa is subject to the same restriction; however,
1297                  * going by shader-db it is best to keep r5 exclusive to ldunif,
1298                  * probably because ldunif usually has a shorter lifespan,
1299                  * allowing for more accumulator reuse and QPU merges.
1300                  */
1301                 if (c->devinfo->has_accumulators) {
1302                         if (!inst->qpu.sig.ldunif ||
1303                             (c->s->info.stage == MESA_SHADER_FRAGMENT &&
1304                              ip <= last_ldvary_ip + 4)) {
1305                                 uint8_t class_bits =
1306                                         get_temp_class_bits(c, inst->dst.index) &
1307                                         ~CLASS_BITS_R5;
1308                                 set_temp_class_bits(c, inst->dst.index,
1309                                                     class_bits);
1310 
1311                         }
1312                 } else {
1313                         /* Make sure we don't allocate the ldvary's
1314                          * destination to rf0, since it would clash
1315                          * with its implicit write to that register.
1316                          */
1317                         if (inst->qpu.sig.ldvary) {
1318                                 ra_add_node_interference(c->g,
1319                                                          temp_to_node(c, inst->dst.index),
1320                                                          implicit_rf_nodes[0]);
1321                         }
1322                         /* Flag dst temps from ldunif(a) instructions so we can
1323                          * try to assign rf0 to them and avoid converting these
1324                          * to ldunif(a)rf. However, we don't want to do this
1325                          * when these instructions are close to an ldvary, since
1326                          * ldvary writes rf0 implicitly and that would hurt
1327                          * pairing.
1328                          */
1329                         if ((inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) &&
1330                             (c->s->info.stage != MESA_SHADER_FRAGMENT ||
1331                              ip > last_ldvary_ip + 4)) {
1332                                 const uint32_t dst_n =
1333                                         temp_to_node(c, inst->dst.index);
1334                                 c->nodes.info[dst_n].try_rf0 = true;
1335                         }
1336                 }
1337         }
1338 
1339         /* All accumulators are invalidated across a thread switch. */
1340         if (inst->qpu.sig.thrsw && c->devinfo->has_accumulators) {
1341                 for (int i = 0; i < c->num_temps; i++) {
1342                         if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
1343                                 set_temp_class_bits(c, i,
1344                                                     CLASS_BITS_PHYS);
1345                         }
1346                 }
1347         }
1348 }
1349 
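     /* Walk the trailing ALU instructions of the exit block and flag every
      * temp they read or write as is_program_end. As the comment at the call
      * site below notes, some registers cannot be used in the last
      * instructions of the program, and this flag marks the temps that need
      * to stay away from them.
      */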
1350 static void
1351 flag_program_end_nodes(struct v3d_compile *c)
1352 {
1353         /* Only look at temps referenced in this many trailing instructions */
1354         uint32_t last_set_count = 6;
1355 
1356         struct qblock *last_block = vir_exit_block(c);
1357         list_for_each_entry_rev(struct qinst, inst, &last_block->instructions, link) {
1358                 if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
1359                         continue;
1360 
1361                 int num_src = v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
1362                 for (int i = 0; i < num_src; i++) {
1363                         if (inst->src[i].file == QFILE_TEMP) {
1364                                 int node = temp_to_node(c, inst->src[i].index);
1365                                 c->nodes.info[node].is_program_end = true;
1366                         }
1367                 }
1368 
1369                 num_src = v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
1370                 for (int i = 0; i < num_src; i++) {
1371                         if (inst->src[i].file == QFILE_TEMP) {
1372                                 int node = temp_to_node(c, inst->src[i].index);
1373                                 c->nodes.info[node].is_program_end = true;
1374 
1375                         }
1376                 }
1377 
1378                 if (inst->dst.file == QFILE_TEMP) {
1379                         int node = temp_to_node(c, inst->dst.index);
1380                         c->nodes.info[node].is_program_end = true;
1381                 }
1382 
1383                 if (--last_set_count == 0)
1384                         break;
1385         }
1386 }
1387 
1388 /**
1389  * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
1390  *
1391  * The return value should be freed by the caller.
1392  */
1393 struct qpu_reg *
1394 v3d_register_allocate(struct v3d_compile *c)
1395 {
1396         int acc_nodes[ACC_COUNT];
1397         int implicit_rf_nodes[IMPLICIT_RF_COUNT];
1398 
1399         unsigned num_ra_nodes = c->num_temps;
1400         if (c->devinfo->has_accumulators)
1401                 num_ra_nodes += ARRAY_SIZE(acc_nodes);
1402         else
1403                 num_ra_nodes += ARRAY_SIZE(implicit_rf_nodes);
1404 
1405         c->nodes = (struct v3d_ra_node_info) {
1406                 .alloc_count = c->num_temps,
1407                 .info = ralloc_array_size(c, sizeof(c->nodes.info[0]),
1408                                           num_ra_nodes),
1409         };
1410 
1411         uint32_t phys_index = get_phys_index(c->devinfo);
1412 
1413         struct v3d_ra_select_callback_data callback_data = {
1414                 .phys_index = phys_index,
1415                 .next_acc = 0,
1416                 /* Start at RF3, to try to keep the TLB writes from using
1417                  * RF0-2. Start at RF4 in 7.x to prevent TLB writes from
1418                  * using RF2-3.
1419                  */
1420                 .next_phys = c->devinfo->ver == 42 ? 3 : 4,
1421                 .nodes = &c->nodes,
1422                 .devinfo = c->devinfo,
1423         };
1424 
1425         vir_calculate_live_intervals(c);
1426 
1427         /* Convert 1, 2, 4 threads to 0, 1, 2 index.
1428          *
1429          * V3D 4.x has double the physical register space, so 64 physical regs
1430          * are available at both 1x and 2x threading, and 4x has 32.
1431          */
1432         c->thread_index = ffs(c->threads) - 1;
1433         if (c->thread_index >= 1)
1434                 c->thread_index--;
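             /* Net effect: threads == 1 and threads == 2 both map to
              * thread_index 0 (the full 64-register file is available), while
              * threads == 4 maps to thread_index 1 (32 registers per thread),
              * matching the register budgets described above.
              */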
1435 
1436         c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes);
1437         ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data);
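             /* The selection callback is seeded with the starting accumulator
              * and phys indices from callback_data above and also gets access
              * to c->nodes, so it can take the per-node flags filled in below
              * (payload_conflict, try_rf0, is_program_end) into account when
              * picking a register for each node.
              */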
1438 
1439         /* Make some fixed nodes for the accumulators, which we will need to
1440          * interfere with when ops have implied r3/r4 writes or for the thread
1441          * switches.  We could represent these as classes for the nodes to
1442          * live in, but the classes take up a lot of memory to set up, so we
1443          * don't want to make too many. On platforms without accumulators, the
1444          * same mechanism covers phys regs that can receive implicit writes.
1445          */
1446         for (uint32_t i = 0; i < num_ra_nodes; i++) {
1447                 c->nodes.info[i].try_rf0 = false;
1448                 c->nodes.info[i].is_program_end = false;
1449                 c->nodes.info[i].unused = false;
1450                 c->nodes.info[i].priority = 0;
1451                 c->nodes.info[i].class_bits = 0;
1452                 c->nodes.info[i].payload_conflict = false;
1453                 if (c->devinfo->has_accumulators && i < ACC_COUNT) {
1454                         acc_nodes[i] = i;
1455                         ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
1456                 } else if (!c->devinfo->has_accumulators &&
1457                            i < ARRAY_SIZE(implicit_rf_nodes)) {
1458                         implicit_rf_nodes[i] = i;
1459                         ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i);
1460                 } else {
1461                         uint32_t t = node_to_temp(c, i);
1462                         c->nodes.info[i].priority =
1463                                 c->temp_end[t] - c->temp_start[t];
1464                         c->nodes.info[i].class_bits =
1465                                 get_class_bit_any(c->devinfo);
1466                 }
1467         }
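             /* After this loop the first ACC_COUNT (or IMPLICIT_RF_COUNT on
              * devices without accumulators) nodes are pinned to fixed
              * registers, while every remaining node stands for one temp, with
              * a priority equal to the length of its live range.
              */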
1468 
1469         /* Walk the instructions adding register class restrictions and
1470          * interferences.
1471          */
1472         int ip = 0;
1473         int last_ldvary_ip = -1;
1474         bool has_payload = stage_has_payload(c);
1475         vir_for_each_inst_inorder(inst, c) {
1476                 inst->ip = ip++;
1477 
1478                 /* ldunif(a) always writes to a temporary, so we have
1479                  * liveness info available to decide if rf0 is
1480                  * available for it. ldvary is different: it always
1481                  * writes to rf0 directly, so we don't have liveness
1482                  * information for its implicit rf0 write.
1483                  *
1484                  * That means the allocator may assign rf0 to a temp
1485                  * that is defined while an implicit rf0 write from
1486                  * ldvary is still live. We fix that by manually
1487                  * tracking rf0 live ranges from ldvary instructions.
1488                  */
1489                 if (inst->qpu.sig.ldvary)
1490                         last_ldvary_ip = ip;
1491 
1492                 update_graph_and_reg_classes_for_inst(c, acc_nodes,
1493                                                       implicit_rf_nodes,
1494                                                       last_ldvary_ip,
1495                                                       has_payload,
1496                                                       inst);
1497         }
1498 
1499         /* Flag the nodes that are used in the last instructions of the program
1500          * (there are some registers that cannot be used in the last 3
1501          * instructions). We only do this for fragment shaders, because the idea
1502          * is that by avoiding this conflict we may be able to emit the last
1503          * thread switch earlier in some cases. In non-fragment shaders this
1504          * won't happen because the last instructions are always VPM stores
1505          * with a small immediate, which conflicts with other signals,
1506          * preventing us from ever moving the thrsw earlier.
1507          */
1508         if (c->s->info.stage == MESA_SHADER_FRAGMENT)
1509                 flag_program_end_nodes(c);
1510 
1511         /* Set the register classes for all our temporaries in the graph */
1512         for (uint32_t i = 0; i < c->num_temps; i++) {
1513                 ra_set_node_class(c->g, temp_to_node(c, i),
1514                                   choose_reg_class_for_temp(c, i));
1515         }
1516 
1517         /* Add register interferences based on liveness data */
1518         for (uint32_t i = 0; i < c->num_temps; i++) {
1519                 /* And while we are here, let's also flag nodes for
1520                  * unused temps.
1521                  */
1522                 if (c->temp_start[i] > c->temp_end[i])
1523                         c->nodes.info[temp_to_node(c, i)].unused = true;
1524 
1525                 for (uint32_t j = i + 1; j < c->num_temps; j++) {
1526                         if (interferes(c->temp_start[i], c->temp_end[i],
1527                                        c->temp_start[j], c->temp_end[j])) {
1528                                 ra_add_node_interference(c->g,
1529                                                          temp_to_node(c, i),
1530                                                          temp_to_node(c, j));
1531                         }
1532                 }
1533         }
1534 
1535         /* Debug option to force a bit of TMU spilling, useful when running
1536          * conformance tests to make sure that spilling works.
1537          */
1538         const int force_register_spills = 0;
1539         if (force_register_spills > 0)
1540                 c->max_tmu_spills = UINT32_MAX;
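             /* With force_register_spills set to N, the loop below keeps
              * spilling chosen temps until spill_size reaches
              * V3D_CHANNELS * sizeof(uint32_t) * N bytes, even when allocation
              * would already succeed; raising max_tmu_spills to UINT32_MAX
              * here ensures those extra spills never abort the allocation.
              */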
1541 
1542         struct qpu_reg *temp_registers = NULL;
1543         while (true) {
1544                 if (c->spill_size <
1545                     V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
1546                         int node = v3d_choose_spill_node(c);
1547                         if (node != -1) {
1548                                 uint32_t temp = node_to_temp(c, node);
1549                                 v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
1550                                 continue;
1551                         }
1552                 }
1553 
1554                 if (ra_allocate(c->g))
1555                         break;
1556 
1557                 /* Failed allocation, try to spill */
1558                 int node = v3d_choose_spill_node(c);
1559                 if (node == -1)
1560                         goto spill_fail;
1561 
1562                 uint32_t temp = node_to_temp(c, node);
1563                 enum temp_spill_type spill_type =
1564                         get_spill_type_for_temp(c, temp);
1565                 if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) {
1566                         v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
1567                         if (c->spills + c->fills > c->max_tmu_spills)
1568                                 goto spill_fail;
1569                 } else {
1570                         goto spill_fail;
1571                 }
1572         }
1573 
1574         /* Allocation was successful, build the 'temp -> reg' map */
1575         temp_registers = calloc(c->num_temps, sizeof(*temp_registers));
1576         for (uint32_t i = 0; i < c->num_temps; i++) {
1577                 int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i));
1578                 if (ra_reg < phys_index) {
1579                         temp_registers[i].magic = true;
1580                         temp_registers[i].index = (V3D_QPU_WADDR_R0 +
1581                                                    ra_reg - ACC_INDEX);
1582                 } else {
1583                         temp_registers[i].magic = false;
1584                         temp_registers[i].index = ra_reg - phys_index;
1585                 }
1586         }
1587 
1588 spill_fail:
1589         ralloc_free(c->nodes.info);
1590         c->nodes.info = NULL;
1591         c->nodes.alloc_count = 0;
1592         ralloc_free(c->g);
1593         c->g = NULL;
1594         return temp_registers;
1595 }
1596