1 /*
2  * Copyright © 2014 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "util/ralloc.h"
25 #include "util/register_allocate.h"
26 #include "common/v3d_device_info.h"
27 #include "v3d_compiler.h"
28 
29 #define ACC_INDEX     0
30 #define ACC_COUNT     6
31 
32 /* RA nodes used to track RF registers with implicit writes */
33 #define IMPLICIT_RF_COUNT 1
34 
35 #define PHYS_COUNT 64
36 
37 static uint8_t
38 get_phys_index(const struct v3d_device_info *devinfo)
39 {
40         if (devinfo->has_accumulators)
41                 return ACC_INDEX + ACC_COUNT;
42         else
43                 return 0;
44 }
45 
46 /* ACC as accumulator */
47 #define CLASS_BITS_PHYS   (1 << 0)
48 #define CLASS_BITS_ACC    (1 << 1)
49 #define CLASS_BITS_R5     (1 << 4)
50 
51 static uint8_t
52 get_class_bit_any(const struct v3d_device_info *devinfo)
53 {
54         if (devinfo->has_accumulators)
55                 return (CLASS_BITS_PHYS | CLASS_BITS_ACC | CLASS_BITS_R5);
56         else
57                 return CLASS_BITS_PHYS;
58 }
59 
60 static uint8_t
61 filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits)
62 {
63    if (!devinfo->has_accumulators) {
64       assert(class_bits & CLASS_BITS_PHYS);
65       class_bits = CLASS_BITS_PHYS;
66    }
67    return class_bits;
68 }
69 
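/* Temps map to RA nodes at a fixed offset: the first ACC_COUNT (or
 * IMPLICIT_RF_COUNT on devices without accumulators) nodes of the
 * interference graph are reserved for the fixed accumulator/implicit-RF
 * nodes, see add_node() and v3d_register_allocate() below.
 */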
70 static inline uint32_t
71 temp_to_node(struct v3d_compile *c, uint32_t temp)
72 {
73         return temp + (c->devinfo->has_accumulators ? ACC_COUNT :
74                                                       IMPLICIT_RF_COUNT);
75 }
76 
77 static inline uint32_t
78 node_to_temp(struct v3d_compile *c, uint32_t node)
79 {
80         assert((c->devinfo->has_accumulators && node >= ACC_COUNT) ||
81                (!c->devinfo->has_accumulators && node >= IMPLICIT_RF_COUNT));
82         return node - (c->devinfo->has_accumulators ? ACC_COUNT :
83                                                       IMPLICIT_RF_COUNT);
84 }
85 
86 static inline uint8_t
87 get_temp_class_bits(struct v3d_compile *c,
88                     uint32_t temp)
89 {
90         return c->nodes.info[temp_to_node(c, temp)].class_bits;
91 }
92 
93 static inline void
94 set_temp_class_bits(struct v3d_compile *c,
95                     uint32_t temp, uint8_t class_bits)
96 {
97         c->nodes.info[temp_to_node(c, temp)].class_bits = class_bits;
98 }
99 
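/* Maps a combination of CLASS_BITS_* flags to one of the register classes
 * pre-built in vir_init_reg_sets(); only the combinations constructed there
 * (phys, r5, phys|acc, or "any") are expected here.
 */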
100 static struct ra_class *
101 choose_reg_class(struct v3d_compile *c, uint8_t class_bits)
102 {
103         if (class_bits == CLASS_BITS_PHYS) {
104                 return c->compiler->reg_class_phys[c->thread_index];
105         } else if (class_bits == (CLASS_BITS_R5)) {
106                 assert(c->devinfo->has_accumulators);
107                 return c->compiler->reg_class_r5[c->thread_index];
108         } else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) {
109                 assert(c->devinfo->has_accumulators);
110                 return c->compiler->reg_class_phys_or_acc[c->thread_index];
111         } else {
112                 assert(class_bits == get_class_bit_any(c->devinfo));
113                 return c->compiler->reg_class_any[c->thread_index];
114         }
115 }
116 
117 static inline struct ra_class *
118 choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp)
119 {
120         assert(temp < c->num_temps && temp < c->nodes.alloc_count);
121         return choose_reg_class(c, get_temp_class_bits(c, temp));
122 }
123 
124 static inline bool
125 qinst_writes_tmu(const struct v3d_device_info *devinfo,
126                  struct qinst *inst)
127 {
128         return (inst->dst.file == QFILE_MAGIC &&
129                 v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index)) ||
130                 inst->qpu.sig.wrtmuc;
131 }
132 
133 static bool
134 is_end_of_tmu_sequence(const struct v3d_device_info *devinfo,
135                        struct qinst *inst, struct qblock *block)
136 {
137         /* Only tmuwt and ldtmu can finish TMU sequences */
138         bool is_tmuwt = inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
139                         inst->qpu.alu.add.op == V3D_QPU_A_TMUWT;
140         bool is_ldtmu = inst->qpu.sig.ldtmu;
141         if (!is_tmuwt && !is_ldtmu)
142                 return false;
143 
144         /* Check if this is the last tmuwt or ldtmu in the sequence */
145         list_for_each_entry_from(struct qinst, scan_inst, inst->link.next,
146                                  &block->instructions, link) {
147                 is_tmuwt = scan_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
148                            scan_inst->qpu.alu.add.op == V3D_QPU_A_TMUWT;
149                 is_ldtmu = scan_inst->qpu.sig.ldtmu;
150 
151                 if (is_tmuwt || is_ldtmu)
152                         return false;
153 
154                 if (qinst_writes_tmu(devinfo, scan_inst))
155                         return true;
156         }
157 
158         return true;
159 }
160 
161 static bool
162 vir_is_mov_uniform(struct v3d_compile *c, int temp)
163 {
164         struct qinst *def = c->defs[temp];
165 
166         return def && def->qpu.sig.ldunif;
167 }
168 
169 static bool
170 can_reconstruct_inst(struct qinst *inst)
171 {
172         assert(inst);
173 
174         if (vir_is_add(inst)) {
175                 switch (inst->qpu.alu.add.op) {
176                 case V3D_QPU_A_FXCD:
177                 case V3D_QPU_A_FYCD:
178                 case V3D_QPU_A_XCD:
179                 case V3D_QPU_A_YCD:
180                 case V3D_QPU_A_IID:
181                 case V3D_QPU_A_EIDX:
182                 case V3D_QPU_A_TIDX:
183                 case V3D_QPU_A_SAMPID:
184                         /* No need to check input unpacks because none of these
185                          * opcodes read sources. FXCD and FYCD have pack variants.
186                          */
187                         return inst->qpu.flags.ac == V3D_QPU_COND_NONE &&
188                                inst->qpu.flags.auf == V3D_QPU_UF_NONE &&
189                                inst->qpu.flags.apf == V3D_QPU_PF_NONE &&
190                                inst->qpu.alu.add.output_pack == V3D_QPU_PACK_NONE;
191                 default:
192                         return false;
193                 }
194         }
195 
196         return false;
197 }
198 
199 static bool
200 can_reconstruct_temp(struct v3d_compile *c, int temp)
201 {
202         struct qinst *def = c->defs[temp];
203         return def && can_reconstruct_inst(def);
204 }
205 
206 static struct qreg
207 reconstruct_temp(struct v3d_compile *c, enum v3d_qpu_add_op op)
208 {
209         struct qreg dest;
210         switch (op) {
211         case V3D_QPU_A_FXCD:
212                 dest = vir_FXCD(c);
213                 break;
214         case V3D_QPU_A_FYCD:
215                 dest = vir_FYCD(c);
216                 break;
217         case V3D_QPU_A_XCD:
218                 dest = vir_XCD(c);
219                 break;
220         case V3D_QPU_A_YCD:
221                 dest = vir_YCD(c);
222                 break;
223         case V3D_QPU_A_IID:
224                 dest = vir_IID(c);
225                 break;
226         case V3D_QPU_A_EIDX:
227                 dest = vir_EIDX(c);
228                 break;
229         case V3D_QPU_A_TIDX:
230                 dest = vir_TIDX(c);
231                 break;
232         case V3D_QPU_A_SAMPID:
233                 dest = vir_SAMPID(c);
234                 break;
235         default:
236                 unreachable("Unexpected opcode for reconstruction");
237         }
238 
239         return dest;
240 }
241 
242 enum temp_spill_type {
243         SPILL_TYPE_UNIFORM,
244         SPILL_TYPE_RECONSTRUCT,
245         SPILL_TYPE_TMU
246 };
247 
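/* Spill strategies, cheapest first: a uniform can simply be re-loaded with
 * another ldunif, a few side-effect-free opcodes can be recomputed in place,
 * and only as a last resort is a value spilled to scratch memory via the TMU.
 */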
248 static enum temp_spill_type
249 get_spill_type_for_temp(struct v3d_compile *c, int temp)
250 {
251    if (vir_is_mov_uniform(c, temp))
252       return SPILL_TYPE_UNIFORM;
253 
254    if (can_reconstruct_temp(c, temp))
255       return SPILL_TYPE_RECONSTRUCT;
256 
257    return SPILL_TYPE_TMU;
258 }
259 
260 static int
261 v3d_choose_spill_node(struct v3d_compile *c)
262 {
263         const float tmu_scale = 10;
264         float block_scale = 1.0;
265         float spill_costs[c->num_temps];
266         bool in_tmu_operation = false;
267         bool started_last_seg = false;
268 
269         for (unsigned i = 0; i < c->num_temps; i++)
270                 spill_costs[i] = 0.0;
271 
272         /* XXX: Scale the cost up when inside of a loop. */
273         vir_for_each_block(block, c) {
274                 vir_for_each_inst(inst, block) {
275                         /* We can't insert new thread switches after
276                          * starting output writes.
277                          */
278                         bool no_spilling =
279                                 (c->threads > 1 && started_last_seg) ||
280                                 (c->max_tmu_spills == 0);
281 
282                         /* Discourage spilling of TMU operations */
283                         for (int i = 0; i < vir_get_nsrc(inst); i++) {
284                                 if (inst->src[i].file != QFILE_TEMP)
285                                         continue;
286 
287                                 int temp = inst->src[i].index;
288                                 enum temp_spill_type spill_type =
289                                         get_spill_type_for_temp(c, temp);
290 
291                                 if (spill_type != SPILL_TYPE_TMU) {
292                                         spill_costs[temp] += block_scale;
293                                 } else if (!no_spilling) {
294                                         float tmu_op_scale = in_tmu_operation ?
295                                                 3.0 : 1.0;
296                                         spill_costs[temp] += (block_scale *
297                                                               tmu_scale *
298                                                               tmu_op_scale);
299                                 } else {
300                                         BITSET_CLEAR(c->spillable, temp);
301                                 }
302                         }
303 
304                         if (inst->dst.file == QFILE_TEMP) {
305                                 int temp = inst->dst.index;
306                                 enum temp_spill_type spill_type =
307                                         get_spill_type_for_temp(c, temp);
308 
309                                 if (spill_type != SPILL_TYPE_TMU) {
310                                         /* We just rematerialize it later */
311                                 } else if (!no_spilling) {
312                                         spill_costs[temp] += (block_scale *
313                                                               tmu_scale);
314                                 } else {
315                                         BITSET_CLEAR(c->spillable, temp);
316                                 }
317                         }
318 
319                         /* Refuse to spill a ldvary's dst, because that means
320                          * that ldvary's r5 would end up being used across a
321                          * thrsw.
322                          */
323                         if (inst->qpu.sig.ldvary) {
324                                 assert(inst->dst.file == QFILE_TEMP);
325                                 BITSET_CLEAR(c->spillable, inst->dst.index);
326                         }
327 
328                         if (inst->is_last_thrsw)
329                                 started_last_seg = true;
330 
331                         /* Track when we're in between a TMU setup and the
332                          * final LDTMU or TMUWT from that TMU setup.  We
333                          * penalize spills during that time.
334                          */
335                         if (is_end_of_tmu_sequence(c->devinfo, inst, block))
336                                 in_tmu_operation = false;
337 
338                         if (qinst_writes_tmu(c->devinfo, inst))
339                                 in_tmu_operation = true;
340                 }
341         }
342 
343         /* We always emit a "last thrsw" to ensure all our spilling occurs
344          * before the last thread section. See vir_emit_last_thrsw.
345          */
346         assert(started_last_seg);
347 
348         for (unsigned i = 0; i < c->num_temps; i++) {
349                 if (BITSET_TEST(c->spillable, i)) {
350                         ra_set_node_spill_cost(c->g, temp_to_node(c, i),
351                                                spill_costs[i]);
352                 }
353         }
354 
355         return ra_get_best_spill_node(c->g);
356 }
357 
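/* Grows the per-node bookkeeping array when spilling introduces new temps.
 * Extra slots are reserved for the fixed accumulator/implicit-RF nodes that
 * precede the temp nodes.
 */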
358 static void
359 ensure_nodes(struct v3d_compile *c)
360 {
361         if (c->num_temps < c->nodes.alloc_count)
362                 return;
363 
364         c->nodes.alloc_count *= 2;
365         c->nodes.info = reralloc_array_size(c,
366                                             c->nodes.info,
367                                             sizeof(c->nodes.info[0]),
368                                             c->nodes.alloc_count +
369                                             MAX2(ACC_COUNT, IMPLICIT_RF_COUNT));
370 }
371 
372 /* Creates the interference node for a new temp. We use this to keep the node
373  * list updated during the spilling process, which generates new temps/nodes.
374  */
375 static void
376 add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
377 {
378         ensure_nodes(c);
379 
380         int node = ra_add_node(c->g, choose_reg_class(c, class_bits));
381         assert(c->devinfo->has_accumulators ? node == temp + ACC_COUNT :
382                                               node == temp + IMPLICIT_RF_COUNT);
383 
384         /* We fill the node priority after we are done inserting spills */
385         c->nodes.info[node].class_bits = class_bits;
386         c->nodes.info[node].priority = 0;
387         c->nodes.info[node].is_ldunif_dst = false;
388         c->nodes.info[node].is_program_end = false;
389         c->nodes.info[node].unused = false;
390 }
391 
392 /* The spill offset for this thread takes a bit of setup, so do it once at
393  * program start.
394  */
395 void
396 v3d_setup_spill_base(struct v3d_compile *c)
397 {
398         /* Setting up the spill base is done in the entry block, so change
399          * both the current block to emit and the cursor.
400          */
401         struct qblock *current_block = c->cur_block;
402         c->cur_block = vir_entry_block(c);
403         c->cursor = vir_before_block(c->cur_block);
404 
405         int start_num_temps = c->num_temps;
406 
407         /* Each thread wants to be in a separate region of the scratch space
408          * so that the QPUs aren't fighting over cache lines.  We have the
409          * driver keep a single global spill BO rather than
410          * per-spilling-program BOs, so we need a uniform from the driver for
411          * what the per-thread scale is.
412          */
413         struct qreg thread_offset =
414                 vir_UMUL(c,
415                          vir_TIDX(c),
416                          vir_uniform(c, QUNIFORM_SPILL_SIZE_PER_THREAD, 0));
417 
418         /* Each channel in a reg is 4 bytes, so scale them up by that. */
419         struct qreg element_offset = vir_SHL(c, vir_EIDX(c),
420                                              vir_uniform_ui(c, 2));
421 
422         c->spill_base = vir_ADD(c,
423                                 vir_ADD(c, thread_offset, element_offset),
424                                 vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0));
425 
426         /* Make sure that we don't spill the spilling setup instructions. */
427         for (int i = start_num_temps; i < c->num_temps; i++) {
428                 BITSET_CLEAR(c->spillable, i);
429 
430                 /* If we are spilling, update the RA map with the temps added
431                  * by the spill setup. Our spill_base register can never be an
432                  * accumulator because it is used for TMU spill/fill and thus
433                  * needs to persist across thread switches.
434                  */
435                 if (c->spilling) {
436                         int temp_class = CLASS_BITS_PHYS;
437                         if (c->devinfo->has_accumulators &&
438                             i != c->spill_base.index) {
439                                 temp_class |= CLASS_BITS_ACC;
440                         }
441                         add_node(c, i, temp_class);
442                 }
443         }
444 
445         /* Restore the current block. */
446         c->cur_block = current_block;
447         c->cursor = vir_after_block(c->cur_block);
448 }
449 
450 /**
451  * Computes the address for a spill/fill sequence and completes the spill/fill
452  * sequence by emitting the following code:
453  *
454  * ldunif.spill_offset
455  * add tmua spill_base spill_offset
456  * thrsw
457  *
458  * If the sequence is for a spill, then it will emit a tmuwt after the thrsw,
459  * otherwise it will emit an ldtmu to load the fill result into 'fill_dst'.
460  *
461  * The parameter 'ip' represents the ip at which the spill/fill is happening.
462  * This is used to disallow accumulators on temps that cross this ip boundary
463  * due to the new thrsw introduced in the sequence above.
464  */
465 static void
466 v3d_emit_spill_tmua(struct v3d_compile *c,
467                     uint32_t spill_offset,
468                     enum v3d_qpu_cond cond,
469                     int32_t ip,
470                     struct qreg *fill_dst)
471 {
472         assert(ip >= 0);
473 
474         /* Load a uniform with the spill offset and add it to the spill base
475          * to obtain the TMUA address. It can be of class ANY because we know
476          * we are consuming it immediately without thrsw in between.
477          */
478         assert(c->disable_ldunif_opt);
479         struct qreg offset = vir_uniform_ui(c, spill_offset);
480         add_node(c, offset.index, get_class_bit_any(c->devinfo));
481 
482         /* We always enable per-quad on spills/fills to ensure we spill
483          * any channels involved with helper invocations.
484          */
485         struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
486         struct qinst *inst = vir_ADD_dest(c, tmua, c->spill_base, offset);
487         inst->qpu.flags.ac = cond;
488         inst->ldtmu_count = 1;
489         inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
490                                               0xffffff7f); /* per-quad */
491 
492         vir_emit_thrsw(c);
493 
494         /* For a spill, emit a TMUWT; for a fill, emit an LDTMU to load the
495          * result of the fill. The TMUWT temp is not really read, the ldtmu
496          * temp will be used immediately so just like the uniform above we
497          * can allow accumulators.
498          */
499         int temp_class =
500                 filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC);
501         if (!fill_dst) {
502                 struct qreg dst = vir_TMUWT(c);
503                 assert(dst.file == QFILE_TEMP);
504                 add_node(c, dst.index, temp_class);
505         } else {
506                 *fill_dst = vir_LDTMU(c);
507                 assert(fill_dst->file == QFILE_TEMP);
508                 add_node(c, fill_dst->index, temp_class);
509         }
510 
511         /* Temps across the thread switch we injected can't be assigned to
512          * accumulators.
513          *
514          * Fills inject code before ip, so anything that starts at ip or later
515          * is not affected by the thrsw. Something that ends at ip will be
516          * affected though.
517          *
518          * Spills inject code after ip, so anything that starts strictly later
519          * than ip is not affected (the temp starting at ip is usually the
520          * spilled temp except for postponed spills). Something that ends at ip
521          * won't be affected either.
522          */
523         for (int i = 0; i < c->spill_start_num_temps; i++) {
524                 bool thrsw_cross = fill_dst ?
525                         c->temp_start[i] < ip && c->temp_end[i] >= ip :
526                         c->temp_start[i] <= ip && c->temp_end[i] > ip;
527                 if (thrsw_cross) {
528                         ra_set_node_class(c->g, temp_to_node(c, i),
529                                           choose_reg_class(c, CLASS_BITS_PHYS));
530                 }
531         }
532 }
533 
534 static void
535 v3d_emit_tmu_spill(struct v3d_compile *c,
536                    struct qinst *inst,
537                    struct qreg spill_temp,
538                    struct qinst *position,
539                    uint32_t ip,
540                    uint32_t spill_offset)
541 {
542         assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
543         assert(inst->dst.file == QFILE_TEMP);
544 
545         c->cursor = vir_after_inst(position);
546 
547         enum v3d_qpu_cond cond = vir_get_cond(inst);
548 
549         /* If inst and position don't match, this is a postponed spill,
550          * in which case we have already allocated the temp for the spill
551          * and we should use that; otherwise, create a new temp with the
552          * same register class bits as the original.
553          */
554         if (inst == position) {
555                 uint8_t class_bits = get_temp_class_bits(c, inst->dst.index);
556                 inst->dst = vir_get_temp(c);
557                 add_node(c, inst->dst.index, class_bits);
558         } else {
559                 inst->dst = spill_temp;
560 
561                 /* If this is a postponed spill the register being spilled may
562                  * have been written more than once including conditional
563                  * writes, so ignore predication on the spill instruction and
564                  * always spill the full register.
565                  */
566                 cond = V3D_QPU_COND_NONE;
567         }
568 
569         struct qinst *tmp =
570                 vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
571                              inst->dst);
572         tmp->qpu.flags.mc = cond;
573 
574         v3d_emit_spill_tmua(c, spill_offset, cond, ip, NULL);
575 
576         c->spills++;
577         c->tmu_dirty_rcl = true;
578 }
579 
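/* Live ranges are half-open intervals: two temps interfere only if their
 * [start, end) ranges overlap, so ranges that merely touch at an endpoint
 * do not conflict.
 */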
580 static inline bool
581 interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end)
582 {
583         return !(t0_start >= t1_end || t1_start >= t0_end);
584 }
585 
586 static void
587 v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int *implicit_rf_nodes,
588               int spill_temp)
589 {
590         c->spill_start_num_temps = c->num_temps;
591         c->spilling = true;
592 
593         enum temp_spill_type spill_type = get_spill_type_for_temp(c, spill_temp);
594 
595         uint32_t spill_offset = 0;
596         if (spill_type == SPILL_TYPE_TMU) {
597                 spill_offset = c->spill_size;
598                 c->spill_size += V3D_CHANNELS * sizeof(uint32_t);
599 
600                 if (spill_offset == 0) {
601                         v3d_setup_spill_base(c);
602 
603                         /* Don't allocate our spill base to rf0 to avoid
604                          * conflicts with instructions doing implicit writes
605                          * to that register.
606                          */
607                         if (!c->devinfo->has_accumulators) {
608                                 ra_add_node_interference(
609                                         c->g,
610                                         temp_to_node(c, c->spill_base.index),
611                                         implicit_rf_nodes[0]);
612                         }
613                 }
614         }
615 
616         struct qinst *last_thrsw = c->last_thrsw;
617         assert(last_thrsw && last_thrsw->is_last_thrsw);
618 
619         int uniform_index = ~0;
620         if (spill_type == SPILL_TYPE_UNIFORM) {
621                 struct qinst *orig_unif = c->defs[spill_temp];
622                 uniform_index = orig_unif->uniform;
623         }
624 
625         enum v3d_qpu_add_op reconstruct_op = V3D_QPU_A_NOP;
626         if (spill_type == SPILL_TYPE_RECONSTRUCT) {
627                 struct qinst *orig_def = c->defs[spill_temp];
628                 assert(vir_is_add(orig_def));
629                 reconstruct_op = orig_def->qpu.alu.add.op;
630         }
631 
632         uint32_t spill_node = temp_to_node(c, spill_temp);
633 
634         /* We must disable the ldunif optimization if we are spilling uniforms */
635         bool had_disable_ldunif_opt = c->disable_ldunif_opt;
636         c->disable_ldunif_opt = true;
637 
638         struct qinst *start_of_tmu_sequence = NULL;
639         struct qinst *postponed_spill = NULL;
640         struct qreg postponed_spill_temp = { 0 };
641         vir_for_each_block(block, c) {
642                 vir_for_each_inst_safe(inst, block) {
643                         int32_t ip = inst->ip;
644 
645                         /* Track when we're in between a TMU setup and the final
646                          * LDTMU or TMUWT from that TMU setup. We can't spill/fill any
647                          * temps during that time, because that involves inserting a
648                          * new TMU setup/LDTMU sequence, so we postpone the spill or
649                          * move the fill up to not intrude in the middle of the TMU
650                          * sequence.
651                          */
652                         if (is_end_of_tmu_sequence(c->devinfo, inst, block)) {
653                                 if (postponed_spill) {
654                                         v3d_emit_tmu_spill(c, postponed_spill,
655                                                            postponed_spill_temp,
656                                                            inst, ip, spill_offset);
657                                 }
658 
659                                 start_of_tmu_sequence = NULL;
660                                 postponed_spill = NULL;
661                         }
662 
663                         if (!start_of_tmu_sequence &&
664                             qinst_writes_tmu(c->devinfo, inst)) {
665                                 start_of_tmu_sequence = inst;
666                         }
667 
668                         /* fills */
669                         int filled_src = -1;
670                         for (int i = 0; i < vir_get_nsrc(inst); i++) {
671                                 if (inst->src[i].file != QFILE_TEMP ||
672                                     inst->src[i].index != spill_temp) {
673                                         continue;
674                                 }
675 
676                                 if (filled_src >= 0) {
677                                         inst->src[i] = inst->src[filled_src];
678                                         continue;
679                                 }
680 
681                                 c->cursor = vir_before_inst(inst);
682 
683                                 if (spill_type == SPILL_TYPE_UNIFORM) {
684                                         struct qreg unif =
685                                                 vir_uniform(c,
686                                                             c->uniform_contents[uniform_index],
687                                                             c->uniform_data[uniform_index]);
688                                         inst->src[i] = unif;
689                                         /* We are using the uniform in the
690                                          * instruction immediately after, so
691                                          * we can use any register class for it.
692                                          */
693                                         add_node(c, unif.index,
694                                                  get_class_bit_any(c->devinfo));
695                                 } else if (spill_type == SPILL_TYPE_RECONSTRUCT) {
696                                         struct qreg temp =
697                                                 reconstruct_temp(c, reconstruct_op);
698                                         inst->src[i] = temp;
699                                         /* We are using the temp in the
700                                          * instruction immediately after so we
701                                          * can use ACC.
702                                          */
703                                         int temp_class =
704                                                 filter_class_bits(c->devinfo, CLASS_BITS_PHYS |
705                                                                               CLASS_BITS_ACC);
706                                         add_node(c, temp.index, temp_class);
707                                 } else {
708                                         /* If we have a postponed spill, we
709                                          * don't need a fill as the temp would
710                                          * not have been spilled yet; however,
711                                          * we need to update the temp index.
712                                          */
713                                         if (postponed_spill) {
714                                                 inst->src[i] =
715                                                         postponed_spill_temp;
716                                         } else {
717                                                 int32_t fill_ip = ip;
718                                                 if (start_of_tmu_sequence) {
719                                                         c->cursor = vir_before_inst(start_of_tmu_sequence);
720                                                         fill_ip = start_of_tmu_sequence->ip;
721                                                 }
722 
723                                                 v3d_emit_spill_tmua(c,  spill_offset,
724                                                                     V3D_QPU_COND_NONE,
725                                                                     fill_ip, &inst->src[i]);
726                                                 c->fills++;
727                                         }
728                                 }
729 
730                                 filled_src = i;
731                         }
732 
733                         /* spills */
734                         if (inst->dst.file == QFILE_TEMP &&
735                             inst->dst.index == spill_temp) {
736                                 if (spill_type != SPILL_TYPE_TMU) {
737                                         c->cursor.link = NULL;
738                                         vir_remove_instruction(c, inst);
739                                 } else {
740                                         /* If we are in the middle of a TMU
741                                          * sequence, we postpone the actual
742                                          * spill until we have finished it. We
743                                          * still need to replace the spill temp
744                                          * with a new temp though.
745                                          */
746                                         if (start_of_tmu_sequence) {
747                                                 if (postponed_spill) {
748                                                         postponed_spill->dst =
749                                                                 postponed_spill_temp;
750                                                 }
751                                                 if (!postponed_spill ||
752                                                     vir_get_cond(inst) == V3D_QPU_COND_NONE) {
753                                                         postponed_spill_temp =
754                                                                 vir_get_temp(c);
755                                                         add_node(c,
756                                                                  postponed_spill_temp.index,
757                                                                  c->nodes.info[spill_node].class_bits);
758                                                 }
759                                                 postponed_spill = inst;
760                                         } else {
761                                                 v3d_emit_tmu_spill(c, inst,
762                                                                    postponed_spill_temp,
763                                                                    inst, ip,
764                                                                    spill_offset);
765                                         }
766                                 }
767                         }
768                 }
769         }
770 
771         /* Make sure c->last_thrsw is the actual last thrsw, not just one we
772          * inserted in our most recent unspill.
773          */
774         c->last_thrsw = last_thrsw;
775 
776         /* Don't allow spilling of our spilling instructions.  There's no way
777          * they can help get things colored.
778          */
779         for (int i = c->spill_start_num_temps; i < c->num_temps; i++)
780                 BITSET_CLEAR(c->spillable, i);
781 
782         /* Reset interference for spilled node */
783         ra_set_node_spill_cost(c->g, spill_node, 0);
784         ra_reset_node_interference(c->g, spill_node);
785         BITSET_CLEAR(c->spillable, spill_temp);
786 
787         /* Rebuild program ips */
788         int32_t ip = 0;
789         vir_for_each_inst_inorder(inst, c)
790                 inst->ip = ip++;
791 
792         /* Rebuild liveness */
793         vir_calculate_live_intervals(c);
794 
795         /* Add interferences for the new spilled temps and update interferences
796          * for c->spill_base (since we may have modified its liveness). Also,
797          * update node priorities based on new liveness data.
798          */
799         uint32_t sb_temp = c->spill_base.index;
800         uint32_t sb_node = temp_to_node(c, sb_temp);
801         for (uint32_t i = 0; i < c->num_temps; i++) {
802                 if (c->temp_end[i] == -1)
803                         continue;
804 
805                 uint32_t node_i = temp_to_node(c, i);
806                 c->nodes.info[node_i].priority =
807                         c->temp_end[i] - c->temp_start[i];
808 
809                 for (uint32_t j = MAX2(i + 1, c->spill_start_num_temps);
810                      j < c->num_temps; j++) {
811                         if (interferes(c->temp_start[i], c->temp_end[i],
812                                        c->temp_start[j], c->temp_end[j])) {
813                                 uint32_t node_j = temp_to_node(c, j);
814                                 ra_add_node_interference(c->g, node_i, node_j);
815                         }
816                 }
817 
818                 if (spill_type == SPILL_TYPE_TMU) {
819                         if (i != sb_temp &&
820                             interferes(c->temp_start[i], c->temp_end[i],
821                                        c->temp_start[sb_temp], c->temp_end[sb_temp])) {
822                                 ra_add_node_interference(c->g, node_i, sb_node);
823                         }
824                 }
825         }
826 
827         c->disable_ldunif_opt = had_disable_ldunif_opt;
828         c->spilling = false;
829 }
830 
831 struct v3d_ra_select_callback_data {
832         uint32_t phys_index;
833         uint32_t next_acc;
834         uint32_t next_phys;
835         struct v3d_ra_node_info *nodes;
836         const struct v3d_device_info *devinfo;
837 };
838 
839 /* Choosing accumulators improves the chances of merging QPU instructions,
840  * since such merges require that at most 2 rf registers are used between
841  * the add and mul instructions.
842  */
843 static bool
844 v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
845                    BITSET_WORD *regs,
846                    int priority)
847 {
848         if (!v3d_ra->devinfo->has_accumulators)
849                 return false;
850 
851         /* Favor accumulators if we have fewer than this number of physical
852          * registers. Accumulators have more restrictions (like being
853          * invalidated through thrsw), so running out of physical registers
854          * even if we have accumulators available can lead to register
855          * allocation failures.
856          */
857         static const int available_rf_threshold = 5;
858         int available_rf = 0;
859         for (int i = 0; i < PHYS_COUNT; i++) {
860                 if (BITSET_TEST(regs, v3d_ra->phys_index + i))
861                         available_rf++;
862                 if (available_rf >= available_rf_threshold)
863                         break;
864         }
865         if (available_rf < available_rf_threshold)
866                 return true;
867 
868         /* Favor accumulators for short-lived temps (our priority represents
869          * liveness), to prevent long-lived temps from grabbing accumulators
870          * and preventing follow-up instructions from using them, potentially
871          * leading to large portions of the shader being unable to use
872          * accumulators and therefore merge instructions successfully.
873          */
874         static const int priority_threshold = 20;
875         if (priority <= priority_threshold)
876                 return true;
877 
878         return false;
879 }
880 
881 static bool
882 v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
883                     BITSET_WORD *regs,
884                     unsigned int *out)
885 {
886         if (!v3d_ra->devinfo->has_accumulators)
887                 return false;
888 
889         /* Choose r5 for our ldunifs if possible (nobody else can load to that
890          * reg, and it keeps the QPU cond field free from being occupied by
891          * ldunifrf).
892          */
893         int r5 = ACC_INDEX + 5;
894         if (BITSET_TEST(regs, r5)) {
895                 *out = r5;
896                 return true;
897         }
898 
899         /* Round-robin through our accumulators to give post-RA instruction
900          * selection more options.
901          */
902         for (int i = 0; i < ACC_COUNT; i++) {
903                 int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT;
904                 int acc = ACC_INDEX + acc_off;
905 
906                 if (BITSET_TEST(regs, acc)) {
907                         v3d_ra->next_acc = acc_off + 1;
908                         *out = acc;
909                         return true;
910                 }
911         }
912 
913         return false;
914 }
915 
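/* Picks a physical register file entry for a node, round-robining through
 * the RF to spread usage, while keeping rf0 available for ldunif dsts on
 * V3D 7.x and steering program-end temps away from restricted registers.
 */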
916 static bool
917 v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
918                  unsigned int node,
919                  BITSET_WORD *regs,
920                  unsigned int *out)
921 {
922         /* If this node is for an unused temp, ignore. */
923         if (v3d_ra->nodes->info[node].unused) {
924                 *out = 0;
925                 return true;
926         }
927 
928         /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst
929          * so we can avoid turning them into ldunifrf (which uses the
930          * cond field to encode the dst and would prevent merge with
931          * instructions that use cond flags).
932          */
933         if (v3d_ra->nodes->info[node].is_ldunif_dst &&
934             BITSET_TEST(regs, v3d_ra->phys_index)) {
935                 assert(v3d_ra->devinfo->ver >= 71);
936                 *out = v3d_ra->phys_index;
937                 return true;
938         }
939 
940         /* The last 3 instructions in a shader can't use some specific registers
941          * (usually early rf registers, depending on the V3D version), so
942          * avoid assigning those restricted registers to temps used by the
943          * last instructions in the shader.
944          */
945         const uint32_t safe_rf_start = v3d_ra->devinfo->ver == 42 ? 3 : 4;
946         if (v3d_ra->nodes->info[node].is_program_end &&
947             v3d_ra->next_phys < safe_rf_start) {
948                 v3d_ra->next_phys = safe_rf_start;
949         }
950 
951         for (int i = 0; i < PHYS_COUNT; i++) {
952                 int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
953 
954                 /* Try to keep rf0 available for ldunif in 7.x (see above). */
955                 if (v3d_ra->devinfo->ver >= 71 && phys_off == 0)
956                         continue;
957 
958                 int phys = v3d_ra->phys_index + phys_off;
959 
960                 if (BITSET_TEST(regs, phys)) {
961                         v3d_ra->next_phys = phys_off + 1;
962                         *out = phys;
963                         return true;
964                 }
965         }
966 
967         /* If we couldn't allocate, do try to assign rf0 if it is available. */
968         if (v3d_ra->devinfo->ver >= 71 &&
969             BITSET_TEST(regs, v3d_ra->phys_index)) {
970                 v3d_ra->next_phys = 1;
971                 *out = v3d_ra->phys_index;
972                 return true;
973         }
974 
975         return false;
976 }
977 
978 static unsigned int
979 v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
980 {
981         struct v3d_ra_select_callback_data *v3d_ra = data;
982 
983         unsigned int reg;
984         if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->nodes->info[n].priority) &&
985             v3d_ra_select_accum(v3d_ra, regs, &reg)) {
986                 return reg;
987         }
988 
989         if (v3d_ra_select_rf(v3d_ra, n, regs, &reg))
990                 return reg;
991 
992         /* If we ran out of physical registers try to assign an accumulator
993          * if we didn't favor that option earlier.
994          */
995         if (v3d_ra_select_accum(v3d_ra, regs, &reg))
996                 return reg;
997 
998         unreachable("RA must pass us at least one possible reg.");
999 }
1000 
1001 bool
1002 vir_init_reg_sets(struct v3d_compiler *compiler)
1003 {
1004         /* Allocate up to 3 regfile classes, for the ways the physical
1005          * register file can be divided up for fragment shader threading.
1006          */
1007         int max_thread_index = 2;
1008         uint8_t phys_index = get_phys_index(compiler->devinfo);
1009 
1010         compiler->regs = ra_alloc_reg_set(compiler, phys_index + PHYS_COUNT,
1011                                           false);
1012         if (!compiler->regs)
1013                 return false;
1014 
1015         for (int threads = 0; threads < max_thread_index; threads++) {
1016                 compiler->reg_class_any[threads] =
1017                         ra_alloc_contig_reg_class(compiler->regs, 1);
1018                 if (compiler->devinfo->has_accumulators) {
1019                         compiler->reg_class_r5[threads] =
1020                                 ra_alloc_contig_reg_class(compiler->regs, 1);
1021                         compiler->reg_class_phys_or_acc[threads] =
1022                                 ra_alloc_contig_reg_class(compiler->regs, 1);
1023                 }
1024                 compiler->reg_class_phys[threads] =
1025                         ra_alloc_contig_reg_class(compiler->regs, 1);
1026 
1027                 /* Init physical regs */
1028                 for (int i = phys_index;
1029                      i < phys_index + (PHYS_COUNT >> threads); i++) {
1030                         if (compiler->devinfo->has_accumulators)
1031                                 ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
1032                         ra_class_add_reg(compiler->reg_class_phys[threads], i);
1033                         ra_class_add_reg(compiler->reg_class_any[threads], i);
1034                 }
1035 
1036                 /* Init accumulator regs */
1037                 if (compiler->devinfo->has_accumulators) {
1038                         for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
1039                                 ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
1040                                 ra_class_add_reg(compiler->reg_class_any[threads], i);
1041                         }
1042                         /* r5 can only store a single 32-bit value, so not much can
1043                          * use it.
1044                          */
1045                         ra_class_add_reg(compiler->reg_class_r5[threads],
1046                                          ACC_INDEX + 5);
1047                         ra_class_add_reg(compiler->reg_class_any[threads],
1048                                          ACC_INDEX + 5);
1049                 }
1050         }
1051 
1052         ra_set_finalize(compiler->regs, NULL);
1053 
1054         return true;
1055 }
1056 
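/* TMU spilling is budgeted: once the combined number of spills and fills
 * reaches c->max_tmu_spills, further TMU spills are disallowed.
 */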
1057 static inline bool
1058 tmu_spilling_allowed(struct v3d_compile *c)
1059 {
1060         return c->spills + c->fills < c->max_tmu_spills;
1061 }
1062 
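/* Records the register-class and interference constraints implied by a
 * single instruction: implicit r4/rf0 writes, payload setup registers,
 * SFU/LDVPM destinations, the r5/ldunif rules, and accumulator invalidation
 * across thread switches.
 */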
1063 static void
1064 update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
1065                                       int *acc_nodes,
1066                                       int *implicit_rf_nodes,
1067                                       int last_ldvary_ip,
1068                                       struct qinst *inst)
1069 {
1070         int32_t ip = inst->ip;
1071         assert(ip >= 0);
1072 
1073         /* If the instruction writes r4 (and optionally moves its
1074          * result to a temp), nothing else can be stored in r4 across
1075          * it.
1076          */
1077         if (vir_writes_r4_implicitly(c->devinfo, inst)) {
1078                 for (int i = 0; i < c->num_temps; i++) {
1079                         if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
1080                                 ra_add_node_interference(c->g,
1081                                                          temp_to_node(c, i),
1082                                                          acc_nodes[4]);
1083                         }
1084                 }
1085         }
1086 
1087         /* If any instruction writes to a physical register implicitly,
1088          * nothing else can write the same register across it.
1089          */
1090         if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) {
1091                 for (int i = 0; i < c->num_temps; i++) {
1092                         if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
1093                                 ra_add_node_interference(c->g,
1094                                                          temp_to_node(c, i),
1095                                                          implicit_rf_nodes[0]);
1096                         }
1097                 }
1098         }
1099 
1100         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
1101                 switch (inst->qpu.alu.add.op) {
1102                 case V3D_QPU_A_LDVPMV_IN:
1103                 case V3D_QPU_A_LDVPMV_OUT:
1104                 case V3D_QPU_A_LDVPMD_IN:
1105                 case V3D_QPU_A_LDVPMD_OUT:
1106                 case V3D_QPU_A_LDVPMP:
1107                 case V3D_QPU_A_LDVPMG_IN:
1108                 case V3D_QPU_A_LDVPMG_OUT: {
1109                         /* LDVPMs only store to temps (the MA flag
1110                          * decides whether the LDVPM is in or out)
1111                          */
1112                         assert(inst->dst.file == QFILE_TEMP);
1113                         set_temp_class_bits(c, inst->dst.index,
1114                                             CLASS_BITS_PHYS);
1115                         break;
1116                 }
1117 
1118                 case V3D_QPU_A_RECIP:
1119                 case V3D_QPU_A_RSQRT:
1120                 case V3D_QPU_A_EXP:
1121                 case V3D_QPU_A_LOG:
1122                 case V3D_QPU_A_SIN:
1123                 case V3D_QPU_A_RSQRT2: {
1124                         /* The SFU instructions write directly to the
1125                          * phys regfile.
1126                          */
1127                         assert(inst->dst.file == QFILE_TEMP);
1128                         set_temp_class_bits(c, inst->dst.index,
1129                                             CLASS_BITS_PHYS);
1130                         break;
1131                 }
1132 
1133                 default:
1134                         break;
1135                 }
1136         }
1137 
1138         if (inst->src[0].file == QFILE_REG) {
1139                 switch (inst->src[0].index) {
1140                 case 0:
1141                         /* V3D 7.x doesn't use rf0 for thread payload */
1142                         if (c->devinfo->ver >= 71)
1143                                 break;
1144                         else
1145                                 FALLTHROUGH;
1146                 case 1:
1147                 case 2:
1148                 case 3: {
1149                         /* Payload setup instructions: Force allocate
1150                          * the dst to the given register (so the MOV
1151                          * will disappear).
1152                          */
1153                         assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
1154                         assert(inst->dst.file == QFILE_TEMP);
1155                         uint32_t node = temp_to_node(c, inst->dst.index);
1156                         ra_set_node_reg(c->g, node,
1157                                         get_phys_index(c->devinfo) +
1158                                         inst->src[0].index);
1159                         break;
1160                 }
1161                 }
1162         }
1163 
1164         /* Don't allocate rf0 to temps that cross ranges where we have
1165          * live implicit rf0 writes from ldvary. We can identify these
1166          * by tracking the last ldvary instruction and explicit reads
1167          * of rf0.
1168          */
1169         if (c->devinfo->ver >= 71 &&
1170             ((inst->src[0].file == QFILE_REG && inst->src[0].index == 0) ||
1171               (vir_get_nsrc(inst) > 1 &&
1172                inst->src[1].file == QFILE_REG && inst->src[1].index == 0))) {
1173                 for (int i = 0; i < c->num_temps; i++) {
1174                         if (c->temp_start[i] < ip &&
1175                             c->temp_end[i] > last_ldvary_ip) {
1176                                         ra_add_node_interference(c->g,
1177                                                                  temp_to_node(c, i),
1178                                                                  implicit_rf_nodes[0]);
1179                         }
1180                 }
1181         }
1182 
1183         if (inst->dst.file == QFILE_TEMP) {
1184                 /* Only a ldunif gets to write to R5, which only has a
1185                  * single 32-bit channel of storage.
1186                  *
1187                  * NOTE: ldunifa is subject to the same, however, going by
1188                  * shader-db it is best to keep r5 exclusive to ldunif, probably
1189                  * because ldunif usually has a shorter lifespan, allowing for
1190                  * more accumulator reuse and QPU merges.
1191                  */
                if (c->devinfo->has_accumulators) {
                        if (!inst->qpu.sig.ldunif) {
                                uint8_t class_bits =
                                        get_temp_class_bits(c, inst->dst.index) &
                                        ~CLASS_BITS_R5;
                                set_temp_class_bits(c, inst->dst.index,
                                                    class_bits);
                        }
                } else {
                        /* Make sure we don't allocate the ldvary's
                         * destination to rf0, since it would clash
                         * with its implicit write to that register.
                         */
                        if (inst->qpu.sig.ldvary) {
                                ra_add_node_interference(c->g,
                                                         temp_to_node(c, inst->dst.index),
                                                         implicit_rf_nodes[0]);
                        }
                        /* Flag dst temps from ldunif(a) instructions
                         * so we can try to assign rf0 to them and avoid
                         * converting these to ldunif(a)rf.
                         */
                        if (inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) {
                                const uint32_t dst_n =
                                        temp_to_node(c, inst->dst.index);
                                c->nodes.info[dst_n].is_ldunif_dst = true;
                        }
                }
        }

        /* All accumulators are invalidated across a thread switch. */
        if (inst->qpu.sig.thrsw && c->devinfo->has_accumulators) {
                for (int i = 0; i < c->num_temps; i++) {
                        if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
                                set_temp_class_bits(c, i,
                                                    CLASS_BITS_PHYS);
                        }
                }
        }
}

static void
flag_program_end_nodes(struct v3d_compile *c)
{
        /* Only look for registers used in this many instructions */
        uint32_t last_set_count = 6;

        struct qblock *last_block = vir_exit_block(c);
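        /* Walk the exit block backwards, flagging every temp referenced by
         * the trailing instructions.
         */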
        list_for_each_entry_rev(struct qinst, inst, &last_block->instructions, link) {
                if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
                        continue;

                int num_src = v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
                for (int i = 0; i < num_src; i++) {
                        if (inst->src[i].file == QFILE_TEMP) {
                                int node = temp_to_node(c, inst->src[i].index);
                                c->nodes.info[node].is_program_end = true;
                        }
                }

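                /* Do the same for the sources of the mul ALU op. */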
                num_src = v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
                for (int i = 0; i < num_src; i++) {
                        if (inst->src[i].file == QFILE_TEMP) {
                                int node = temp_to_node(c, inst->src[i].index);
                                c->nodes.info[node].is_program_end = true;
                        }
                }

                if (inst->dst.file == QFILE_TEMP) {
                        int node = temp_to_node(c, inst->dst.index);
                        c->nodes.info[node].is_program_end = true;
                }

                if (--last_set_count == 0)
                        break;
        }
}

/**
 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
 *
 * The return value should be freed by the caller.
 */
struct qpu_reg *
v3d_register_allocate(struct v3d_compile *c)
{
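        /* RA nodes reserved for the fixed accumulator / implicit-RF
         * registers; temps are made to interfere with these whenever those
         * registers are implicitly written.
         */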
        int acc_nodes[ACC_COUNT];
        int implicit_rf_nodes[IMPLICIT_RF_COUNT];

        unsigned num_ra_nodes = c->num_temps;
        if (c->devinfo->has_accumulators)
                num_ra_nodes += ARRAY_SIZE(acc_nodes);
        else
                num_ra_nodes += ARRAY_SIZE(implicit_rf_nodes);

        c->nodes = (struct v3d_ra_node_info) {
                .alloc_count = c->num_temps,
                .info = ralloc_array_size(c, sizeof(c->nodes.info[0]),
                                          num_ra_nodes),
        };

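        /* First RA register index that maps to the register file; the
         * accumulators, when present, occupy the indices below it.
         */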
        uint32_t phys_index = get_phys_index(c->devinfo);

        struct v3d_ra_select_callback_data callback_data = {
                .phys_index = phys_index,
                .next_acc = 0,
                /* Start at RF3, to try to keep the TLB writes from using
                 * RF0-2. Start at RF4 in 7.x to prevent TLB writes from
                 * using RF2-3.
                 */
                .next_phys = c->devinfo->ver == 42 ? 3 : 4,
                .nodes = &c->nodes,
                .devinfo = c->devinfo,
        };

        vir_calculate_live_intervals(c);

        /* Convert 1, 2, 4 threads to 0, 1, 2 index.
         *
         * V3D 4.x has double the physical register space, so 64 physical regs
         * are available at both 1x and 2x threading, and 4x has 32.
         */
        c->thread_index = ffs(c->threads) - 1;
        if (c->thread_index >= 1)
                c->thread_index--;

        c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes);
        ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data);

        /* Make some fixed nodes for the accumulators, which we will need to
         * interfere with when ops have implied r3/r4 writes or for the thread
         * switches.  We could represent these as classes for the nodes to
         * live in, but the classes take up a lot of memory to set up, so we
         * don't want to make too many. We use the same mechanism on platforms
         * without accumulators that can have implicit writes to phys regs.
         */
        for (uint32_t i = 0; i < num_ra_nodes; i++) {
                c->nodes.info[i].is_ldunif_dst = false;
                c->nodes.info[i].is_program_end = false;
                c->nodes.info[i].unused = false;
                c->nodes.info[i].priority = 0;
                c->nodes.info[i].class_bits = 0;
                if (c->devinfo->has_accumulators && i < ACC_COUNT) {
                        acc_nodes[i] = i;
                        ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
                } else if (!c->devinfo->has_accumulators &&
                           i < ARRAY_SIZE(implicit_rf_nodes)) {
                        implicit_rf_nodes[i] = i;
                        ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i);
                } else {
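                        /* Regular temp node: prioritize temps with longer
                         * live ranges and start out allowing any register
                         * class; classes are narrowed as we walk the
                         * instructions below.
                         */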
                        uint32_t t = node_to_temp(c, i);
                        c->nodes.info[i].priority =
                                c->temp_end[t] - c->temp_start[t];
                        c->nodes.info[i].class_bits =
                                get_class_bit_any(c->devinfo);
                }
        }

        /* Walk the instructions adding register class restrictions and
         * interferences.
         */
        int ip = 0;
        int last_ldvary_ip = -1;
        vir_for_each_inst_inorder(inst, c) {
                inst->ip = ip++;

                /* ldunif(a) always writes to a temporary, so we have
                 * liveness info available to decide whether rf0 is
                 * available for it. ldvary is different: it always
                 * writes to rf0 directly, so we don't have liveness
                 * information for its implicit rf0 write.
                 *
                 * That means the allocator may assign rf0 to a temp
                 * that is defined while an implicit rf0 write from
                 * ldvary is still live. We fix that by manually
                 * tracking rf0 live ranges from ldvary instructions.
                 */
                if (inst->qpu.sig.ldvary)
                        last_ldvary_ip = ip;

                update_graph_and_reg_classes_for_inst(c, acc_nodes,
                                                      implicit_rf_nodes,
                                                      last_ldvary_ip, inst);
        }

        /* Flag the nodes that are used in the last instructions of the program
         * (there are some registers that cannot be used in the last 3
         * instructions). We only do this for fragment shaders: by avoiding the
         * conflict we may be able to emit the last thread switch earlier in
         * some cases. In non-fragment shaders this won't happen, because the
         * last instructions are always VPM stores with a small immediate,
         * which conflicts with other signals and prevents us from ever moving
         * the thrsw earlier.
         */
        if (c->s->info.stage == MESA_SHADER_FRAGMENT)
                flag_program_end_nodes(c);

        /* Set the register classes for all our temporaries in the graph */
        for (uint32_t i = 0; i < c->num_temps; i++) {
                ra_set_node_class(c->g, temp_to_node(c, i),
                                  choose_reg_class_for_temp(c, i));
        }

        /* Add register interferences based on liveness data */
        for (uint32_t i = 0; i < c->num_temps; i++) {
                /* And while we are here, let's also flag nodes for
                 * unused temps.
                 */
                if (c->temp_start[i] > c->temp_end[i])
                        c->nodes.info[temp_to_node(c, i)].unused = true;

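                /* Any two temps whose live ranges overlap cannot share a
                 * register.
                 */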
                for (uint32_t j = i + 1; j < c->num_temps; j++) {
                        if (interferes(c->temp_start[i], c->temp_end[i],
                                       c->temp_start[j], c->temp_end[j])) {
                                ra_add_node_interference(c->g,
                                                         temp_to_node(c, i),
                                                         temp_to_node(c, j));
                        }
                }
        }

        /* Debug option to force a bit of TMU spilling, for running
         * across conformance tests to make sure that spilling works.
         */
        const int force_register_spills = 0;
        if (force_register_spills > 0)
                c->max_tmu_spills = UINT32_MAX;

        struct qpu_reg *temp_registers = NULL;
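        /* Main allocation loop: try to color the interference graph and, if
         * that fails, pick a temp to spill and retry, until we either
         * succeed or run out of spilling options.
         */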
        while (true) {
                if (c->spill_size <
                    V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
                        int node = v3d_choose_spill_node(c);
                        if (node != -1) {
                                uint32_t temp = node_to_temp(c, node);
                                v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
                                continue;
                        }
                }

                if (ra_allocate(c->g))
                        break;

                /* Failed allocation, try to spill */
                int node = v3d_choose_spill_node(c);
                if (node == -1)
                        goto spill_fail;

                uint32_t temp = node_to_temp(c, node);
                enum temp_spill_type spill_type =
                        get_spill_type_for_temp(c, temp);
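                /* Spills that don't go through the TMU are always allowed;
                 * TMU spills are only attempted while within the TMU spill
                 * budget.
                 */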
                if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) {
                        v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
                        if (c->spills + c->fills > c->max_tmu_spills)
                                goto spill_fail;
                } else {
                        goto spill_fail;
                }
        }

        /* Allocation was successful, build the 'temp -> reg' map */
        temp_registers = calloc(c->num_temps, sizeof(*temp_registers));
        for (uint32_t i = 0; i < c->num_temps; i++) {
                int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i));
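                /* RA registers below phys_index are the accumulators, which
                 * are written through their magic waddrs; everything else
                 * maps to the register file.
                 */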
                if (ra_reg < phys_index) {
                        temp_registers[i].magic = true;
                        temp_registers[i].index = (V3D_QPU_WADDR_R0 +
                                                   ra_reg - ACC_INDEX);
                } else {
                        temp_registers[i].magic = false;
                        temp_registers[i].index = ra_reg - phys_index;
                }
        }

spill_fail:
        ralloc_free(c->nodes.info);
        c->nodes.info = NULL;
        c->nodes.alloc_count = 0;
        ralloc_free(c->g);
        c->g = NULL;
        return temp_registers;
}