1 /*
2  * Copyright © 2016 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <inttypes.h>
25 #include "util/format/u_format.h"
26 #include "util/u_helpers.h"
27 #include "util/u_math.h"
28 #include "util/u_memory.h"
29 #include "util/ralloc.h"
30 #include "util/hash_table.h"
31 #include "compiler/nir/nir.h"
32 #include "compiler/nir/nir_builder.h"
33 #include "common/v3d_device_info.h"
34 #include "v3d_compiler.h"
35 
36 /* We don't do any address packing. */
37 #define __gen_user_data void
38 #define __gen_address_type uint32_t
39 #define __gen_address_offset(reloc) (*reloc)
40 #define __gen_emit_reloc(cl, reloc)
41 #include "cle/v3d_packet_v42_pack.h"
42 
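/* Fields OR'd into the TMU general lookup config word below: bit 7 selects
 * per-quad vs. per-pixel lookups and the low bits select the return data
 * type (as read from the macro values; not a full description of the packet).
 */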
43 #define GENERAL_TMU_LOOKUP_PER_QUAD                 (0 << 7)
44 #define GENERAL_TMU_LOOKUP_PER_PIXEL                (1 << 7)
45 #define GENERAL_TMU_LOOKUP_TYPE_8BIT_I              (0 << 0)
46 #define GENERAL_TMU_LOOKUP_TYPE_16BIT_I             (1 << 0)
47 #define GENERAL_TMU_LOOKUP_TYPE_VEC2                (2 << 0)
48 #define GENERAL_TMU_LOOKUP_TYPE_VEC3                (3 << 0)
49 #define GENERAL_TMU_LOOKUP_TYPE_VEC4                (4 << 0)
50 #define GENERAL_TMU_LOOKUP_TYPE_8BIT_UI             (5 << 0)
51 #define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI            (6 << 0)
52 #define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI            (7 << 0)
53 
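/* Operation codes for the TSY (synchronization) unit; barrier-style
 * synchronization is presumably built from these elsewhere in this file.
 */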
54 #define V3D_TSY_SET_QUORUM          0
55 #define V3D_TSY_INC_WAITERS         1
56 #define V3D_TSY_DEC_WAITERS         2
57 #define V3D_TSY_INC_QUORUM          3
58 #define V3D_TSY_DEC_QUORUM          4
59 #define V3D_TSY_FREE_ALL            5
60 #define V3D_TSY_RELEASE             6
61 #define V3D_TSY_ACQUIRE             7
62 #define V3D_TSY_WAIT                8
63 #define V3D_TSY_WAIT_INC            9
64 #define V3D_TSY_WAIT_CHECK          10
65 #define V3D_TSY_WAIT_INC_CHECK      11
66 #define V3D_TSY_WAIT_CV             12
67 #define V3D_TSY_INC_SEMAPHORE       13
68 #define V3D_TSY_DEC_SEMAPHORE       14
69 #define V3D_TSY_SET_QUORUM_FREE_ALL 15
70 
71 enum v3d_tmu_op_type
72 {
73         V3D_TMU_OP_TYPE_REGULAR,
74         V3D_TMU_OP_TYPE_ATOMIC,
75         V3D_TMU_OP_TYPE_CACHE
76 };
77 
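/* Several TMU op encodings are overloaded: the same value selects an atomic
 * operation when writing and a cache/prefetch operation when reading, which
 * is why the classification below depends on is_write.
 */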
78 static enum v3d_tmu_op_type
79 v3d_tmu_get_type_from_op(uint32_t tmu_op, bool is_write)
80 {
81         switch(tmu_op) {
82         case V3D_TMU_OP_WRITE_ADD_READ_PREFETCH:
83         case V3D_TMU_OP_WRITE_SUB_READ_CLEAR:
84         case V3D_TMU_OP_WRITE_XCHG_READ_FLUSH:
85         case V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH:
86         case V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR:
87                 return is_write ? V3D_TMU_OP_TYPE_ATOMIC : V3D_TMU_OP_TYPE_CACHE;
88         case V3D_TMU_OP_WRITE_UMAX:
89         case V3D_TMU_OP_WRITE_SMIN:
90         case V3D_TMU_OP_WRITE_SMAX:
91                 assert(is_write);
92                 FALLTHROUGH;
93         case V3D_TMU_OP_WRITE_AND_READ_INC:
94         case V3D_TMU_OP_WRITE_OR_READ_DEC:
95         case V3D_TMU_OP_WRITE_XOR_READ_NOT:
96                 return V3D_TMU_OP_TYPE_ATOMIC;
97         case V3D_TMU_OP_REGULAR:
98                 return V3D_TMU_OP_TYPE_REGULAR;
99 
100         default:
101                 unreachable("Unknown tmu_op\n");
102         }
103 }
104 static void
105 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
106 
107 static void
108 resize_qreg_array(struct v3d_compile *c,
109                   struct qreg **regs,
110                   uint32_t *size,
111                   uint32_t decl_size)
112 {
113         if (*size >= decl_size)
114                 return;
115 
116         uint32_t old_size = *size;
117         *size = MAX2(*size * 2, decl_size);
118         *regs = reralloc(c, *regs, struct qreg, *size);
119         if (!*regs) {
120                 fprintf(stderr, "Malloc failure\n");
121                 abort();
122         }
123 
124         for (uint32_t i = old_size; i < *size; i++)
125                 (*regs)[i] = c->undef;
126 }
127 
128 static void
129 resize_interp_array(struct v3d_compile *c,
130                     struct v3d_interp_input **regs,
131                     uint32_t *size,
132                     uint32_t decl_size)
133 {
134         if (*size >= decl_size)
135                 return;
136 
137         uint32_t old_size = *size;
138         *size = MAX2(*size * 2, decl_size);
139         *regs = reralloc(c, *regs, struct v3d_interp_input, *size);
140         if (!*regs) {
141                 fprintf(stderr, "Malloc failure\n");
142                 abort();
143         }
144 
145         for (uint32_t i = old_size; i < *size; i++) {
146                 (*regs)[i].vp = c->undef;
147                 (*regs)[i].C = c->undef;
148         }
149 }
150 
151 void
152 vir_emit_thrsw(struct v3d_compile *c)
153 {
154         if (c->threads == 1)
155                 return;
156 
157         /* Always thread switch after each texture operation for now.
158          *
159          * We could do better by batching a bunch of texture fetches up and
160          * then doing one thread switch and collecting all their results
161          * afterward.
162          */
163         c->last_thrsw = vir_NOP(c);
164         c->last_thrsw->qpu.sig.thrsw = true;
165         c->last_thrsw_at_top_level = !c->in_control_flow;
166 
167         /* We need to lock the scoreboard before any tlb access happens. If this
168          * thread switch comes after we have emitted a tlb load, then it means
169          * that we can't lock on the last thread switch any more.
170          */
171         if (c->emitted_tlb_load)
172                 c->lock_scoreboard_on_first_thrsw = true;
173 }
174 
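/* Atomic adds of a constant +1/-1 can be replaced with the TMU's dedicated
 * increment/decrement ops, which take no data (TMUD) write since the amount
 * is implicit; see the atomic_add_replaced handling in ntq_emit_tmu_general().
 */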
175 uint32_t
176 v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
177 {
178         if (nir_src_is_const(instr->src[src])) {
179                 int64_t add_val = nir_src_as_int(instr->src[src]);
180                 if (add_val == 1)
181                         return V3D_TMU_OP_WRITE_AND_READ_INC;
182                 else if (add_val == -1)
183                         return V3D_TMU_OP_WRITE_OR_READ_DEC;
184         }
185 
186         return V3D_TMU_OP_WRITE_ADD_READ_PREFETCH;
187 }
188 
189 static uint32_t
190 v3d_general_tmu_op_for_atomic(nir_intrinsic_instr *instr)
191 {
192         nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr);
193         switch (atomic_op) {
194         case nir_atomic_op_iadd:
195                 return  instr->intrinsic == nir_intrinsic_ssbo_atomic ?
196                         v3d_get_op_for_atomic_add(instr, 2) :
197                         v3d_get_op_for_atomic_add(instr, 1);
198         case nir_atomic_op_imin:    return V3D_TMU_OP_WRITE_SMIN;
199         case nir_atomic_op_umin:    return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
200         case nir_atomic_op_imax:    return V3D_TMU_OP_WRITE_SMAX;
201         case nir_atomic_op_umax:    return V3D_TMU_OP_WRITE_UMAX;
202         case nir_atomic_op_iand:    return V3D_TMU_OP_WRITE_AND_READ_INC;
203         case nir_atomic_op_ior:     return V3D_TMU_OP_WRITE_OR_READ_DEC;
204         case nir_atomic_op_ixor:    return V3D_TMU_OP_WRITE_XOR_READ_NOT;
205         case nir_atomic_op_xchg:    return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
206         case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
207         default:                    unreachable("unknown atomic op");
208         }
209 }
210 
211 static uint32_t
212 v3d_general_tmu_op(nir_intrinsic_instr *instr)
213 {
214         switch (instr->intrinsic) {
215         case nir_intrinsic_load_ssbo:
216         case nir_intrinsic_load_ubo:
217         case nir_intrinsic_load_uniform:
218         case nir_intrinsic_load_shared:
219         case nir_intrinsic_load_scratch:
220         case nir_intrinsic_load_global:
221         case nir_intrinsic_load_global_constant:
222         case nir_intrinsic_store_ssbo:
223         case nir_intrinsic_store_shared:
224         case nir_intrinsic_store_scratch:
225         case nir_intrinsic_store_global:
226                 return V3D_TMU_OP_REGULAR;
227 
228         case nir_intrinsic_ssbo_atomic:
229         case nir_intrinsic_ssbo_atomic_swap:
230         case nir_intrinsic_shared_atomic:
231         case nir_intrinsic_shared_atomic_swap:
232         case nir_intrinsic_global_atomic:
233         case nir_intrinsic_global_atomic_swap:
234                 return v3d_general_tmu_op_for_atomic(instr);
235 
236         default:
237                 unreachable("unknown intrinsic op");
238         }
239 }
240 
241 /**
242  * Checks if pipelining a new TMU operation requiring 'components' LDTMUs
243  * would overflow the Output TMU fifo.
244  *
245  * Overflowing the Output fifo is not allowed; the Input and Config fifos,
246  * however, can be overflowed. Doing that makes the shader stall, but only
247  * for as long as it needs to be able to continue, so for pipelining it is
248  * better to let the QPU stall on these if needed than to try to emit TMU
249  * flushes in the driver.
250  */
251 bool
252 ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components)
253 {
254         if (c->tmu.flush_count >= MAX_TMU_QUEUE_SIZE)
255                 return true;
256 
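        /* The output fifo has 16 entries shared among all threads, so, for
         * example, a 4-way threaded shader can keep at most 16 / 4 = 4
         * pending output components per thread.
         */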
257         return components > 0 &&
258                c->tmu.output_fifo_size + components > 16 / c->threads;
259 }
260 
261 /**
262  * Emits the thread switch and LDTMU/TMUWT for all outstanding TMU operations,
263  * popping all TMU fifo entries.
264  */
265 void
266 ntq_flush_tmu(struct v3d_compile *c)
267 {
268         if (c->tmu.flush_count == 0)
269                 return;
270 
271         vir_emit_thrsw(c);
272 
273         bool emitted_tmuwt = false;
274         for (int i = 0; i < c->tmu.flush_count; i++) {
275                 if (c->tmu.flush[i].component_mask > 0) {
276                         nir_def *def = c->tmu.flush[i].def;
277                         assert(def);
278 
279                         for (int j = 0; j < 4; j++) {
280                                 if (c->tmu.flush[i].component_mask & (1 << j)) {
281                                         ntq_store_def(c, def, j,
282                                                       vir_MOV(c, vir_LDTMU(c)));
283                                 }
284                         }
285                 } else if (!emitted_tmuwt) {
286                         vir_TMUWT(c);
287                         emitted_tmuwt = true;
288                 }
289         }
290 
291         c->tmu.output_fifo_size = 0;
292         c->tmu.flush_count = 0;
293         _mesa_set_clear(c->tmu.outstanding_regs, NULL);
294 }
295 
296 /**
297  * Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. The caller
298  * is responsible for ensuring that doing this doesn't overflow the TMU fifos,
299  * and more specifically, the output fifo, since that can't stall.
300  */
301 void
302 ntq_add_pending_tmu_flush(struct v3d_compile *c,
303                           nir_def *def,
304                           uint32_t component_mask)
305 {
306         const uint32_t num_components = util_bitcount(component_mask);
307         assert(!ntq_tmu_fifo_overflow(c, num_components));
308 
309         if (num_components > 0) {
310                 c->tmu.output_fifo_size += num_components;
311 
312                 nir_intrinsic_instr *store = nir_store_reg_for_def(def);
313                 if (store != NULL) {
314                         nir_def *reg = store->src[1].ssa;
315                         _mesa_set_add(c->tmu.outstanding_regs, reg);
316                 }
317         }
318 
319         c->tmu.flush[c->tmu.flush_count].def = def;
320         c->tmu.flush[c->tmu.flush_count].component_mask = component_mask;
321         c->tmu.flush_count++;
322         c->tmu.total_count++;
323 
324         if (c->disable_tmu_pipelining)
325                 ntq_flush_tmu(c);
326         else if (c->tmu.flush_count > 1)
327                 c->pipelined_any_tmu = true;
328 }
329 
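/* TMU general accesses are emitted in two passes over the same code (see
 * ntq_emit_tmu_general()): MODE_COUNT only counts the required register
 * writes and flushes any outstanding TMU operations we depend on, and
 * MODE_EMIT then emits the actual writes.
 */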
330 enum emit_mode {
331     MODE_COUNT = 0,
332     MODE_EMIT,
333     MODE_LAST,
334 };
335 
336 /**
337  * For a TMU general store instruction:
338  *
339  * In MODE_COUNT mode, records the number of TMU writes required and flushes
340  * any outstanding TMU operations the instruction depends on, but it doesn't
341  * emit any actual register writes.
342  *
343  * In MODE_EMIT mode, emits the data register writes required by the
344  * instruction.
345  */
346 static void
347 emit_tmu_general_store_writes(struct v3d_compile *c,
348                               enum emit_mode mode,
349                               nir_intrinsic_instr *instr,
350                               uint32_t base_const_offset,
351                               uint32_t *writemask,
352                               uint32_t *const_offset,
353                               uint32_t *type_size,
354                               uint32_t *tmu_writes)
355 {
356         struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
357 
358         /* Find the first set of consecutive components that
359          * are enabled in the writemask and emit the TMUD
360          * instructions for them.
361          */
362         assert(*writemask != 0);
363         uint32_t first_component = ffs(*writemask) - 1;
364         uint32_t last_component = first_component;
365         while (*writemask & BITFIELD_BIT(last_component + 1))
366                 last_component++;
367 
368         assert(first_component <= last_component &&
369                last_component < instr->num_components);
370 
371         for (int i = first_component; i <= last_component; i++) {
372                 struct qreg data = ntq_get_src(c, instr->src[0], i);
373                 if (mode == MODE_COUNT)
374                         (*tmu_writes)++;
375                 else
376                         vir_MOV_dest(c, tmud, data);
377         }
378 
379         if (mode == MODE_EMIT) {
380                 /* Update the offset for the TMU write based on the
381                  * first component we are writing.
382                  */
383                 *type_size = nir_src_bit_size(instr->src[0]) / 8;
384                 *const_offset =
385                         base_const_offset + first_component * (*type_size);
386 
387                 /* Clear these components from the writemask */
388                 uint32_t written_mask =
389                         BITFIELD_RANGE(first_component, *tmu_writes);
390                 (*writemask) &= ~written_mask;
391         }
392 }
393 
394 /**
395  * For a TMU general atomic instruction:
396  *
397  * In MODE_COUNT mode, records the number of TMU writes required and flushes
398  * any outstanding TMU operations the instruction depends on, but it doesn't
399  * emit any actual register writes.
400  *
401  * In MODE_EMIT mode, emits the data register writes required by the
402  * instruction.
403  */
404 static void
405 emit_tmu_general_atomic_writes(struct v3d_compile *c,
406                                enum emit_mode mode,
407                                nir_intrinsic_instr *instr,
408                                uint32_t tmu_op,
409                                bool has_index,
410                                uint32_t *tmu_writes)
411 {
412         struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
413 
414         struct qreg data = ntq_get_src(c, instr->src[1 + has_index], 0);
415         if (mode == MODE_COUNT)
416                 (*tmu_writes)++;
417         else
418                 vir_MOV_dest(c, tmud, data);
419 
420         if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
421                 data = ntq_get_src(c, instr->src[2 + has_index], 0);
422                 if (mode == MODE_COUNT)
423                         (*tmu_writes)++;
424                 else
425                         vir_MOV_dest(c, tmud, data);
426         }
427 }
428 
429 /**
430  * For any TMU general instruction:
431  *
432  * In MODE_COUNT mode, records the number of TMU writes required to emit the
433  * address parameter and flushes any outstanding TMU operations the instruction
434  * depends on, but it doesn't emit any actual register writes.
435  *
436  * In MODE_EMIT mode, emits register writes required to emit the address.
437  */
438 static void
439 emit_tmu_general_address_write(struct v3d_compile *c,
440                                enum emit_mode mode,
441                                nir_intrinsic_instr *instr,
442                                uint32_t config,
443                                bool dynamic_src,
444                                int offset_src,
445                                struct qreg base_offset,
446                                uint32_t const_offset,
447                                uint32_t dest_components,
448                                uint32_t *tmu_writes)
449 {
450         if (mode == MODE_COUNT) {
451                 (*tmu_writes)++;
452                 if (dynamic_src)
453                         ntq_get_src(c, instr->src[offset_src], 0);
454                 return;
455         }
456 
457         if (vir_in_nonuniform_control_flow(c)) {
458                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
459                            V3D_QPU_PF_PUSHZ);
460         }
461 
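        /* A config of ~0 means the default lookup config, so the address is
         * written to TMUA; otherwise we write to TMUAU and attach the
         * per-lookup config word as a QUNIFORM_CONSTANT uniform below.
         */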
462         struct qreg tmua;
463         if (config == ~0)
464                 tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
465         else
466                 tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
467 
468         struct qinst *tmu;
469         if (dynamic_src) {
470                 struct qreg offset = base_offset;
471                 if (const_offset != 0) {
472                         offset = vir_ADD(c, offset,
473                                          vir_uniform_ui(c, const_offset));
474                 }
475                 struct qreg data = ntq_get_src(c, instr->src[offset_src], 0);
476                 tmu = vir_ADD_dest(c, tmua, offset, data);
477         } else {
478                 if (const_offset != 0) {
479                         tmu = vir_ADD_dest(c, tmua, base_offset,
480                                            vir_uniform_ui(c, const_offset));
481                 } else {
482                         tmu = vir_MOV_dest(c, tmua, base_offset);
483                 }
484         }
485 
486         if (config != ~0) {
487                 tmu->uniform =
488                         vir_get_uniform_index(c, QUNIFORM_CONSTANT, config);
489         }
490 
491         if (vir_in_nonuniform_control_flow(c))
492                 vir_set_cond(tmu, V3D_QPU_COND_IFA);
493 
494         tmu->ldtmu_count = dest_components;
495 }
496 
497 /**
498  * Implements indirect uniform loads and SSBO accesses through the TMU general
499  * memory access interface.
500  */
501 static void
502 ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
503                      bool is_shared_or_scratch, bool is_global)
504 {
505         uint32_t tmu_op = v3d_general_tmu_op(instr);
506 
507         /* If we were able to replace an atomic_add with an inc/dec, then we
508          * need to do things slightly differently, such as not loading the
509          * amount to add/sub, since that is implicit.
510          */
511         bool atomic_add_replaced =
512                 (instr->intrinsic == nir_intrinsic_ssbo_atomic ||
513                  instr->intrinsic == nir_intrinsic_shared_atomic ||
514                  instr->intrinsic == nir_intrinsic_global_atomic) &&
515                 nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd &&
516                  (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
517                   tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC);
518 
519         bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
520                          instr->intrinsic == nir_intrinsic_store_scratch ||
521                          instr->intrinsic == nir_intrinsic_store_shared ||
522                          instr->intrinsic == nir_intrinsic_store_global);
523 
524         bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
525                         instr->intrinsic == nir_intrinsic_load_ubo ||
526                         instr->intrinsic == nir_intrinsic_load_ssbo ||
527                         instr->intrinsic == nir_intrinsic_load_scratch ||
528                         instr->intrinsic == nir_intrinsic_load_shared ||
529                         instr->intrinsic == nir_intrinsic_load_global ||
530                         instr->intrinsic == nir_intrinsic_load_global_constant);
531 
532         if (!is_load)
533                 c->tmu_dirty_rcl = true;
534 
535         if (is_global)
536                 c->has_global_address = true;
537 
538         bool has_index = !is_shared_or_scratch && !is_global;
539 
540         int offset_src;
541         if (instr->intrinsic == nir_intrinsic_load_uniform) {
542                 offset_src = 0;
543         } else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
544                    instr->intrinsic == nir_intrinsic_load_ubo ||
545                    instr->intrinsic == nir_intrinsic_load_scratch ||
546                    instr->intrinsic == nir_intrinsic_load_shared ||
547                    instr->intrinsic == nir_intrinsic_load_global ||
548                    instr->intrinsic == nir_intrinsic_load_global_constant ||
549                    atomic_add_replaced) {
550                 offset_src = 0 + has_index;
551         } else if (is_store) {
552                 offset_src = 1 + has_index;
553         } else {
554                 offset_src = 0 + has_index;
555         }
556 
557         bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);
558         uint32_t const_offset = 0;
559         if (!dynamic_src)
560                 const_offset = nir_src_as_uint(instr->src[offset_src]);
561 
562         struct qreg base_offset;
563         if (instr->intrinsic == nir_intrinsic_load_uniform) {
564                 const_offset += nir_intrinsic_base(instr);
565                 base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
566                                           v3d_unit_data_create(0, const_offset));
567                 const_offset = 0;
568         } else if (instr->intrinsic == nir_intrinsic_load_ubo) {
569                 /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 (0
570                  * is gallium's constant buffer 0 in GL and push constants
571                  * in Vulkan).
572                  */
573                 uint32_t index = nir_src_as_uint(instr->src[0]) + 1;
574                 base_offset =
575                         vir_uniform(c, QUNIFORM_UBO_ADDR,
576                                     v3d_unit_data_create(index, const_offset));
577                 const_offset = 0;
578         } else if (is_shared_or_scratch) {
579                 /* Shared and scratch variables have no buffer index, and all
580                  * start from a common base that we set up at the start of
581                  * dispatch.
582                  */
583                 if (instr->intrinsic == nir_intrinsic_load_scratch ||
584                     instr->intrinsic == nir_intrinsic_store_scratch) {
585                         base_offset = c->spill_base;
586                 } else {
587                         base_offset = c->cs_shared_offset;
588                         const_offset += nir_intrinsic_base(instr);
589                 }
590         } else if (is_global) {
591                 /* Global load/store intrinsics use global addresses, so the
592                  * offset is the target address and we don't need to add it
593                  * to a base offset.
594                  */
595                 base_offset = vir_uniform_ui(c, 0);
596         } else {
597                 uint32_t idx = is_store ? 1 : 0;
598                 base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
599                                           nir_src_comp_as_uint(instr->src[idx], 0));
600         }
601 
602         /* We are ready to emit TMU register writes now, but before we actually
603          * emit them we need to flush outstanding TMU operations if any of our
604          * writes reads from the result of an outstanding TMU operation before
605          * we start the TMU sequence for this operation, since otherwise the
606          * flush could happen in the middle of the TMU sequence we are about to
607          * emit, which is illegal. To do this we run this logic twice, the
608          * first time it will count required register writes and flush pending
609          * TMU requests if necessary due to a dependency, and the second one
610          * will emit the actual TMU writes.
611          */
612         const uint32_t dest_components = nir_intrinsic_dest_components(instr);
613         uint32_t base_const_offset = const_offset;
614         uint32_t writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
615         uint32_t tmu_writes = 0;
616         for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) {
617                 assert(mode == MODE_COUNT || tmu_writes > 0);
618 
619                 uint32_t type_size = 4;
620 
621                 if (is_store) {
622                         emit_tmu_general_store_writes(c, mode, instr,
623                                                       base_const_offset,
624                                                       &writemask,
625                                                       &const_offset,
626                                                       &type_size,
627                                                       &tmu_writes);
628                 } else if (!is_load && !atomic_add_replaced) {
629                         emit_tmu_general_atomic_writes(c, mode, instr,
630                                                        tmu_op, has_index,
631                                                        &tmu_writes);
632                 } else if (is_load) {
633                         type_size = instr->def.bit_size / 8;
634                 }
635 
636                 /* For atomics we use 32bit, except for CMPXCHG, which needs
637                  * to use VEC2. For the rest of the cases we use the number of
638                  * tmud writes we did to decide the type. For cache operations
639                  * the type is ignored.
640                  */
641                 uint32_t config = 0;
642                 if (mode == MODE_EMIT) {
643                         uint32_t num_components;
644                         if (is_load || atomic_add_replaced) {
645                                 num_components = instr->num_components;
646                         } else {
647                                 assert(tmu_writes > 0);
648                                 num_components = tmu_writes - 1;
649                         }
650                         bool is_atomic =
651                                 v3d_tmu_get_type_from_op(tmu_op, !is_load) ==
652                                 V3D_TMU_OP_TYPE_ATOMIC;
653 
654                         /* Only load per-quad if we can be certain that all
655                          * lines in the quad are active. Notice that demoted
656                          * invocations, unlike terminated ones, are still
657                          * active: we want to skip memory writes for them but
658                          * loads should still work.
659                          */
660                         uint32_t perquad =
661                                 is_load && !vir_in_nonuniform_control_flow(c) &&
662                                 ((c->s->info.stage == MESA_SHADER_FRAGMENT &&
663                                   c->s->info.fs.needs_quad_helper_invocations &&
664                                   !c->emitted_discard) ||
665                                  c->s->info.uses_wide_subgroup_intrinsics) ?
666                                 GENERAL_TMU_LOOKUP_PER_QUAD :
667                                 GENERAL_TMU_LOOKUP_PER_PIXEL;
668                         config = 0xffffff00 | tmu_op << 3 | perquad;
669 
670                         if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
671                                 config |= GENERAL_TMU_LOOKUP_TYPE_VEC2;
672                         } else if (is_atomic || num_components == 1) {
673                                 switch (type_size) {
674                                 case 4:
675                                         config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
676                                         break;
677                                 case 2:
678                                         config |= GENERAL_TMU_LOOKUP_TYPE_16BIT_UI;
679                                         break;
680                                 case 1:
681                                         config |= GENERAL_TMU_LOOKUP_TYPE_8BIT_UI;
682                                         break;
683                                 default:
684                                         unreachable("Unsupported bitsize");
685                                 }
686                         } else {
687                                 assert(type_size == 4);
688                                 config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
689                                           num_components - 2;
690                         }
691                 }
692 
693                 emit_tmu_general_address_write(c, mode, instr, config,
694                                                dynamic_src, offset_src,
695                                                base_offset, const_offset,
696                                                dest_components, &tmu_writes);
697 
698                 assert(tmu_writes > 0);
699                 if (mode == MODE_COUNT) {
700                         /* Make sure we won't exceed the 16-entry TMU
701                          * fifo if each thread is storing at the same
702                          * time.
703                          */
704                         while (tmu_writes > 16 / c->threads)
705                                 c->threads /= 2;
706 
707                         /* If pipelining this TMU operation would
708                          * overflow TMU fifos, we need to flush.
709                          */
710                         if (ntq_tmu_fifo_overflow(c, dest_components))
711                                 ntq_flush_tmu(c);
712                 } else {
713                         /* Delay emission of the thread switch and
714                          * LDTMU/TMUWT until we really need to do it to
715                          * improve pipelining.
716                          */
717                         const uint32_t component_mask =
718                                 (1 << dest_components) - 1;
719                         ntq_add_pending_tmu_flush(c, &instr->def,
720                                                   component_mask);
721                 }
722         }
723 
724         /* nir_lower_wrmasks should've ensured that any writemask on a store
725          * operation only has consecutive bits set, in which case we should've
726          * processed the full writemask above.
727          */
728         assert(writemask == 0);
729 }
730 
731 static struct qreg *
732 ntq_init_ssa_def(struct v3d_compile *c, nir_def *def)
733 {
734         struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
735                                           def->num_components);
736         _mesa_hash_table_insert(c->def_ht, def, qregs);
737         return qregs;
738 }
739 
740 static bool
741 is_ld_signal(const struct v3d_qpu_sig *sig)
742 {
743         return (sig->ldunif ||
744                 sig->ldunifa ||
745                 sig->ldunifrf ||
746                 sig->ldunifarf ||
747                 sig->ldtmu ||
748                 sig->ldvary ||
749                 sig->ldvpm ||
750                 sig->ldtlb ||
751                 sig->ldtlbu);
752 }
753 
754 static inline bool
755 is_ldunif_signal(const struct v3d_qpu_sig *sig)
756 {
757         return sig->ldunif || sig->ldunifrf;
758 }
759 
760 /**
761  * This function is responsible for getting VIR results into the associated
762  * storage for a NIR instruction.
763  *
764  * If it's a NIR SSA def, then we just set the associated hash table entry to
765  * the new result.
766  *
767  * If it's a NIR reg, then we need to update the existing qreg assigned to the
768  * NIR destination with the incoming value.  To do that without introducing
769  * new MOVs, we require that the incoming qreg either be a uniform, or be
770  * SSA-defined by the previous VIR instruction in the block and rewritable by
771  * this function.  That lets us sneak ahead and insert the SF flag beforehand
772  * (knowing that the previous instruction doesn't depend on flags) and rewrite
773  * its destination to be the NIR reg's destination
774  * its destination to be the NIR reg's destination. */
775 void
776 ntq_store_def(struct v3d_compile *c, nir_def *def, int chan,
777               struct qreg result)
778 {
779         struct qinst *last_inst = NULL;
780         if (!list_is_empty(&c->cur_block->instructions))
781                 last_inst = (struct qinst *)c->cur_block->instructions.prev;
782 
783         bool is_reused_uniform =
784                 is_ldunif_signal(&c->defs[result.index]->qpu.sig) &&
785                 last_inst != c->defs[result.index];
786 
787         assert(result.file == QFILE_TEMP && last_inst &&
788                (last_inst == c->defs[result.index] || is_reused_uniform));
789 
790         nir_intrinsic_instr *store = nir_store_reg_for_def(def);
791         if (store == NULL) {
792                 assert(chan < def->num_components);
793 
794                 struct qreg *qregs;
795                 struct hash_entry *entry =
796                         _mesa_hash_table_search(c->def_ht, def);
797 
798                 if (entry)
799                         qregs = entry->data;
800                 else
801                         qregs = ntq_init_ssa_def(c, def);
802 
803                 qregs[chan] = result;
804         } else {
805                 nir_def *reg = store->src[1].ssa;
806                 ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
807                 assert(nir_intrinsic_base(store) == 0);
808                 assert(nir_intrinsic_num_array_elems(decl) == 0);
809                 struct hash_entry *entry =
810                         _mesa_hash_table_search(c->def_ht, reg);
811                 struct qreg *qregs = entry->data;
812 
813                 /* If the previous instruction can't be predicated for
814                  * the store into the nir_register, then emit a MOV
815                  * that can be.
816                  */
817                 if (is_reused_uniform ||
818                     (vir_in_nonuniform_control_flow(c) &&
819                      is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig))) {
820                         result = vir_MOV(c, result);
821                         last_inst = c->defs[result.index];
822                 }
823 
824                 /* We know they're both temps, so just rewrite index. */
825                 c->defs[last_inst->dst.index] = NULL;
826                 last_inst->dst.index = qregs[chan].index;
827 
828                 /* If we're in control flow, then make this update of the reg
829                  * conditional on the execution mask.
830                  */
831                 if (vir_in_nonuniform_control_flow(c)) {
832                         last_inst->dst.index = qregs[chan].index;
833 
834                         /* Set the flags to the current exec mask.
835                          */
836                         c->cursor = vir_before_inst(last_inst);
837                         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
838                                    V3D_QPU_PF_PUSHZ);
839                         c->cursor = vir_after_inst(last_inst);
840 
841                         vir_set_cond(last_inst, V3D_QPU_COND_IFA);
842                 }
843         }
844 }
845 
846 /**
847  * This looks up the qreg associated with a particular ssa/reg used as a source
848  * in any instruction.
849  *
850  * It is expected that the definition for any NIR value read as a source has
851  * been emitted by a previous instruction. However, for TMU operations we may
852  * have postponed emission of the thread switch and LDTMUs required to read
853  * the TMU results until the results are actually used, to improve
854  * pipelining. That would lead to us not finding them here (for SSA defs), or
855  * finding them in the list of registers awaiting a TMU flush (for registers),
856  * meaning that we need to flush outstanding TMU operations to read the
857  * correct value.
858  */
859 struct qreg
860 ntq_get_src(struct v3d_compile *c, nir_src src, int i)
861 {
862         struct hash_entry *entry;
863 
864         nir_intrinsic_instr *load = nir_load_reg_for_def(src.ssa);
865         if (load == NULL) {
866                 assert(i < src.ssa->num_components);
867 
868                 entry = _mesa_hash_table_search(c->def_ht, src.ssa);
869                 if (!entry) {
870                         ntq_flush_tmu(c);
871                         entry = _mesa_hash_table_search(c->def_ht, src.ssa);
872                 }
873         } else {
874                 nir_def *reg = load->src[0].ssa;
875                 ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
876                 assert(nir_intrinsic_base(load) == 0);
877                 assert(nir_intrinsic_num_array_elems(decl) == 0);
878                 assert(i < nir_intrinsic_num_components(decl));
879 
880                 if (_mesa_set_search(c->tmu.outstanding_regs, reg))
881                         ntq_flush_tmu(c);
882                 entry = _mesa_hash_table_search(c->def_ht, reg);
883         }
884         assert(entry);
885 
886         struct qreg *qregs = entry->data;
887         return qregs[i];
888 }
889 
890 static struct qreg
891 ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
892                 unsigned src)
893 {
894         struct qreg r = ntq_get_src(c, instr->src[src].src,
895                                     instr->src[src].swizzle[0]);
896 
897         return r;
898 };
899 
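/* minify(size, level) = max(size >> level, 1), i.e. the mip level size used
 * for texture size queries.
 */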
900 static struct qreg
901 ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)
902 {
903         return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1));
904 }
905 
906 static void
907 ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
908 {
909         unsigned unit = instr->texture_index;
910         int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod);
911         int dest_size = nir_tex_instr_dest_size(instr);
912 
913         struct qreg lod = c->undef;
914         if (lod_index != -1)
915                 lod = ntq_get_src(c, instr->src[lod_index].src, 0);
916 
917         for (int i = 0; i < dest_size; i++) {
918                 assert(i < 3);
919                 enum quniform_contents contents;
920 
921                 if (instr->is_array && i == dest_size - 1)
922                         contents = QUNIFORM_TEXTURE_ARRAY_SIZE;
923                 else
924                         contents = QUNIFORM_TEXTURE_WIDTH + i;
925 
926                 struct qreg size = vir_uniform(c, contents, unit);
927 
928                 switch (instr->sampler_dim) {
929                 case GLSL_SAMPLER_DIM_1D:
930                 case GLSL_SAMPLER_DIM_2D:
931                 case GLSL_SAMPLER_DIM_MS:
932                 case GLSL_SAMPLER_DIM_3D:
933                 case GLSL_SAMPLER_DIM_CUBE:
934                 case GLSL_SAMPLER_DIM_BUF:
935                 case GLSL_SAMPLER_DIM_EXTERNAL:
936                         /* Don't minify the array size. */
937                         if (!(instr->is_array && i == dest_size - 1)) {
938                                 size = ntq_minify(c, size, lod);
939                         }
940                         break;
941 
942                 case GLSL_SAMPLER_DIM_RECT:
943                         /* There's no LOD field for rects */
944                         break;
945 
946                 default:
947                         unreachable("Bad sampler type");
948                 }
949 
950                 ntq_store_def(c, &instr->def, i, size);
951         }
952 }
953 
954 static void
955 ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
956 {
957         unsigned unit = instr->texture_index;
958 
959         /* Since each texture sampling op requires uploading uniforms to
960          * reference the texture, there's no HW support for texture size
961          * queries; instead you just upload uniforms containing the size.
962          */
963         switch (instr->op) {
964         case nir_texop_query_levels:
965                 ntq_store_def(c, &instr->def, 0,
966                               vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
967                 return;
968         case nir_texop_texture_samples:
969                 ntq_store_def(c, &instr->def, 0,
970                               vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit));
971                 return;
972         case nir_texop_txs:
973                 ntq_emit_txs(c, instr);
974                 return;
975         default:
976                 break;
977         }
978 
979         v3d_vir_emit_tex(c, instr);
980 }
981 
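/* sin/cos via the SFU SIN op which, judging by the scaling here, takes its
 * argument in units of pi: scale by 1/pi, subtract the rounded number of
 * half-turns, and flip the result's sign for an odd count since
 * sin(x + pi) = -sin(x). cos(x) is handled as sin(x + pi/2).
 */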
982 static struct qreg
983 ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)
984 {
985         struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI));
986         if (is_cos)
987                 input = vir_FADD(c, input, vir_uniform_f(c, 0.5));
988 
989         struct qreg periods = vir_FROUND(c, input);
990         struct qreg sin_output = vir_SIN(c, vir_FSUB(c, input, periods));
991         return vir_XOR(c, sin_output, vir_SHL(c,
992                                               vir_FTOIN(c, periods),
993                                               vir_uniform_ui(c, -1)));
994 }
995 
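/* Branchless sign(x): start from 0.0, overwrite with 1.0 for any non-zero
 * input and then with -1.0 if the input is negative.
 */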
996 static struct qreg
997 ntq_fsign(struct v3d_compile *c, struct qreg src)
998 {
999         struct qreg t = vir_get_temp(c);
1000 
1001         vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
1002         vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHZ);
1003         vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
1004         vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHN);
1005         vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
1006         return vir_MOV(c, t);
1007 }
1008 
1009 static void
1010 emit_fragcoord_input(struct v3d_compile *c, int attr)
1011 {
1012         c->inputs[attr * 4 + 0] = vir_FXCD(c);
1013         c->inputs[attr * 4 + 1] = vir_FYCD(c);
1014         c->inputs[attr * 4 + 2] = c->payload_z;
1015         c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w);
1016 }
1017 
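/* Helpers for the three interpolation modes. The varying term comes from the
 * ldvary signal and the C coefficient from c_reg (r5, or register 0 on
 * devices without accumulators): smooth varyings compute vary * W + C for
 * perspective correction, noperspective ones just add C, and flat ones use C
 * alone.
 */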
1018 static struct qreg
1019 emit_smooth_varying(struct v3d_compile *c,
1020                     struct qreg vary, struct qreg w, struct qreg c_reg)
1021 {
1022         return vir_FADD(c, vir_FMUL(c, vary, w), c_reg);
1023 }
1024 
1025 static struct qreg
1026 emit_noperspective_varying(struct v3d_compile *c,
1027                            struct qreg vary, struct qreg c_reg)
1028 {
1029         return vir_FADD(c, vir_MOV(c, vary), c_reg);
1030 }
1031 
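/* For flat shading only the C coefficient carries the value; the ldvary
 * result is presumably still consumed here by moving it to a discarded
 * destination.
 */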
1032 static struct qreg
1033 emit_flat_varying(struct v3d_compile *c,
1034                   struct qreg vary, struct qreg c_reg)
1035 {
1036         vir_MOV_dest(c, c->undef, vary);
1037         return vir_MOV(c, c_reg);
1038 }
1039 
1040 static struct qreg
1041 emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
1042                       int8_t input_idx, uint8_t swizzle, int array_index)
1043 {
1044         struct qreg c_reg; /* C coefficient */
1045 
1046         if (c->devinfo->has_accumulators)
1047                 c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
1048         else
1049                 c_reg = vir_reg(QFILE_REG, 0);
1050 
1051         struct qinst *ldvary = NULL;
1052         struct qreg vary;
1053         ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
1054                               c->undef, c->undef);
1055         ldvary->qpu.sig.ldvary = true;
1056         vary = vir_emit_def(c, ldvary);
1057 
1058         /* Store the input value before interpolation so we can implement
1059          * GLSL's interpolateAt functions if the shader uses them.
1060          */
1061         if (input_idx >= 0) {
1062                 assert(var);
1063                 c->interp[input_idx].vp = vary;
1064                 c->interp[input_idx].C = vir_MOV(c, c_reg);
1065                 c->interp[input_idx].mode = var->data.interpolation;
1066         }
1067 
1068         /* For gl_PointCoord input or distance along a line, we'll be called
1069          * with no nir_variable, and we don't count toward VPM size so we
1070          * don't track an input slot.
1071          */
1072         if (!var) {
1073                 assert(input_idx < 0);
1074                 return emit_smooth_varying(c, vary, c->payload_w, c_reg);
1075         }
1076 
1077         int i = c->num_inputs++;
1078         c->input_slots[i] =
1079                 v3d_slot_from_slot_and_component(var->data.location +
1080                                                  array_index, swizzle);
1081 
1082         struct qreg result;
1083         switch (var->data.interpolation) {
1084         case INTERP_MODE_NONE:
1085         case INTERP_MODE_SMOOTH:
1086                 if (var->data.centroid) {
1087                         BITSET_SET(c->centroid_flags, i);
1088                         result = emit_smooth_varying(c, vary,
1089                                                      c->payload_w_centroid, c_reg);
1090                 } else {
1091                         result = emit_smooth_varying(c, vary, c->payload_w, c_reg);
1092                 }
1093                 break;
1094 
1095         case INTERP_MODE_NOPERSPECTIVE:
1096                 BITSET_SET(c->noperspective_flags, i);
1097                 result = emit_noperspective_varying(c, vary, c_reg);
1098                 break;
1099 
1100         case INTERP_MODE_FLAT:
1101                 BITSET_SET(c->flat_shade_flags, i);
1102                 result = emit_flat_varying(c, vary, c_reg);
1103                 break;
1104 
1105         default:
1106                 unreachable("Bad interp mode");
1107         }
1108 
1109         if (input_idx >= 0)
1110                 c->inputs[input_idx] = result;
1111         return result;
1112 }
1113 
1114 static void
1115 emit_fragment_input(struct v3d_compile *c, int base_attr, nir_variable *var,
1116                     int array_index, unsigned nelem)
1117 {
1118         for (int i = 0; i < nelem ; i++) {
1119                 int chan = var->data.location_frac + i;
1120                 int input_idx = (base_attr + array_index) * 4 + chan;
1121                 emit_fragment_varying(c, var, input_idx, chan, array_index);
1122         }
1123 }
1124 
1125 static void
1126 emit_compact_fragment_input(struct v3d_compile *c, int attr, nir_variable *var,
1127                             int array_index)
1128 {
1129         /* Compact variables are scalar arrays where each set of 4 elements
1130          * consumes a single location.
1131          */
1132         int loc_offset = array_index / 4;
1133         int chan = var->data.location_frac + array_index % 4;
1134         int input_idx = (attr + loc_offset) * 4  + chan;
1135         emit_fragment_varying(c, var, input_idx, chan, loc_offset);
1136 }
1137 
1138 static void
1139 add_output(struct v3d_compile *c,
1140            uint32_t decl_offset,
1141            uint8_t slot,
1142            uint8_t swizzle)
1143 {
1144         uint32_t old_array_size = c->outputs_array_size;
1145         resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
1146                           decl_offset + 1);
1147 
1148         if (old_array_size != c->outputs_array_size) {
1149                 c->output_slots = reralloc(c,
1150                                            c->output_slots,
1151                                            struct v3d_varying_slot,
1152                                            c->outputs_array_size);
1153         }
1154 
1155         c->output_slots[decl_offset] =
1156                 v3d_slot_from_slot_and_component(slot, swizzle);
1157 }
1158 
1159 /**
1160  * If compare_instr is a supported comparison instruction, emits the
1161  * comparison, returns true, and stores the condition code to predicate on
1162  * in *out_cond. Otherwise returns false.
1163  */
1164 static bool
1165 ntq_emit_comparison(struct v3d_compile *c,
1166                     nir_alu_instr *compare_instr,
1167                     enum v3d_qpu_cond *out_cond)
1168 {
1169         struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
1170         struct qreg src1;
1171         if (nir_op_infos[compare_instr->op].num_inputs > 1)
1172                 src1 = ntq_get_alu_src(c, compare_instr, 1);
1173         bool cond_invert = false;
1174         struct qreg nop = vir_nop_reg();
1175 
1176         switch (compare_instr->op) {
1177         case nir_op_feq32:
1178         case nir_op_seq:
1179                 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1180                 break;
1181         case nir_op_ieq32:
1182                 vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1183                 break;
1184 
1185         case nir_op_fneu32:
1186         case nir_op_sne:
1187                 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1188                 cond_invert = true;
1189                 break;
1190         case nir_op_ine32:
1191                 vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1192                 cond_invert = true;
1193                 break;
1194 
1195         case nir_op_fge32:
1196         case nir_op_sge:
1197                 vir_set_pf(c, vir_FCMP_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1198                 break;
1199         case nir_op_ige32:
1200                 vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1201                 cond_invert = true;
1202                 break;
1203         case nir_op_uge32:
1204                 vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
1205                 cond_invert = true;
1206                 break;
1207 
1208         case nir_op_slt:
1209         case nir_op_flt32:
1210                 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHN);
1211                 break;
1212         case nir_op_ilt32:
1213                 vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1214                 break;
1215         case nir_op_ult32:
1216                 vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
1217                 break;
1218 
1219         default:
1220                 return false;
1221         }
1222 
1223         *out_cond = cond_invert ? V3D_QPU_COND_IFNA : V3D_QPU_COND_IFA;
1224 
1225         return true;
1226 }
1227 
1228 /* Finds an ALU instruction that generates our src value that could
1229  * (potentially) be greedily emitted in the consuming instruction.
1230  */
1231 static struct nir_alu_instr *
1232 ntq_get_alu_parent(nir_src src)
1233 {
1234         if (src.ssa->parent_instr->type != nir_instr_type_alu)
1235                 return NULL;
1236         nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr);
1237         if (!instr)
1238                 return NULL;
1239 
1240         /* If the ALU instr's srcs are non-SSA, then we would have to avoid
1241          * moving emission of the ALU instr down past another write of the
1242          * src.
1243          */
1244         for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1245                 if (nir_load_reg_for_def(instr->src[i].src.ssa))
1246                         return NULL;
1247         }
1248 
1249         return instr;
1250 }
1251 
1252 /* Turns a NIR bool into a condition code to predicate on. */
1253 static enum v3d_qpu_cond
1254 ntq_emit_bool_to_cond(struct v3d_compile *c, nir_src src)
1255 {
1256         struct qreg qsrc = ntq_get_src(c, src, 0);
1257         /* skip if we already have src in the flags */
1258         if (qsrc.file == QFILE_TEMP && c->flags_temp == qsrc.index)
1259                 return c->flags_cond;
1260 
1261         nir_alu_instr *compare = ntq_get_alu_parent(src);
1262         if (!compare)
1263                 goto out;
1264 
1265         enum v3d_qpu_cond cond;
1266         if (ntq_emit_comparison(c, compare, &cond))
1267                 return cond;
1268 
1269 out:
1270 
1271         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), ntq_get_src(c, src, 0)),
1272                    V3D_QPU_PF_PUSHZ);
1273         return V3D_QPU_COND_IFNA;
1274 }
1275 
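/* Materializes a condition as a 0 / ~0 boolean.  We also record the result
 * temp and the condition in c->flags_temp / c->flags_cond so that
 * ntq_emit_bool_to_cond() can reuse the live flags instead of re-emitting a
 * comparison if this value is later consumed as a condition.
 */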
1276 static struct qreg
1277 ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond)
1278 {
1279         struct qreg result =
1280                 vir_MOV(c, vir_SEL(c, cond,
1281                                    vir_uniform_ui(c, ~0),
1282                                    vir_uniform_ui(c, 0)));
1283         c->flags_temp = result.index;
1284         c->flags_cond = cond;
1285         return result;
1286 }
1287 
1288 static struct qreg
1289 ntq_emit_cond_to_int(struct v3d_compile *c, enum v3d_qpu_cond cond)
1290 {
1291         struct qreg result =
1292                 vir_MOV(c, vir_SEL(c, cond,
1293                                    vir_uniform_ui(c, 1),
1294                                    vir_uniform_ui(c, 0)));
1295         c->flags_temp = result.index;
1296         c->flags_cond = cond;
1297         return result;
1298 }
1299 
1300 static struct qreg
1301 f2f16_rtz(struct v3d_compile *c, struct qreg f32)
1302 {
1303    /* The GPU doesn't provide a mechanism to modify the f32->f16 rounding
1304     * method and seems to be using RTE by default, so we need to implement
1305     * RTZ rounding in software.
1306     */
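   /* The sequence below rounds to f16 with the default RTE behaviour, expands
    * the result back to f32 and compares magnitudes: if |f32| < |rf32| the
    * rounding moved the value away from zero, so we subtract one from the f16
    * bit pattern to step it back towards zero and get the RTZ result.
    */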
1307    struct qreg rf16 = vir_FMOV(c, f32);
1308    vir_set_pack(c->defs[rf16.index], V3D_QPU_PACK_L);
1309 
1310    struct qreg rf32 = vir_FMOV(c, rf16);
1311    vir_set_unpack(c->defs[rf32.index], 0, V3D_QPU_UNPACK_L);
1312 
1313    struct qreg f32_abs = vir_FMOV(c, f32);
1314    vir_set_unpack(c->defs[f32_abs.index], 0, V3D_QPU_UNPACK_ABS);
1315 
1316    struct qreg rf32_abs = vir_FMOV(c, rf32);
1317    vir_set_unpack(c->defs[rf32_abs.index], 0, V3D_QPU_UNPACK_ABS);
1318 
1319    vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), f32_abs, rf32_abs),
1320               V3D_QPU_PF_PUSHN);
1321    return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
1322                   vir_SUB(c, rf16, vir_uniform_ui(c, 1)), rf16));
1323 }
1324 
1325 /**
1326  * Takes the result value of a signed integer width conversion from a smaller
1327  * type to a larger type and, if needed, applies sign extension to it.
1328  */
1329 static struct qreg
1330 sign_extend(struct v3d_compile *c,
1331             struct qreg value,
1332             uint32_t src_bit_size,
1333             uint32_t dst_bit_size)
1334 {
1335         assert(src_bit_size < dst_bit_size);
1336 
1337         struct qreg tmp = vir_MOV(c, value);
1338 
1339         /* Do we need to sign-extend? */
1340         uint32_t sign_mask = 1 << (src_bit_size - 1);
1341         struct qinst *sign_check =
1342                 vir_AND_dest(c, vir_nop_reg(),
1343                              tmp, vir_uniform_ui(c, sign_mask));
1344         vir_set_pf(c, sign_check, V3D_QPU_PF_PUSHZ);
1345 
1346         /* If so, fill in leading sign bits */
1347         uint32_t extend_bits = ~(((1 << src_bit_size) - 1)) &
1348                                ((1ull << dst_bit_size) - 1);
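        /* E.g. for src_bit_size = 8 and dst_bit_size = 32, sign_mask is 0x80
         * and extend_bits is 0xffffff00: an input of 0xab becomes 0xffffffab,
         * while 0x2b is returned unchanged.
         */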
1349         struct qinst *extend_inst =
1350                 vir_OR_dest(c, tmp, tmp,
1351                             vir_uniform_ui(c, extend_bits));
1352         vir_set_cond(extend_inst, V3D_QPU_COND_IFNA);
1353 
1354         return tmp;
1355 }
1356 
1357 static void
1358 ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
1359 {
1360         /* Vectors are special in that they have non-scalarized writemasks,
1361          * and just take the first swizzle channel for each argument in order
1362          * into each writemask channel.
1363          */
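        /* E.g. for vec3(a.y, b.x, c.w), channel 0 of the destination gets
         * a.y, channel 1 gets b.x and channel 2 gets c.w.
         */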
1364         if (instr->op == nir_op_vec2 ||
1365             instr->op == nir_op_vec3 ||
1366             instr->op == nir_op_vec4) {
1367                 struct qreg srcs[4];
1368                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1369                         srcs[i] = ntq_get_src(c, instr->src[i].src,
1370                                               instr->src[i].swizzle[0]);
1371                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1372                         ntq_store_def(c, &instr->def, i,
1373                                       vir_MOV(c, srcs[i]));
1374                 return;
1375         }
1376 
1377         /* General case: We can just grab the one used channel per src. */
1378         struct qreg src[nir_op_infos[instr->op].num_inputs];
1379         for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1380                 src[i] = ntq_get_alu_src(c, instr, i);
1381         }
1382 
1383         struct qreg result;
1384 
1385         switch (instr->op) {
1386         case nir_op_mov:
1387                 result = vir_MOV(c, src[0]);
1388                 break;
1389 
1390         case nir_op_fneg:
1391                 result = vir_XOR(c, src[0], vir_uniform_ui(c, UINT32_C(1) << 31));
1392                 break;
1393         case nir_op_ineg:
1394                 result = vir_NEG(c, src[0]);
1395                 break;
1396 
1397         case nir_op_fmul:
1398                 result = vir_FMUL(c, src[0], src[1]);
1399                 break;
1400         case nir_op_fadd:
1401                 result = vir_FADD(c, src[0], src[1]);
1402                 break;
1403         case nir_op_fsub:
1404                 result = vir_FSUB(c, src[0], src[1]);
1405                 break;
1406         case nir_op_fmin:
1407                 result = vir_FMIN(c, src[0], src[1]);
1408                 break;
1409         case nir_op_fmax:
1410                 result = vir_FMAX(c, src[0], src[1]);
1411                 break;
1412 
1413         case nir_op_f2i32: {
1414                 nir_alu_instr *src0_alu = ntq_get_alu_parent(instr->src[0].src);
1415                 if (src0_alu && src0_alu->op == nir_op_fround_even) {
1416                         result = vir_FTOIN(c, ntq_get_alu_src(c, src0_alu, 0));
1417                 } else {
1418                         result = vir_FTOIZ(c, src[0]);
1419                 }
1420                 if (nir_src_bit_size(instr->src[0].src) == 16)
1421                         vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1422                 break;
1423         }
1424 
1425         case nir_op_f2u32:
1426                 result = vir_FTOUZ(c, src[0]);
1427                 if (nir_src_bit_size(instr->src[0].src) == 16)
1428                         vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1429                 break;
1430         case nir_op_i2f32: {
1431                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1432                 assert(bit_size <= 32);
1433                 result = src[0];
1434                 if (bit_size < 32) {
1435                         uint32_t mask = bit_size == 16 ? 0xffff : 0xff;
1436                         result = vir_AND(c, result, vir_uniform_ui(c, mask));
1437                         result = sign_extend(c, result, bit_size, 32);
1438                 }
1439                 result = vir_ITOF(c, result);
1440                 break;
1441         }
1442         case nir_op_u2f32: {
1443                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1444                 assert(bit_size <= 32);
1445                 result = src[0];
1446                 if (bit_size < 32) {
1447                         uint32_t mask = bit_size == 16 ? 0xffff : 0xff;
1448                         result = vir_AND(c, result, vir_uniform_ui(c, mask));
1449                 }
1450                 result = vir_UTOF(c, result);
1451                 break;
1452         }
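        /* Booleans in NIR are 0 or ~0, so ANDing with the representation of
         * "one" in the destination type (0x3c00 for f16 1.0, the f32 bit
         * pattern of 1.0, or integer 1) produces the converted value.
         */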
1453         case nir_op_b2f16:
1454                 result = vir_AND(c, src[0], vir_uniform_ui(c, 0x3c00));
1455                 break;
1456         case nir_op_b2f32:
1457                 result = vir_AND(c, src[0], vir_uniform_f(c, 1.0));
1458                 break;
1459         case nir_op_b2i8:
1460         case nir_op_b2i16:
1461         case nir_op_b2i32:
1462                 result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
1463                 break;
1464 
1465         case nir_op_i2f16: {
1466                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1467                 assert(bit_size <= 32);
1468                 if (bit_size < 32) {
1469                         uint32_t mask = bit_size == 16 ? 0xffff : 0xff;
1470                         result = vir_AND(c, src[0], vir_uniform_ui(c, mask));
1471                         result = sign_extend(c, result, bit_size, 32);
1472                 } else {
1473                         result = src[0];
1474                 }
1475                 result = vir_ITOF(c, result);
1476                 vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L);
1477                 break;
1478         }
1479 
1480         case nir_op_u2f16: {
1481                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1482                 assert(bit_size <= 32);
1483                 if (bit_size < 32) {
1484                         uint32_t mask = bit_size == 16 ? 0xffff : 0xff;
1485                         result = vir_AND(c, src[0], vir_uniform_ui(c, mask));
1486                 } else {
1487                         result = src[0];
1488                 }
1489                 result = vir_UTOF(c, result);
1490                 vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L);
1491                 break;
1492         }
1493 
1494         case nir_op_f2f16:
1495         case nir_op_f2f16_rtne:
1496                 assert(nir_src_bit_size(instr->src[0].src) == 32);
1497                 result = vir_FMOV(c, src[0]);
1498                 vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L);
1499                 break;
1500 
1501         case nir_op_f2f16_rtz:
1502                 assert(nir_src_bit_size(instr->src[0].src) == 32);
1503                 result = f2f16_rtz(c, src[0]);
1504                 break;
1505 
1506         case nir_op_f2f32:
1507                 assert(nir_src_bit_size(instr->src[0].src) == 16);
1508                 result = vir_FMOV(c, src[0]);
1509                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1510                 break;
1511 
1512         case nir_op_i2i16: {
1513                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1514                 assert(bit_size == 32 || bit_size == 8);
1515                 if (bit_size == 32) {
1516                         /* We don't have integer pack/unpack methods for
1517                          * converting between 16-bit and 32-bit, so we implement
1518                          * the conversion manually by truncating the src.
1519                          */
1520                         result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff));
1521                 } else {
1522                         struct qreg tmp = vir_AND(c, src[0],
1523                                                   vir_uniform_ui(c, 0xff));
1524                         result = vir_MOV(c, sign_extend(c, tmp, bit_size, 16));
1525                 }
1526                 break;
1527         }
1528 
1529         case nir_op_u2u16: {
1530                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1531                 assert(bit_size == 32 || bit_size == 8);
1532 
1533                 /* We don't have integer pack/unpack methods for converting
1534                  * between 16-bit and 32-bit, so we implement the conversion
1535                  * manually by truncating the src. For the 8-bit case, we
1536                  * want to make sure we don't copy garbage from any of the
1537                  * 24 MSB bits.
1538                  */
1539                 if (bit_size == 32)
1540                         result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff));
1541                 else
1542                         result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff));
1543                 break;
1544         }
1545 
1546         case nir_op_i2i8:
1547         case nir_op_u2u8:
1548                 assert(nir_src_bit_size(instr->src[0].src) == 32 ||
1549                        nir_src_bit_size(instr->src[0].src) == 16);
1550                 /* We don't have integer pack/unpack methods for converting
1551                  * between 8-bit and 32-bit, so we implement the conversion
1552                  * manually by truncating the src.
1553                  */
1554                 result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff));
1555                 break;
1556 
1557         case nir_op_u2u32: {
1558                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1559                 assert(bit_size == 16 || bit_size == 8);
1560 
1561                 /* We don't have a native 8-bit/16-bit MOV, so we copy all 32
1562                  * bits from the src, but make sure to clear any garbage that
1563                  * may be present in the bits beyond the source's bit size.
1564                  */
1565                 uint32_t mask = (1 << bit_size) - 1;
1566                 result = vir_AND(c, src[0], vir_uniform_ui(c, mask));
1567                 break;
1568         }
1569 
1570         case nir_op_i2i32: {
1571                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1572                 assert(bit_size == 16 || bit_size == 8);
1573 
1574                 uint32_t mask = (1 << bit_size) - 1;
1575                 struct qreg tmp = vir_AND(c, src[0],
1576                                           vir_uniform_ui(c, mask));
1577 
1578                 result = vir_MOV(c, sign_extend(c, tmp, bit_size, 32));
1579                 break;
1580         }
1581 
1582         case nir_op_iadd:
1583                 result = vir_ADD(c, src[0], src[1]);
1584                 break;
1585         case nir_op_ushr:
1586                 result = vir_SHR(c, src[0], src[1]);
1587                 break;
1588         case nir_op_isub:
1589                 result = vir_SUB(c, src[0], src[1]);
1590                 break;
1591         case nir_op_ishr:
1592                 result = vir_ASR(c, src[0], src[1]);
1593                 break;
1594         case nir_op_ishl:
1595                 result = vir_SHL(c, src[0], src[1]);
1596                 break;
1597         case nir_op_imin:
1598                 result = vir_MIN(c, src[0], src[1]);
1599                 break;
1600         case nir_op_umin:
1601                 result = vir_UMIN(c, src[0], src[1]);
1602                 break;
1603         case nir_op_imax:
1604                 result = vir_MAX(c, src[0], src[1]);
1605                 break;
1606         case nir_op_umax:
1607                 result = vir_UMAX(c, src[0], src[1]);
1608                 break;
1609         case nir_op_iand:
1610                 result = vir_AND(c, src[0], src[1]);
1611                 break;
1612         case nir_op_ior:
1613                 result = vir_OR(c, src[0], src[1]);
1614                 break;
1615         case nir_op_ixor:
1616                 result = vir_XOR(c, src[0], src[1]);
1617                 break;
1618         case nir_op_inot:
1619                 result = vir_NOT(c, src[0]);
1620                 break;
1621 
1622         case nir_op_uclz:
1623                 result = vir_CLZ(c, src[0]);
1624                 break;
1625 
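        /* The low 32 bits of a product don't depend on the signedness of the
         * operands, so the unsigned multiply helper covers nir_op_imul.
         */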
1626         case nir_op_imul:
1627                 result = vir_UMUL(c, src[0], src[1]);
1628                 break;
1629 
1630         case nir_op_seq:
1631         case nir_op_sne:
1632         case nir_op_sge:
1633         case nir_op_slt: {
1634                 enum v3d_qpu_cond cond;
1635                 ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
1636                 assert(ok);
1637                 result = vir_MOV(c, vir_SEL(c, cond,
1638                                             vir_uniform_f(c, 1.0),
1639                                             vir_uniform_f(c, 0.0)));
1640                 c->flags_temp = result.index;
1641                 c->flags_cond = cond;
1642                 break;
1643         }
1644 
1645         case nir_op_feq32:
1646         case nir_op_fneu32:
1647         case nir_op_fge32:
1648         case nir_op_flt32:
1649         case nir_op_ieq32:
1650         case nir_op_ine32:
1651         case nir_op_ige32:
1652         case nir_op_uge32:
1653         case nir_op_ilt32:
1654         case nir_op_ult32: {
1655                 enum v3d_qpu_cond cond;
1656                 ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
1657                 assert(ok);
1658                 result = ntq_emit_cond_to_bool(c, cond);
1659                 break;
1660         }
1661 
1662         case nir_op_b32csel:
1663                 result = vir_MOV(c,
1664                                  vir_SEL(c,
1665                                          ntq_emit_bool_to_cond(c, instr->src[0].src),
1666                                          src[1], src[2]));
1667                 break;
1668 
1669         case nir_op_fcsel:
1670                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), src[0]),
1671                            V3D_QPU_PF_PUSHZ);
1672                 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
1673                                             src[1], src[2]));
1674                 break;
1675 
1676         case nir_op_frcp:
1677                 result = vir_RECIP(c, src[0]);
1678                 break;
1679         case nir_op_frsq:
1680                 result = vir_RSQRT(c, src[0]);
1681                 break;
1682         case nir_op_fexp2:
1683                 result = vir_EXP(c, src[0]);
1684                 break;
1685         case nir_op_flog2:
1686                 result = vir_LOG(c, src[0]);
1687                 break;
1688 
1689         case nir_op_fceil:
1690                 result = vir_FCEIL(c, src[0]);
1691                 break;
1692         case nir_op_ffloor:
1693                 result = vir_FFLOOR(c, src[0]);
1694                 break;
1695         case nir_op_fround_even:
1696                 result = vir_FROUND(c, src[0]);
1697                 break;
1698         case nir_op_ftrunc:
1699                 result = vir_FTRUNC(c, src[0]);
1700                 break;
1701 
1702         case nir_op_fsin:
1703                 result = ntq_fsincos(c, src[0], false);
1704                 break;
1705         case nir_op_fcos:
1706                 result = ntq_fsincos(c, src[0], true);
1707                 break;
1708 
1709         case nir_op_fsign:
1710                 result = ntq_fsign(c, src[0]);
1711                 break;
1712 
1713         case nir_op_fabs: {
1714                 result = vir_FMOV(c, src[0]);
1715                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS);
1716                 break;
1717         }
1718 
1719         case nir_op_iabs:
1720                 result = vir_MAX(c, src[0], vir_NEG(c, src[0]));
1721                 break;
1722 
1723         case nir_op_uadd_carry:
1724                 vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]),
1725                            V3D_QPU_PF_PUSHC);
1726                 result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA);
1727                 break;
1728 
1729         case nir_op_usub_borrow:
1730                 vir_set_pf(c, vir_SUB_dest(c, vir_nop_reg(), src[0], src[1]),
1731                            V3D_QPU_PF_PUSHC);
1732                 result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA);
1733                 break;
1734 
1735         case nir_op_pack_half_2x16_split:
1736                 result = vir_VFPACK(c, src[0], src[1]);
1737                 break;
1738 
1739         case nir_op_pack_2x32_to_2x16_v3d:
1740                 result = vir_VPACK(c, src[0], src[1]);
1741                 break;
1742 
1743         case nir_op_pack_32_to_r11g11b10_v3d:
1744                 result = vir_V11FPACK(c, src[0], src[1]);
1745                 break;
1746 
1747         case nir_op_pack_uint_32_to_r10g10b10a2_v3d:
1748                 result = vir_V10PACK(c, src[0], src[1]);
1749                 break;
1750 
1751         case nir_op_pack_4x16_to_4x8_v3d:
1752                 result = vir_V8PACK(c, src[0], src[1]);
1753                 break;
1754 
1755         case nir_op_unpack_half_2x16_split_x:
1756                 result = vir_FMOV(c, src[0]);
1757                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1758                 break;
1759 
1760         case nir_op_unpack_half_2x16_split_y:
1761                 result = vir_FMOV(c, src[0]);
1762                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H);
1763                 break;
1764 
1765         case nir_op_pack_2x16_to_unorm_2x8_v3d:
1766                 result = vir_VFTOUNORM8(c, src[0]);
1767                 break;
1768 
1769         case nir_op_pack_2x16_to_snorm_2x8_v3d:
1770                 result = vir_VFTOSNORM8(c, src[0]);
1771                 break;
1772 
1773         case nir_op_pack_2x16_to_unorm_2x10_v3d:
1774                 result = vir_VFTOUNORM10LO(c, src[0]);
1775                 break;
1776 
1777         case nir_op_pack_2x16_to_unorm_10_2_v3d:
1778                 result = vir_VFTOUNORM10HI(c, src[0]);
1779                 break;
1780 
1781         case nir_op_f2unorm_16_v3d:
1782                 result = vir_FTOUNORM16(c, src[0]);
1783                 break;
1784 
1785         case nir_op_f2snorm_16_v3d:
1786                 result = vir_FTOSNORM16(c, src[0]);
1787                 break;
1788 
1789         case nir_op_fsat:
1790                 assert(v3d_device_has_unpack_sat(c->devinfo));
1791                 result = vir_FMOV(c, src[0]);
1792                 vir_set_unpack(c->defs[result.index], 0, V3D71_QPU_UNPACK_SAT);
1793                 break;
1794 
1795         case nir_op_fsat_signed:
1796                 assert(v3d_device_has_unpack_sat(c->devinfo));
1797                 result = vir_FMOV(c, src[0]);
1798                 vir_set_unpack(c->defs[result.index], 0, V3D71_QPU_UNPACK_NSAT);
1799                 break;
1800 
1801         case nir_op_fclamp_pos:
1802                 assert(v3d_device_has_unpack_max0(c->devinfo));
1803                 result = vir_FMOV(c, src[0]);
1804                 vir_set_unpack(c->defs[result.index], 0, V3D71_QPU_UNPACK_MAX0);
1805                 break;
1806 
1807         default:
1808                 fprintf(stderr, "unknown NIR ALU inst: ");
1809                 nir_print_instr(&instr->instr, stderr);
1810                 fprintf(stderr, "\n");
1811                 abort();
1812         }
1813 
1814         ntq_store_def(c, &instr->def, 0, result);
1815 }
1816 
1817 /* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
1818  * specifier.  They come from a register that's preloaded with 0xffffffff
1819  * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low
1820  * 8 bits are shifted off the bottom and 0xff shifted in from the top.
1821  */
1822 #define TLB_TYPE_F16_COLOR         (3 << 6)
1823 #define TLB_TYPE_I32_COLOR         (1 << 6)
1824 #define TLB_TYPE_F32_COLOR         (0 << 6)
1825 #define TLB_RENDER_TARGET_SHIFT    3 /* Reversed!  7 = RT 0, 0 = RT 7. */
1826 #define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2)
1827 #define TLB_SAMPLE_MODE_PER_PIXEL  (1 << 2)
1828 #define TLB_F16_SWAP_HI_LO         (1 << 1)
1829 #define TLB_VEC_SIZE_4_F16         (1 << 0)
1830 #define TLB_VEC_SIZE_2_F16         (0 << 0)
1831 #define TLB_VEC_SIZE_MINUS_1_SHIFT 0
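/* E.g. the default vec4 f16 RT0 per-pixel write mentioned above works out to
 * TLB_TYPE_F16_COLOR | ((7 - 0) << TLB_RENDER_TARGET_SHIFT) |
 * TLB_SAMPLE_MODE_PER_PIXEL | TLB_F16_SWAP_HI_LO | TLB_VEC_SIZE_4_F16 == 0xff.
 */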
1832 
1833 /* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z"
1834  * flag is set.
1835  */
1836 #define TLB_TYPE_DEPTH             ((2 << 6) | (0 << 4))
1837 #define TLB_DEPTH_TYPE_INVARIANT   (0 << 2) /* Unmodified sideband input used */
1838 #define TLB_DEPTH_TYPE_PER_PIXEL   (1 << 2) /* QPU result used */
1839 #define TLB_V42_DEPTH_TYPE_INVARIANT   (0 << 3) /* Unmodified sideband input used */
1840 #define TLB_V42_DEPTH_TYPE_PER_PIXEL   (1 << 3) /* QPU result used */
1841 
1842 /* Stencil is a single 32-bit write. */
1843 #define TLB_TYPE_STENCIL_ALPHA     ((2 << 6) | (1 << 4))
1844 
1845 static void
1846 vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)
1847 {
1848         if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt])
1849                 return;
1850 
1851         struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB);
1852         struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
1853 
1854         nir_variable *var = c->output_color_var[rt];
1855         int num_components = glsl_get_vector_elements(var->type);
1856         uint32_t conf = 0xffffff00;
1857         struct qinst *inst;
1858 
1859         conf |= c->msaa_per_sample_output ? TLB_SAMPLE_MODE_PER_SAMPLE :
1860                                             TLB_SAMPLE_MODE_PER_PIXEL;
1861         conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
1862 
1863         if (c->fs_key->swap_color_rb & (1 << rt))
1864                 num_components = MAX2(num_components, 3);
1865         assert(num_components != 0);
1866 
1867         enum glsl_base_type type = glsl_get_base_type(var->type);
1868         bool is_int_format = type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT;
1869         bool is_32b_tlb_format = is_int_format ||
1870                                  (c->fs_key->f32_color_rb & (1 << rt));
1871 
1872         if (is_int_format) {
1873                 /* The F32 vs I32 distinction was dropped in 4.2. */
1874                 if (c->devinfo->ver < 42)
1875                         conf |= TLB_TYPE_I32_COLOR;
1876                 else
1877                         conf |= TLB_TYPE_F32_COLOR;
1878                 conf |= ((num_components - 1) << TLB_VEC_SIZE_MINUS_1_SHIFT);
1879         } else {
1880                 if (c->fs_key->f32_color_rb & (1 << rt)) {
1881                         conf |= TLB_TYPE_F32_COLOR;
1882                         conf |= ((num_components - 1) <<
1883                                 TLB_VEC_SIZE_MINUS_1_SHIFT);
1884                 } else {
1885                         conf |= TLB_TYPE_F16_COLOR;
1886                         conf |= TLB_F16_SWAP_HI_LO;
1887                         if (num_components >= 3)
1888                                 conf |= TLB_VEC_SIZE_4_F16;
1889                         else
1890                                 conf |= TLB_VEC_SIZE_2_F16;
1891                 }
1892         }
1893 
1894         int num_samples = c->msaa_per_sample_output ? V3D_MAX_SAMPLES : 1;
1895         for (int i = 0; i < num_samples; i++) {
1896                 struct qreg *color = c->msaa_per_sample_output ?
1897                         &c->sample_colors[(rt * V3D_MAX_SAMPLES + i) * 4] :
1898                         &c->outputs[var->data.driver_location * 4];
1899 
1900                 struct qreg r = color[0];
1901                 struct qreg g = color[1];
1902                 struct qreg b = color[2];
1903                 struct qreg a = color[3];
1904 
1905                 if (c->fs_key->swap_color_rb & (1 << rt))  {
1906                         r = color[2];
1907                         b = color[0];
1908                 }
1909 
1910                 if (c->fs_key->sample_alpha_to_one)
1911                         a = vir_uniform_f(c, 1.0);
1912 
1913                 if (is_32b_tlb_format) {
1914                         if (i == 0) {
1915                                 inst = vir_MOV_dest(c, tlbu_reg, r);
1916                                 inst->uniform =
1917                                         vir_get_uniform_index(c,
1918                                                               QUNIFORM_CONSTANT,
1919                                                               conf);
1920                         } else {
1921                                 vir_MOV_dest(c, tlb_reg, r);
1922                         }
1923 
1924                         if (num_components >= 2)
1925                                 vir_MOV_dest(c, tlb_reg, g);
1926                         if (num_components >= 3)
1927                                 vir_MOV_dest(c, tlb_reg, b);
1928                         if (num_components >= 4)
1929                                 vir_MOV_dest(c, tlb_reg, a);
1930                 } else {
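                        /* f16 render targets: VFPACK converts a pair of f32
                         * channels to f16 and packs them into each 32-bit TLB
                         * write, so a vec4 color needs two writes.
                         */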
1931                         inst = vir_VFPACK_dest(c, tlb_reg, r, g);
1932                         if (conf != ~0 && i == 0) {
1933                                 inst->dst = tlbu_reg;
1934                                 inst->uniform =
1935                                         vir_get_uniform_index(c,
1936                                                               QUNIFORM_CONSTANT,
1937                                                               conf);
1938                         }
1939 
1940                         if (num_components >= 3)
1941                                 vir_VFPACK_dest(c, tlb_reg, b, a);
1942                 }
1943         }
1944 }
1945 
1946 static void
1947 emit_frag_end(struct v3d_compile *c)
1948 {
1949         if (c->output_sample_mask_index != -1) {
1950                 vir_SETMSF_dest(c, vir_nop_reg(),
1951                                 vir_AND(c,
1952                                         vir_MSF(c),
1953                                         c->outputs[c->output_sample_mask_index]));
1954         }
1955 
1956         bool has_any_tlb_color_write = false;
1957         for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) {
1958                 if (c->fs_key->cbufs & (1 << rt) && c->output_color_var[rt])
1959                         has_any_tlb_color_write = true;
1960         }
1961 
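        /* Alpha-to-coverage: convert the alpha of color output 0 into a
         * coverage mask (FTOC) and AND it into the current multisample flags.
         */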
1962         if (c->fs_key->sample_alpha_to_coverage && c->output_color_var[0]) {
1963                 struct nir_variable *var = c->output_color_var[0];
1964                 struct qreg *color = &c->outputs[var->data.driver_location * 4];
1965 
1966                 vir_SETMSF_dest(c, vir_nop_reg(),
1967                                 vir_AND(c,
1968                                         vir_MSF(c),
1969                                         vir_FTOC(c, color[3])));
1970         }
1971 
1972         struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
1973 
1974         /* If the shader has no non-TLB side effects and doesn't write Z
1975          * we can enable early_fragment_tests even if the user
1976          * didn't request it.
1977          */
1978         if (c->output_position_index == -1 &&
1979             !(c->s->info.num_images || c->s->info.num_ssbos) &&
1980             !c->fs_key->sample_alpha_to_coverage &&
1981             c->output_sample_mask_index == -1 &&
1982             has_any_tlb_color_write) {
1983                 c->s->info.fs.early_fragment_tests =
1984                         !c->s->info.fs.uses_discard ||
1985                         c->fs_key->can_earlyz_with_discard;
1986         }
1987 
1988         /* By default, Z buffer writes are implicit using the Z values produced
1989          * from FEP (Z value produced from rasterization). When this is not
1990          * desirable (shader writes Z explicitly, has discards, etc) we need
1991          * to let the hardware know by setting c->writes_z to true, in which
1992          * case we always need to write a Z value from the QPU, even if it is
1993          * just the passthrough Z value produced from FEP.
1994          *
1995          * Also, from the V3D 4.2 spec:
1996          *
1997          * "If a shader performs a Z read the “Fragment shader does Z writes”
1998          *  bit in the shader record must be enabled to ensure deterministic
1999          *  results"
2000          *
2001          * So if c->reads_z is set we always need to write Z, even if it is
2002          * a passthrough from the Z value produced from FEP.
2003          */
2004         if (!c->s->info.fs.early_fragment_tests || c->reads_z) {
2005                 c->writes_z = true;
2006                 uint8_t tlb_specifier = TLB_TYPE_DEPTH;
2007                 struct qinst *inst;
2008 
2009                 if (c->output_position_index != -1) {
2010                         /* Shader writes to gl_FragDepth, use that */
2011                         inst = vir_MOV_dest(c, tlbu_reg,
2012                                             c->outputs[c->output_position_index]);
2013 
2014                         tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL |
2015                                           TLB_SAMPLE_MODE_PER_PIXEL);
2016                 } else {
2017                         /* Shader doesn't write to gl_FragDepth, take Z from
2018                          * FEP.
2019                          */
2020                         c->writes_z_from_fep = true;
2021                         inst = vir_MOV_dest(c, tlbu_reg, vir_nop_reg());
2022 
2023                         /* The spec says the PER_PIXEL flag is ignored for
2024                          * invariant writes, but the simulator demands it.
2025                          */
2026                         tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT |
2027                                           TLB_SAMPLE_MODE_PER_PIXEL);
2028 
2029                         /* Since (single-threaded) fragment shaders always need
2030                          * a TLB write, if we don't have any we emit a
2031                          * passthrough Z and flag the shader as potentially
2032                          * discarding, so that we can use Z as the required TLB write.
2033                          */
2034                         if (!has_any_tlb_color_write)
2035                                 c->s->info.fs.uses_discard = true;
2036                 }
2037 
2038                 inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
2039                                                       tlb_specifier |
2040                                                       0xffffff00);
2041                 inst->is_tlb_z_write = true;
2042         }
2043 
2044         /* XXX: Performance improvement: Merge Z write and color writes TLB
2045          * uniform setup
2046          */
2047         for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++)
2048                 vir_emit_tlb_color_write(c, rt);
2049 }
2050 
2051 static inline void
2052 vir_VPM_WRITE_indirect(struct v3d_compile *c,
2053                        struct qreg val,
2054                        struct qreg vpm_index,
2055                        bool uniform_vpm_index)
2056 {
2057         if (uniform_vpm_index)
2058                 vir_STVPMV(c, vpm_index, val);
2059         else
2060                 vir_STVPMD(c, vpm_index, val);
2061 }
2062 
2063 static void
2064 vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
2065 {
2066         vir_VPM_WRITE_indirect(c, val,
2067                                vir_uniform_ui(c, vpm_index), true);
2068 }
2069 
2070 static void
2071 emit_vert_end(struct v3d_compile *c)
2072 {
2073         /* GFXH-1684: VPM writes need to be complete by the end of the shader.
2074          */
2075         if (c->devinfo->ver == 42)
2076                 vir_VPMWT(c);
2077 }
2078 
2079 static void
2080 emit_geom_end(struct v3d_compile *c)
2081 {
2082         /* GFXH-1684: VPM writes need to be complete by the end of the shader.
2083          */
2084         if (c->devinfo->ver == 42)
2085                 vir_VPMWT(c);
2086 }
2087 
2088 static bool
2089 mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
2090                        unsigned bit_size,
2091                        unsigned num_components,
2092                        int64_t hole_size,
2093                        nir_intrinsic_instr *low,
2094                        nir_intrinsic_instr *high,
2095                        void *data)
2096 {
2097         if (hole_size > 0 || !nir_num_components_valid(num_components))
2098                 return false;
2099 
2100         /* TMU general access only supports 32-bit vectors */
2101         if (bit_size > 32)
2102                 return false;
2103 
2104         if ((bit_size == 8 || bit_size == 16) && num_components > 1)
2105                 return false;
2106 
2107         if (align_mul % 4 != 0 || align_offset % 4 != 0)
2108                 return false;
2109 
2110         /* Vector accesses wrap at 16-byte boundaries so we can't vectorize
2111          * if the resulting vector crosses a 16-byte boundary.
2112          */
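        /* E.g. with align_mul = 4 and align_offset = 0 the access can start
         * as late as byte 12 of a 16-byte line, so only a single 32-bit
         * component is allowed; with align_mul = 16 and align_offset = 0 a
         * full vec4 fits.
         */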
2113         assert(util_is_power_of_two_nonzero(align_mul));
2114         align_mul = MIN2(align_mul, 16);
2115         align_offset &= 0xf;
2116         if (16 - align_mul + align_offset + num_components * 4 > 16)
2117                 return false;
2118 
2119         return true;
2120 }
2121 
2122 void
2123 v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
2124 {
2125         bool progress;
2126         unsigned lower_flrp =
2127                 (s->options->lower_flrp16 ? 16 : 0) |
2128                 (s->options->lower_flrp32 ? 32 : 0) |
2129                 (s->options->lower_flrp64 ? 64 : 0);
2130 
2131         do {
2132                 progress = false;
2133 
2134                 NIR_PASS(progress, s, nir_split_array_vars, nir_var_function_temp);
2135                 NIR_PASS(progress, s, nir_shrink_vec_array_vars, nir_var_function_temp);
2136                 NIR_PASS(progress, s, nir_opt_deref);
2137 
2138                 NIR_PASS(progress, s, nir_lower_vars_to_ssa);
2139                 if (!s->info.var_copies_lowered) {
2140                         /* Only run this pass if nir_lower_var_copies was not called
2141                          * yet. That would lower away any copy_deref instructions and we
2142                          * don't want to introduce any more.
2143                          */
2144                         NIR_PASS(progress, s, nir_opt_find_array_copies);
2145                 }
2146 
2147                 NIR_PASS(progress, s, nir_opt_copy_prop_vars);
2148                 NIR_PASS(progress, s, nir_opt_dead_write_vars);
2149                 NIR_PASS(progress, s, nir_opt_combine_stores, nir_var_all);
2150 
2151                 NIR_PASS(progress, s, nir_remove_dead_variables,
2152                          (nir_variable_mode)(nir_var_function_temp |
2153                                              nir_var_shader_temp |
2154                                              nir_var_mem_shared),
2155                          NULL);
2156 
2157                 NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
2158                 NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);
2159                 NIR_PASS(progress, s, nir_copy_prop);
2160                 NIR_PASS(progress, s, nir_opt_remove_phis);
2161                 NIR_PASS(progress, s, nir_opt_dce);
2162                 NIR_PASS(progress, s, nir_opt_dead_cf);
2163                 NIR_PASS(progress, s, nir_opt_cse);
2164                 /* before peephole_select as it can generate 64 bit bcsels */
2165                 NIR_PASS(progress, s, nir_lower_64bit_phis);
2166                 NIR_PASS(progress, s, nir_opt_peephole_select, 0, false, false);
2167                 NIR_PASS(progress, s, nir_opt_peephole_select, 24, true, true);
2168                 NIR_PASS(progress, s, nir_opt_algebraic);
2169                 NIR_PASS(progress, s, nir_opt_constant_folding);
2170 
2171                 NIR_PASS(progress, s, nir_opt_intrinsics);
2172                 NIR_PASS(progress, s, nir_opt_idiv_const, 32);
2173                 NIR_PASS(progress, s, nir_lower_alu);
2174 
2175                 if (nir_opt_loop(s)) {
2176                    progress = true;
2177                    NIR_PASS(progress, s, nir_copy_prop);
2178                    NIR_PASS(progress, s, nir_opt_dce);
2179                 }
2180 
2181                 NIR_PASS(progress, s, nir_opt_conditional_discard);
2182 
2183                 NIR_PASS(progress, s, nir_opt_remove_phis);
2184                 NIR_PASS(progress, s, nir_opt_if, false);
2185                 if (c && !c->disable_gcm) {
2186                         bool local_progress = false;
2187                         NIR_PASS(local_progress, s, nir_opt_gcm, false);
2188                         c->gcm_progress |= local_progress;
2189                         progress |= local_progress;
2190                 }
2191 
2192                 /* Note that vectorization may undo the load/store scalarization
2193                  * pass we run for non 32-bit TMU general load/store by
2194                  * converting, for example, 2 consecutive 16-bit loads into a
2195                  * single 32-bit load. This is fine (and desirable) as long as
2196                  * the resulting 32-bit load meets 32-bit alignment requirements,
2197                  * which mem_vectorize_callback() should be enforcing.
2198                  */
2199                 nir_load_store_vectorize_options vectorize_opts = {
2200                         .modes = nir_var_mem_ssbo | nir_var_mem_ubo |
2201                                  nir_var_mem_push_const | nir_var_mem_shared |
2202                                  nir_var_mem_global,
2203                         .callback = mem_vectorize_callback,
2204                         .robust_modes = 0,
2205                 };
2206                 bool vectorize_progress = false;
2207 
2208 
2209                 /* This requires that we have called
2210                  * nir_lower_vars_to_explicit_types / nir_lower_explicit_io
2211                  * first, which we may not have done yet if we call here too
2212                  * early during NIR pre-processing. We can detect this because
2213                  * in that case we won't have a compile object.
2214                  */
2215                 if (c) {
2216                         NIR_PASS(vectorize_progress, s, nir_opt_load_store_vectorize,
2217                                  &vectorize_opts);
2218                         if (vectorize_progress) {
2219                                 NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
2220                                 NIR_PASS(progress, s, nir_lower_pack);
2221                                 progress = true;
2222                         }
2223                 }
2224 
2225                 if (lower_flrp != 0) {
2226                         bool lower_flrp_progress = false;
2227 
2228                         NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,
2229                                  lower_flrp,
2230                                  false /* always_precise */);
2231                         if (lower_flrp_progress) {
2232                                 NIR_PASS(progress, s, nir_opt_constant_folding);
2233                                 progress = true;
2234                         }
2235 
2236                         /* Nothing should rematerialize any flrps, so we only
2237                          * need to do this lowering once.
2238                          */
2239                         lower_flrp = 0;
2240                 }
2241 
2242                 NIR_PASS(progress, s, nir_opt_undef);
2243                 NIR_PASS(progress, s, nir_lower_undef_to_zero);
2244 
2245                 if (c && !c->disable_loop_unrolling &&
2246                     s->options->max_unroll_iterations > 0) {
2247                        bool local_progress = false;
2248                        NIR_PASS(local_progress, s, nir_opt_loop_unroll);
2249                        c->unrolled_any_loops |= local_progress;
2250                        progress |= local_progress;
2251                 }
2252         } while (progress);
2253 
2254         /* This needs to be outside of the optimization loop, otherwise it
2255          * fights with opt_algebraic optimizing the conversion lowering.
2256          */
2257         NIR_PASS(progress, s, v3d_nir_lower_algebraic, c);
2258         NIR_PASS(progress, s, nir_opt_cse);
2259 
2260         nir_move_options sink_opts =
2261                 nir_move_const_undef | nir_move_comparisons | nir_move_copies |
2262                 nir_move_load_ubo | nir_move_load_ssbo | nir_move_load_uniform;
2263         NIR_PASS(progress, s, nir_opt_sink, sink_opts);
2264 }
2265 
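/* Comparator for nir_sort_variables_with_modes(): orders variables by
 * driver_location, breaking ties with location_frac.
 */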
2266 static int
2267 driver_location_compare(const nir_variable *a, const nir_variable *b)
2268 {
2269         return a->data.driver_location == b->data.driver_location ?
2270                a->data.location_frac - b->data.location_frac :
2271                a->data.driver_location - b->data.driver_location;
2272 }
2273 
2274 static struct qreg
2275 ntq_emit_vpm_read(struct v3d_compile *c, uint32_t num_components)
2276 {
2277         return vir_LDVPMV_IN(c,
2278                              vir_uniform_ui(c, num_components));
2279 }
2280 
2281 static void
2282 ntq_setup_vs_inputs(struct v3d_compile *c)
2283 {
2284         /* Figure out how many components of each vertex attribute the shader
2285          * uses.  Each variable should have been split to individual
2286          * components and unused ones DCEed.  The vertex fetcher will load
2287          * from the start of the attribute to the number of components we
2288          * declare we need in c->vattr_sizes[].
2289          *
2290          * BGRA vertex attributes are a bit special: since we implement these
2291          * as RGBA swapping R/B components we always need at least 3 components
2292          * if component 0 is read.
2293          */
2294         nir_foreach_shader_in_variable(var, c->s) {
2295                 /* No VS attribute array support. */
2296                 assert(MAX2(glsl_get_length(var->type), 1) == 1);
2297 
2298                 unsigned loc = var->data.driver_location;
2299                 int start_component = var->data.location_frac;
2300                 int num_components = glsl_get_components(var->type);
2301 
2302                 c->vattr_sizes[loc] = MAX2(c->vattr_sizes[loc],
2303                                            start_component + num_components);
2304 
2305                 /* Handle BGRA inputs */
2306                 if (start_component == 0 &&
2307                     c->vs_key->va_swap_rb_mask & (1 << var->data.location)) {
2308                         c->vattr_sizes[loc] = MAX2(3, c->vattr_sizes[loc]);
2309                 }
2310         }
2311 
2312         uint32_t vpm_components = 0;
2313         bool uses_iid = BITSET_TEST(c->s->info.system_values_read,
2314                                     SYSTEM_VALUE_INSTANCE_ID) ||
2315                         BITSET_TEST(c->s->info.system_values_read,
2316                                     SYSTEM_VALUE_INSTANCE_INDEX);
2317         bool uses_biid = BITSET_TEST(c->s->info.system_values_read,
2318                                      SYSTEM_VALUE_BASE_INSTANCE);
2319         bool uses_vid = BITSET_TEST(c->s->info.system_values_read,
2320                                     SYSTEM_VALUE_VERTEX_ID) ||
2321                         BITSET_TEST(c->s->info.system_values_read,
2322                                     SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
2323 
2324         if (uses_iid)
2325                 c->iid = ntq_emit_vpm_read(c, vpm_components++);
2326 
2327         if (uses_biid)
2328                 c->biid = ntq_emit_vpm_read(c, vpm_components++);
2329 
2330         if (uses_vid)
2331                 c->vid = ntq_emit_vpm_read(c, vpm_components++);
2332 
2333         /* The actual loads will happen directly in nir_intrinsic_load_input
2334          */
2335         return;
2336 }
2337 
2338 static bool
2339 program_reads_point_coord(struct v3d_compile *c)
2340 {
2341         nir_foreach_shader_in_variable(var, c->s) {
2342                 if (util_varying_is_point_coord(var->data.location,
2343                                                 c->fs_key->point_sprite_mask)) {
2344                         return true;
2345                 }
2346         }
2347 
2348         return false;
2349 }
2350 
2351 static void
2352 ntq_setup_gs_inputs(struct v3d_compile *c)
2353 {
2354         nir_sort_variables_with_modes(c->s, driver_location_compare,
2355                                       nir_var_shader_in);
2356 
2357         nir_foreach_shader_in_variable(var, c->s) {
2358                 /* All GS inputs are arrays with as many entries as vertices
2359                  * in the input primitive, but here we only care about the
2360                  * per-vertex input type.
2361                  */
2362                 assert(glsl_type_is_array(var->type));
2363                 const struct glsl_type *type = glsl_get_array_element(var->type);
2364                 unsigned var_len = glsl_count_vec4_slots(type, false, false);
2365                 unsigned loc = var->data.driver_location;
2366 
2367                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2368                                   (loc + var_len) * 4);
2369 
2370                 if (var->data.compact) {
2371                         for (unsigned j = 0; j < var_len; j++) {
2372                                 unsigned input_idx = c->num_inputs++;
2373                                 unsigned loc_frac = var->data.location_frac + j;
2374                                 unsigned loc = var->data.location + loc_frac / 4;
2375                                 unsigned comp = loc_frac % 4;
2376                                 c->input_slots[input_idx] =
2377                                         v3d_slot_from_slot_and_component(loc, comp);
2378                         }
2379                                 continue;
2380                 }
2381 
2382                 for (unsigned j = 0; j < var_len; j++) {
2383                         unsigned num_elements =
2384                                 glsl_type_is_struct(glsl_without_array(type)) ?
2385                                 4 : glsl_get_vector_elements(type);
2386                         for (unsigned k = 0; k < num_elements; k++) {
2387                                 unsigned chan = var->data.location_frac + k;
2388                                 unsigned input_idx = c->num_inputs++;
2389                                 struct v3d_varying_slot slot =
2390                                         v3d_slot_from_slot_and_component(var->data.location + j, chan);
2391                                 c->input_slots[input_idx] = slot;
2392                         }
2393                 }
2394         }
2395 }
2396 
2397 
2398 static void
2399 ntq_setup_fs_inputs(struct v3d_compile *c)
2400 {
2401         nir_sort_variables_with_modes(c->s, driver_location_compare,
2402                                       nir_var_shader_in);
2403 
2404         nir_foreach_shader_in_variable(var, c->s) {
2405                 unsigned var_len = glsl_count_vec4_slots(var->type, false, false);
2406                 unsigned loc = var->data.driver_location;
2407 
2408                 uint32_t inputs_array_size = c->inputs_array_size;
2409                 uint32_t inputs_array_required_size = (loc + var_len) * 4;
2410                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2411                                   inputs_array_required_size);
2412                 resize_interp_array(c, &c->interp, &inputs_array_size,
2413                                     inputs_array_required_size);
2414 
2415                 if (var->data.location == VARYING_SLOT_POS) {
2416                         emit_fragcoord_input(c, loc);
2417                 } else if (var->data.location == VARYING_SLOT_PRIMITIVE_ID &&
2418                            !c->fs_key->has_gs) {
2419                         /* If the fragment shader reads gl_PrimitiveID and we
2420                          * don't have a geometry shader in the pipeline to write
2421                          * it then we program the hardware to inject it as
2422                          * an implicit varying. Take it from there.
2423                          */
2424                         c->inputs[loc * 4] = c->primitive_id;
2425                 } else if (util_varying_is_point_coord(var->data.location,
2426                                                        c->fs_key->point_sprite_mask)) {
2427                         c->inputs[loc * 4 + 0] = c->point_x;
2428                         c->inputs[loc * 4 + 1] = c->point_y;
2429                 } else if (var->data.compact) {
2430                         for (int j = 0; j < var_len; j++)
2431                                 emit_compact_fragment_input(c, loc, var, j);
2432                 } else if (glsl_type_is_struct(glsl_without_array(var->type))) {
2433                         for (int j = 0; j < var_len; j++) {
2434                                 emit_fragment_input(c, loc, var, j, 4);
2435                         }
2436                 } else {
2437                         for (int j = 0; j < var_len; j++) {
2438                                 emit_fragment_input(c, loc, var, j, glsl_get_vector_elements(var->type));
2439                         }
2440                 }
2441         }
2442 }
2443 
2444 static void
2445 ntq_setup_outputs(struct v3d_compile *c)
2446 {
2447         if (c->s->info.stage != MESA_SHADER_FRAGMENT)
2448                 return;
2449 
2450         nir_foreach_shader_out_variable(var, c->s) {
2451                 assert(glsl_type_is_vector_or_scalar(var->type));
2452                 unsigned loc = var->data.driver_location * 4;
2453 
2454                 for (int i = 0; i < 4 - var->data.location_frac; i++) {
2455                         add_output(c, loc + var->data.location_frac + i,
2456                                    var->data.location,
2457                                    var->data.location_frac + i);
2458                 }
2459 
2460                 switch (var->data.location) {
2461                 case FRAG_RESULT_COLOR:
2462                         for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++)
2463                                 c->output_color_var[i] = var;
2464                         break;
2465                 case FRAG_RESULT_DATA0:
2466                 case FRAG_RESULT_DATA1:
2467                 case FRAG_RESULT_DATA2:
2468                 case FRAG_RESULT_DATA3:
2469                 case FRAG_RESULT_DATA4:
2470                 case FRAG_RESULT_DATA5:
2471                 case FRAG_RESULT_DATA6:
2472                 case FRAG_RESULT_DATA7:
2473                         c->output_color_var[var->data.location -
2474                                             FRAG_RESULT_DATA0] = var;
2475                         break;
2476                 case FRAG_RESULT_DEPTH:
2477                         c->output_position_index = loc;
2478                         break;
2479                 case FRAG_RESULT_SAMPLE_MASK:
2480                         c->output_sample_mask_index = loc;
2481                         break;
2482                 }
2483         }
2484 }
2485 
2486 /**
2487  * Sets up the mapping from nir_register to struct qreg *.
2488  *
2489  * Each nir_register gets a struct qreg per 32-bit component being stored.
2490  */
2491 static void
2492 ntq_setup_registers(struct v3d_compile *c, nir_function_impl *impl)
2493 {
2494         nir_foreach_reg_decl(decl, impl) {
2495                 unsigned num_components = nir_intrinsic_num_components(decl);
2496                 unsigned array_len = nir_intrinsic_num_array_elems(decl);
2497                 array_len = MAX2(array_len, 1);
2498                 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
2499                                                   array_len * num_components);
2500 
2501                 nir_def *nir_reg = &decl->def;
2502                 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
2503 
2504                 for (int i = 0; i < array_len * num_components; i++)
2505                         qregs[i] = vir_get_temp(c);
2506         }
2507 }
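/* For illustration (hypothetical register): a nir_reg declared with
 * num_components = 2 and num_array_elems = 3 gets MAX2(3, 1) * 2 = 6
 * temporaries from vir_get_temp(), one per 32-bit component of every array
 * element, all keyed off the decl's nir_def in c->def_ht.
 */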
2508 
2509 static void
2510 ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
2511 {
2512         /* XXX perf: Experiment with using immediate loads to avoid having
2513          * these end up in the uniform stream.  Watch out for breaking the
2514          * small immediates optimization in the process!
2515          */
2516         struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
2517         for (int i = 0; i < instr->def.num_components; i++)
2518                 qregs[i] = vir_uniform_ui(c, instr->value[i].u32);
2519 
2520         _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
2521 }
2522 
2523 static void
2524 ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)
2525 {
2526         unsigned image_index = nir_src_as_uint(instr->src[0]);
2527         bool is_array = nir_intrinsic_image_array(instr);
2528 
2529         assert(nir_src_as_uint(instr->src[1]) == 0);
2530 
2531         ntq_store_def(c, &instr->def, 0,
2532                        vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index));
2533         if (instr->num_components > 1) {
2534                 ntq_store_def(c, &instr->def, 1,
2535                               vir_uniform(c,
2536                                           instr->num_components == 2 && is_array ?
2537                                                   QUNIFORM_IMAGE_ARRAY_SIZE :
2538                                                   QUNIFORM_IMAGE_HEIGHT,
2539                                           image_index));
2540         }
2541         if (instr->num_components > 2) {
2542                 ntq_store_def(c, &instr->def, 2,
2543                               vir_uniform(c,
2544                                           is_array ?
2545                                           QUNIFORM_IMAGE_ARRAY_SIZE :
2546                                           QUNIFORM_IMAGE_DEPTH,
2547                                           image_index));
2548         }
2549 }
2550 
2551 static void
2552 vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
2553 {
2554         assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
2555 
2556         int rt = nir_src_as_uint(instr->src[0]);
2557         assert(rt < V3D_MAX_DRAW_BUFFERS);
2558 
2559         int sample_index = nir_intrinsic_base(instr);
2560         assert(sample_index < V3D_MAX_SAMPLES);
2561 
2562         int component = nir_intrinsic_component(instr);
2563         assert(component < 4);
2564 
2565         /* We need to emit our TLB reads after we have acquired the scoreboard
2566          * lock, or the GPU will hang. Usually, we do our scoreboard locking on
2567          * the last thread switch to improve parallelism, however, that is only
2568          * guaranteed to happen before the tlb color writes.
2569          *
2570          * To fix that, we make sure we always emit a thread switch before the
2571          * first tlb color read. If that happens to be the last thread switch
2572          * we emit, then everything is fine, but otherwise, if any code after
2573          * this point needs to emit additional thread switches, then we will
2574          * switch the strategy to locking the scoreboard on the first thread
2575          * switch instead -- see vir_emit_thrsw().
2576          */
2577         if (!c->emitted_tlb_load) {
2578                 if (!c->last_thrsw_at_top_level)
2579                         vir_emit_thrsw(c);
2580 
2581                 c->emitted_tlb_load = true;
2582         }
2583 
2584         struct qreg *color_reads_for_sample =
2585                 &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4];
2586 
2587         if (color_reads_for_sample[component].file == QFILE_NULL) {
2588                 enum pipe_format rt_format = c->fs_key->color_fmt[rt].format;
2589                 int num_components =
2590                         util_format_get_nr_components(rt_format);
2591 
2592                 const bool swap_rb = c->fs_key->swap_color_rb & (1 << rt);
2593                 if (swap_rb)
2594                         num_components = MAX2(num_components, 3);
2595 
2596                 nir_variable *var = c->output_color_var[rt];
2597                 enum glsl_base_type type = glsl_get_base_type(var->type);
2598 
2599                 bool is_int_format = type == GLSL_TYPE_INT ||
2600                                      type == GLSL_TYPE_UINT;
2601 
2602                 bool is_32b_tlb_format = is_int_format ||
2603                                          (c->fs_key->f32_color_rb & (1 << rt));
2604 
2605                 int num_samples = c->fs_key->msaa ? V3D_MAX_SAMPLES : 1;
2606 
2607                 uint32_t conf = 0xffffff00;
2608                 conf |= c->fs_key->msaa ? TLB_SAMPLE_MODE_PER_SAMPLE :
2609                                           TLB_SAMPLE_MODE_PER_PIXEL;
2610                 conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
2611 
2612                 if (is_32b_tlb_format) {
2613                         /* The F32 vs I32 distinction was dropped in 4.2. */
2614                         conf |= (c->devinfo->ver < 42 && is_int_format) ?
2615                                 TLB_TYPE_I32_COLOR : TLB_TYPE_F32_COLOR;
2616 
2617                         conf |= ((num_components - 1) <<
2618                                  TLB_VEC_SIZE_MINUS_1_SHIFT);
2619                 } else {
2620                         conf |= TLB_TYPE_F16_COLOR;
2621                         conf |= TLB_F16_SWAP_HI_LO;
2622 
2623                         if (num_components >= 3)
2624                                 conf |= TLB_VEC_SIZE_4_F16;
2625                         else
2626                                 conf |= TLB_VEC_SIZE_2_F16;
2627                 }
2628 
2629 
2630                 for (int i = 0; i < num_samples; i++) {
2631                         struct qreg r, g, b, a;
2632                         if (is_32b_tlb_format) {
2633                                 r = conf != 0xffffffff && i == 0 ?
2634                                         vir_TLBU_COLOR_READ(c, conf) :
2635                                         vir_TLB_COLOR_READ(c);
2636                                 if (num_components >= 2)
2637                                         g = vir_TLB_COLOR_READ(c);
2638                                 if (num_components >= 3)
2639                                         b = vir_TLB_COLOR_READ(c);
2640                                 if (num_components >= 4)
2641                                         a = vir_TLB_COLOR_READ(c);
2642                         } else {
2643                                 struct qreg rg = conf != 0xffffffff && i == 0 ?
2644                                         vir_TLBU_COLOR_READ(c, conf) :
2645                                         vir_TLB_COLOR_READ(c);
2646                                 r = vir_FMOV(c, rg);
2647                                 vir_set_unpack(c->defs[r.index], 0,
2648                                                V3D_QPU_UNPACK_L);
2649                                 g = vir_FMOV(c, rg);
2650                                 vir_set_unpack(c->defs[g.index], 0,
2651                                                V3D_QPU_UNPACK_H);
2652 
2653                                 if (num_components > 2) {
2654                                     struct qreg ba = vir_TLB_COLOR_READ(c);
2655                                     b = vir_FMOV(c, ba);
2656                                     vir_set_unpack(c->defs[b.index], 0,
2657                                                    V3D_QPU_UNPACK_L);
2658                                     a = vir_FMOV(c, ba);
2659                                     vir_set_unpack(c->defs[a.index], 0,
2660                                                    V3D_QPU_UNPACK_H);
2661                                 }
2662                         }
2663 
2664                         struct qreg *color_reads =
2665                                 &c->color_reads[(rt * V3D_MAX_SAMPLES + i) * 4];
2666 
2667                         color_reads[0] = swap_rb ? b : r;
2668                         if (num_components >= 2)
2669                                 color_reads[1] = g;
2670                         if (num_components >= 3)
2671                                 color_reads[2] = swap_rb ? r : b;
2672                         if (num_components >= 4)
2673                                 color_reads[3] = a;
2674                 }
2675         }
2676 
2677         assert(color_reads_for_sample[component].file != QFILE_NULL);
2678         ntq_store_def(c, &instr->def, 0,
2679                       vir_MOV(c, color_reads_for_sample[component]));
2680 }
2681 
2682 static bool
2683 ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr);
2684 
2685 static bool
2686 try_emit_uniform(struct v3d_compile *c,
2687                  int offset,
2688                  int num_components,
2689                  nir_def *def,
2690                  enum quniform_contents contents)
2691 {
2692         /* Even though ldunif is strictly 32-bit we can still use it
2693          * to load scalar 8-bit/16-bit uniforms so long as their offset
2694          * is 32-bit aligned. In this case, ldunif would still load
2695          * 32-bit into the destination with the 8-bit/16-bit uniform
2696          * data in the LSB and garbage in the MSB, but that is fine
2697          * because we should only be accessing the valid bits of the
2698          * destination.
2699          *
2700          * FIXME: if in the future we improve our register allocator to
2701          * pack 2 16-bit variables in the MSB and LSB of the same
2702          * register then this optimization would not be valid as is,
2703          * since the load clobbers the MSB.
2704          */
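/* Worked example (hypothetical offsets): a 16-bit scalar uniform at byte
 * offset 8 is dword aligned, so it maps to uniform slot 8 / 4 = 2 and a
 * single ldunif returns it in the low 16 bits of the destination.  The same
 * uniform at byte offset 6 fails the offset % 4 check below, and the caller
 * has to fall back to a general TMU load instead.
 */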
2705         if (offset % 4 != 0)
2706                 return false;
2707 
2708         /* We need dwords */
2709         offset = offset / 4;
2710 
2711         for (int i = 0; i < num_components; i++) {
2712                 ntq_store_def(c, def, i, vir_uniform(c, contents, offset + i));
2713         }
2714 
2715         return true;
2716 }
2717 
2718 static void
2719 ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
2720 {
2721         /* We scalarize general TMU access for anything that is not 32-bit. */
2722         assert(instr->def.bit_size == 32 ||
2723                instr->num_components == 1);
2724 
2725         /* Try to emit ldunif if possible, otherwise fallback to general TMU */
2726         if (nir_src_is_const(instr->src[0])) {
2727                 int offset = (nir_intrinsic_base(instr) +
2728                              nir_src_as_uint(instr->src[0]));
2729 
2730                 if (try_emit_uniform(c, offset, instr->num_components,
2731                                      &instr->def, QUNIFORM_UNIFORM)) {
2732                         return;
2733                 }
2734         }
2735 
2736         if (!ntq_emit_load_unifa(c, instr)) {
2737                 ntq_emit_tmu_general(c, instr, false, false);
2738                 c->has_general_tmu_load = true;
2739         }
2740 }
2741 
2742 static bool
2743 ntq_emit_inline_ubo_load(struct v3d_compile *c, nir_intrinsic_instr *instr)
2744 {
2745         if (c->compiler->max_inline_uniform_buffers <= 0)
2746                 return false;
2747 
2748         /* Regular UBOs start after inline UBOs */
2749         uint32_t index = nir_src_as_uint(instr->src[0]);
2750         if (index >= c->compiler->max_inline_uniform_buffers)
2751                 return false;
2752 
2753         /* We scalarize general TMU access for anything that is not 32-bit */
2754         assert(instr->def.bit_size == 32 ||
2755                instr->num_components == 1);
2756 
2757         if (nir_src_is_const(instr->src[1])) {
2758                 int offset = nir_src_as_uint(instr->src[1]);
2759                 if (try_emit_uniform(c, offset, instr->num_components,
2760                                      &instr->def,
2761                                      QUNIFORM_INLINE_UBO_0 + index)) {
2762                         return true;
2763                 }
2764         }
2765 
2766         /* Fallback to regular UBO load */
2767         return false;
2768 }
2769 
2770 static void
2771 ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
2772 {
2773         /* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset).
2774          *
2775          * Right now the driver sets support_indirect_inputs even
2776          * if we don't support non-uniform offsets because we also set the
2777          * lower_all_io_to_temps option in the NIR compiler. This ensures that
2778          * any indirect indexing on in/out variables is turned into indirect
2779          * indexing on temporary variables instead, that we handle by lowering
2780          * to scratch. If we implement non-uniform offset here we might be able
2781          * to avoid the temp and scratch lowering, which involves copying from
2782          * the input to the temp variable, possibly making code more optimal.
2783          */
2784         unsigned offset =
2785                 nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]);
2786 
2787         if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
2788                /* Emit the LDVPM directly now, rather than at the top
2789                 * of the shader like we did for V3D 3.x (which needs
2790                 * vpmsetup when not just taking the next offset).
2791                 *
2792                 * Note that delaying like this may introduce stalls,
2793                 * as LDVPMV takes a minimum of 1 instruction but may
2794                 * be slower if the VPM unit is busy with another QPU.
2795                 */
2796                int index = 0;
2797                if (BITSET_TEST(c->s->info.system_values_read,
2798                                SYSTEM_VALUE_INSTANCE_ID)) {
2799                       index++;
2800                }
2801                if (BITSET_TEST(c->s->info.system_values_read,
2802                                SYSTEM_VALUE_BASE_INSTANCE)) {
2803                       index++;
2804                }
2805                if (BITSET_TEST(c->s->info.system_values_read,
2806                                SYSTEM_VALUE_VERTEX_ID)) {
2807                       index++;
2808                }
2809 
2810                for (int i = 0; i < offset; i++) {
2811                       /* GFXH-1602: if any builtins (vid, iid, etc) are read then
2812                        * attribute 0 must be active (size > 0). When we hit this,
2813                        * the driver is expected to program attribute 0 to have a
2814                        * size of 1, so here we need to add that.
2815                        */
2816                       if (i == 0 && c->vs_key->is_coord &&
2817                           c->vattr_sizes[i] == 0 && index > 0) {
2818                          index++;
2819                       } else {
2820                          index += c->vattr_sizes[i];
2821                       }
2822                }
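               /* For illustration (hypothetical layout): a coordinate shader
                * that only reads gl_VertexID starts with index = 1, and if
                * attribute 0 has size 0 the GFXH-1602 workaround above counts
                * it as size 1, so an input at driver location 1 ends up at
                * VPM index 1 + 1 = 2 (plus its component).
                */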
2823 
2824                index += nir_intrinsic_component(instr);
2825                for (int i = 0; i < instr->num_components; i++) {
2826                       struct qreg vpm_offset = vir_uniform_ui(c, index++);
2827                       ntq_store_def(c, &instr->def, i,
2828                                     vir_LDVPMV_IN(c, vpm_offset));
2829                 }
2830         } else {
2831                 for (int i = 0; i < instr->num_components; i++) {
2832                         int comp = nir_intrinsic_component(instr) + i;
2833                         struct qreg input = c->inputs[offset * 4 + comp];
2834                         ntq_store_def(c, &instr->def, i, vir_MOV(c, input));
2835 
2836                         if (c->s->info.stage == MESA_SHADER_FRAGMENT &&
2837                             input.file == c->payload_z.file &&
2838                             input.index == c->payload_z.index) {
2839                                 c->reads_z = true;
2840                         }
2841                 }
2842         }
2843 }
2844 
2845 static void
2846 ntq_emit_per_sample_color_write(struct v3d_compile *c,
2847                                 nir_intrinsic_instr *instr)
2848 {
2849         assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d);
2850 
2851         unsigned rt = nir_src_as_uint(instr->src[1]);
2852         assert(rt < V3D_MAX_DRAW_BUFFERS);
2853 
2854         unsigned sample_idx = nir_intrinsic_base(instr);
2855         assert(sample_idx < V3D_MAX_SAMPLES);
2856 
2857         unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4;
2858         for (int i = 0; i < instr->num_components; i++) {
2859                 c->sample_colors[offset + i] =
2860                         vir_MOV(c, ntq_get_src(c, instr->src[0], i));
2861         }
2862 }
2863 
2864 static void
2865 ntq_emit_color_write(struct v3d_compile *c,
2866                      nir_intrinsic_instr *instr)
2867 {
2868         unsigned offset = (nir_intrinsic_base(instr) +
2869                            nir_src_as_uint(instr->src[1])) * 4 +
2870                           nir_intrinsic_component(instr);
2871         for (int i = 0; i < instr->num_components; i++) {
2872                 c->outputs[offset + i] =
2873                         vir_MOV(c, ntq_get_src(c, instr->src[0], i));
2874         }
2875 }
2876 
2877 static void
2878 emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr)
2879 {
2880         assert(instr->num_components == 1);
2881 
2882         struct qreg offset = ntq_get_src(c, instr->src[1], 0);
2883 
2884         uint32_t base_offset = nir_intrinsic_base(instr);
2885 
2886         if (base_offset)
2887                 offset = vir_ADD(c, vir_uniform_ui(c, base_offset), offset);
2888 
2889         /* Usually, for VS or FS, we only emit outputs once at program end so
2890          * our VPM writes are never in non-uniform control flow, but this
2891          * is not true for GS, where we are emitting multiple vertices.
2892          */
2893         if (vir_in_nonuniform_control_flow(c)) {
2894                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
2895                            V3D_QPU_PF_PUSHZ);
2896         }
2897 
2898         struct qreg val = ntq_get_src(c, instr->src[0], 0);
2899 
2900         /* The offset isn’t necessarily dynamically uniform for a geometry
2901          * shader. This can happen if the shader sometimes doesn’t emit one of
2902          * the vertices. In that case subsequent vertices will be written to
2903          * different offsets in the VPM and we need to use the scatter write
2904          * instruction to have a different offset for each lane.
2905          */
2906          bool is_uniform_offset =
2907                  !vir_in_nonuniform_control_flow(c) &&
2908                  !nir_src_is_divergent(&instr->src[1]);
2909          vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset);
2910 
2911         if (vir_in_nonuniform_control_flow(c)) {
2912                 struct qinst *last_inst =
2913                         (struct qinst *)c->cur_block->instructions.prev;
2914                 vir_set_cond(last_inst, V3D_QPU_COND_IFA);
2915         }
2916 }
2917 
2918 static void
2919 emit_store_output_vs(struct v3d_compile *c, nir_intrinsic_instr *instr)
2920 {
2921         assert(c->s->info.stage == MESA_SHADER_VERTEX);
2922         assert(instr->num_components == 1);
2923 
2924         uint32_t base = nir_intrinsic_base(instr);
2925         struct qreg val = ntq_get_src(c, instr->src[0], 0);
2926 
2927         if (nir_src_is_const(instr->src[1])) {
2928                 vir_VPM_WRITE(c, val,
2929                               base + nir_src_as_uint(instr->src[1]));
2930         } else {
2931                 struct qreg offset = vir_ADD(c,
2932                                              ntq_get_src(c, instr->src[1], 1),
2933                                              vir_uniform_ui(c, base));
2934                 bool is_uniform_offset =
2935                         !vir_in_nonuniform_control_flow(c) &&
2936                         !nir_src_is_divergent(&instr->src[1]);
2937                 vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset);
2938         }
2939 }
2940 
2941 static void
2942 ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr)
2943 {
2944         if (c->s->info.stage == MESA_SHADER_FRAGMENT)
2945                ntq_emit_color_write(c, instr);
2946         else if (c->s->info.stage == MESA_SHADER_GEOMETRY)
2947                emit_store_output_gs(c, instr);
2948         else
2949                emit_store_output_vs(c, instr);
2950 }
2951 
2952 /**
2953  * This implementation is based on v3d_sample_{x,y}_offset() from
2954  * v3d_sample_offset.h.
2955  */
2956 static void
2957 ntq_get_sample_offset(struct v3d_compile *c, struct qreg sample_idx,
2958                       struct qreg *sx, struct qreg *sy)
2959 {
2960         sample_idx = vir_ITOF(c, sample_idx);
2961 
2962         struct qreg offset_x =
2963                 vir_FADD(c, vir_uniform_f(c, -0.125f),
2964                             vir_FMUL(c, sample_idx,
2965                                         vir_uniform_f(c, 0.5f)));
2966         vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(),
2967                                     vir_uniform_f(c, 2.0f), sample_idx),
2968                    V3D_QPU_PF_PUSHC);
2969         offset_x = vir_SEL(c, V3D_QPU_COND_IFA,
2970                               vir_FSUB(c, offset_x, vir_uniform_f(c, 1.25f)),
2971                               offset_x);
2972 
2973         struct qreg offset_y =
2974                    vir_FADD(c, vir_uniform_f(c, -0.375f),
2975                                vir_FMUL(c, sample_idx,
2976                                            vir_uniform_f(c, 0.25f)));
2977         *sx = offset_x;
2978         *sy = offset_y;
2979 }
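/* Evaluating the expressions above for each sample index (assuming the
 * FCMP/PUSHC condition applies the 1.25 subtraction for sample_idx >= 2)
 * gives the per-sample offsets from the pixel center:
 *
 *   sample 0: (-0.125, -0.375)
 *   sample 1: ( 0.375, -0.125)
 *   sample 2: (-0.375,  0.125)
 *   sample 3: ( 0.125,  0.375)
 */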
2980 
2981 /**
2982  * This implementation is based on get_centroid_offset() from fep.c.
2983  */
2984 static void
2985 ntq_get_barycentric_centroid(struct v3d_compile *c,
2986                              struct qreg *out_x,
2987                              struct qreg *out_y)
2988 {
2989         struct qreg sample_mask;
2990         if (c->output_sample_mask_index != -1)
2991                 sample_mask = c->outputs[c->output_sample_mask_index];
2992         else
2993                 sample_mask = vir_MSF(c);
2994 
2995         struct qreg i0 = vir_uniform_ui(c, 0);
2996         struct qreg i1 = vir_uniform_ui(c, 1);
2997         struct qreg i2 = vir_uniform_ui(c, 2);
2998         struct qreg i3 = vir_uniform_ui(c, 3);
2999         struct qreg i4 = vir_uniform_ui(c, 4);
3000         struct qreg i8 = vir_uniform_ui(c, 8);
3001 
3002         /* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */
3003         struct qreg F = vir_uniform_ui(c, 0);
3004         struct qreg T = vir_uniform_ui(c, ~0);
3005         struct qreg s0 = vir_AND(c, sample_mask, i1);
3006         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
3007         s0 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
3008         struct qreg s1 = vir_AND(c, sample_mask, i2);
3009         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
3010         s1 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
3011         struct qreg s2 = vir_AND(c, sample_mask, i4);
3012         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
3013         s2 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
3014         struct qreg s3 = vir_AND(c, sample_mask, i8);
3015         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ);
3016         s3 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
3017 
3018         /* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */
3019         struct qreg sample_idx = i3;
3020         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
3021         sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i1, sample_idx);
3022         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
3023         sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i2, sample_idx);
3024         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
3025         sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i0, sample_idx);
3026 
3027         /* Get offset at selected sample index */
3028         struct qreg offset_x, offset_y;
3029         ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
3030 
3031         /* Select pixel center [offset=(0,0)] if two opposing samples (or none)
3032          * are selected.
3033          */
3034         struct qreg s0_and_s3 = vir_AND(c, s0, s3);
3035         struct qreg s1_and_s2 = vir_AND(c, s1, s2);
3036 
3037         struct qreg use_center = vir_XOR(c, sample_mask, vir_uniform_ui(c, 0));
3038         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);
3039         use_center = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
3040         use_center = vir_OR(c, use_center, s0_and_s3);
3041         use_center = vir_OR(c, use_center, s1_and_s2);
3042 
3043         struct qreg zero = vir_uniform_f(c, 0.0f);
3044         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);
3045         offset_x = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_x);
3046         offset_y = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_y);
3047 
3048         *out_x = offset_x;
3049         *out_y = offset_y;
3050 }
3051 
3052 static struct qreg
3053 ntq_emit_load_interpolated_input(struct v3d_compile *c,
3054                                  struct qreg p,
3055                                  struct qreg C,
3056                                  struct qreg offset_x,
3057                                  struct qreg offset_y,
3058                                  unsigned mode)
3059 {
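        /* Descriptive note: the code below is a first-order expansion using
         * screen-space derivatives.  Moving the interpolant p from where it
         * was originally evaluated to the requested (offset_x, offset_y)
         * relative to the pixel center uses
         * p + dFdx(p) * scaleX + dFdy(p) * scaleY, where scaleX/scaleY fold
         * together (0.5 - sample_offset) and the caller-provided offset.
         * Non-perspective modes add the constant term C directly; otherwise
         * w (payload_w) is re-interpolated the same way and multiplied in
         * before adding C.
         */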
3060         if (mode == INTERP_MODE_FLAT)
3061                 return C;
3062 
3063         struct qreg sample_offset_x =
3064                 vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));
3065         struct qreg sample_offset_y =
3066                 vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
3067 
3068         struct qreg scaleX =
3069                 vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_x),
3070                             offset_x);
3071         struct qreg scaleY =
3072                 vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_y),
3073                             offset_y);
3074 
3075         struct qreg pInterp =
3076                 vir_FADD(c, p, vir_FADD(c, vir_FMUL(c, vir_FDX(c, p), scaleX),
3077                                            vir_FMUL(c, vir_FDY(c, p), scaleY)));
3078 
3079         if (mode == INTERP_MODE_NOPERSPECTIVE)
3080                 return vir_FADD(c, pInterp, C);
3081 
3082         struct qreg w = c->payload_w;
3083         struct qreg wInterp =
3084                 vir_FADD(c, w, vir_FADD(c, vir_FMUL(c, vir_FDX(c, w), scaleX),
3085                                            vir_FMUL(c, vir_FDY(c, w), scaleY)));
3086 
3087         return vir_FADD(c, vir_FMUL(c, pInterp, wInterp), C);
3088 }
3089 
3090 static void
3091 emit_ldunifa(struct v3d_compile *c, struct qreg *result)
3092 {
3093         struct qinst *ldunifa =
3094                 vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef);
3095         ldunifa->qpu.sig.ldunifa = true;
3096         if (result)
3097                 *result = vir_emit_def(c, ldunifa);
3098         else
3099                 vir_emit_nondef(c, ldunifa);
3100         c->current_unifa_offset += 4;
3101 }
3102 
3103 /* Checks if the value of a nir src is derived from a nir register */
3104 static bool
3105 nir_src_derived_from_reg(nir_src src)
3106 {
3107         nir_def *def = src.ssa;
3108         if (nir_load_reg_for_def(def))
3109                 return true;
3110 
3111         nir_instr *parent = def->parent_instr;
3112         switch (parent->type) {
3113         case nir_instr_type_alu: {
3114                 nir_alu_instr *alu = nir_instr_as_alu(parent);
3115                 int num_srcs = nir_op_infos[alu->op].num_inputs;
3116                 for (int i = 0; i < num_srcs; i++) {
3117                         if (nir_src_derived_from_reg(alu->src[i].src))
3118                                 return true;
3119                 }
3120                 return false;
3121         }
3122         case nir_instr_type_intrinsic: {
3123                 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
3124                 int num_srcs = nir_intrinsic_infos[intr->intrinsic].num_srcs;
3125                 for (int i = 0; i < num_srcs; i++) {
3126                         if (nir_src_derived_from_reg(intr->src[i]))
3127                                 return true;
3128                 }
3129                 return false;
3130         }
3131         case nir_instr_type_load_const:
3132         case nir_instr_type_undef:
3133                 return false;
3134         default:
3135                 /* By default we assume it may come from a register, the above
3136                  * cases should be able to handle the majority of situations
3137                  * though.
3138                  */
3139                 return true;
3140         };
3141 }
3142 
3143 static bool
3144 ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
3145 {
3146         assert(instr->intrinsic == nir_intrinsic_load_ubo ||
3147                instr->intrinsic == nir_intrinsic_load_ssbo ||
3148                instr->intrinsic == nir_intrinsic_load_uniform);
3149 
3150         bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform;
3151         bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo;
3152         bool is_ssbo = instr->intrinsic == nir_intrinsic_load_ssbo;
3153 
3154         /* Every ldunifa auto-increments the unifa address by 4 bytes, so our
3155          * current unifa offset is 4 bytes ahead of the offset of the last load.
3156          */
3157         static const int32_t max_unifa_skip_dist =
3158                 MAX_UNIFA_SKIP_DISTANCE - 4;
3159 
3160         /* We can only use unifa if the offset is uniform */
3161         nir_src offset = is_uniform ? instr->src[0] : instr->src[1];
3162         if (nir_src_is_divergent(&offset))
3163                 return false;
3164 
3165         /* Emitting loads from unifa may not be safe under non-uniform control
3166          * flow. It seems the address that is used to write to the unifa
3167          * register is taken from the first lane and if that lane is disabled
3168          * by control flow then the value we read may be bogus and lead to
3169          * invalid memory accesses on follow-up ldunifa instructions. However,
3170          * ntq_store_def only emits conditional writes for nir registers, so as
3171          * long as we can be certain that the offset isn't derived from a
3172          * load_reg we should be fine.
3173          *
3174          * The following CTS test can be used to trigger the problem, which
3175          * causes a GMP violation in the sim without this check:
3176          * dEQP-VK.subgroups.ballot_broadcast.graphics.subgroupbroadcastfirst_int
3177          */
3178         if (vir_in_nonuniform_control_flow(c) &&
3179             nir_src_derived_from_reg(offset)) {
3180                 return false;
3181         }
3182 
3183         /* We can only use unifa with SSBOs if they are read-only. Otherwise
3184          * ldunifa won't see the shader writes to that address (possibly
3185          * because ldunifa doesn't read from the L2T cache).
3186          */
3187         if (is_ssbo && !(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE))
3188                 return false;
3189 
3190         /* Just as with SSBOs, we can't use ldunifa to read indirect uniforms
3191          * that may have been written to scratch using the TMU.
3192          */
3193         bool dynamic_src = !nir_src_is_const(offset);
3194         if (is_uniform && dynamic_src && c->s->scratch_size > 0)
3195                 return false;
3196 
3197         uint32_t const_offset = dynamic_src ? 0 : nir_src_as_uint(offset);
3198         if (is_uniform)
3199                 const_offset += nir_intrinsic_base(instr);
3200 
3201         /* ldunifa is a 32-bit load instruction so we can only use it with
3202          * 32-bit aligned addresses. We always produce 32-bit aligned addresses
3203          * except for types smaller than 32-bit, so in these cases we can only
3204          * use ldunifa if we can verify alignment, which we can only do for
3205          * loads with a constant offset.
3206          */
3207         uint32_t bit_size = instr->def.bit_size;
3208         uint32_t value_skips = 0;
3209         if (bit_size < 32) {
3210                 if (dynamic_src) {
3211                         return false;
3212                 } else if (const_offset % 4 != 0) {
3213                         /* If we are loading from an unaligned offset, fix
3214                          * alignment and skip over unused elements in result.
3215                          */
3216                         value_skips = (const_offset % 4) / (bit_size / 8);
3217                         const_offset &= ~0x3;
3218                 }
3219         }
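        /* Worked example (hypothetical load): a 16-bit load at const_offset 6
         * is realigned to 4, with value_skips = (6 % 4) / (16 / 8) = 1, so
         * the first ldunifa result is shifted right by 16 bits further below
         * before the valid component is masked out.
         */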
3220 
3221         assert((bit_size == 32 && value_skips == 0) ||
3222                (bit_size == 16 && value_skips <= 1) ||
3223                (bit_size == 8  && value_skips <= 3));
3224 
3225         /* Both Vulkan and OpenGL reserve index 0 for uniforms / push
3226          * constants.
3227          */
3228         uint32_t index = is_uniform ? 0 : nir_src_as_uint(instr->src[0]);
3229 
3230         /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 since we use
3231          * index 0 for Gallium's constant buffer (GL) or push constants
3232          * (Vulkan).
3233          */
3234         if (is_ubo)
3235                 index++;
3236 
3237         /* We can only keep track of the last unifa address we used with
3238          * constant offset loads. If the new load targets the same buffer and
3239          * is close enough to the previous load, we can skip the unifa register
3240          * write by emitting dummy ldunifa instructions to update the unifa
3241          * address.
3242          */
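        /* Worked example (hypothetical offsets): after a single 32-bit
         * constant-offset load at byte offset 16 from the same buffer in the
         * same block, current_unifa_offset is 20 because of the ldunifa
         * auto-increment.  A follow-up load at const_offset 32 can then skip
         * the unifa write and instead issue (32 - 20) / 4 = 3 dummy ldunifa
         * instructions before reading the real value.
         */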
3243         bool skip_unifa = false;
3244         uint32_t ldunifa_skips = 0;
3245         if (dynamic_src) {
3246                 c->current_unifa_block = NULL;
3247         } else if (c->cur_block == c->current_unifa_block &&
3248                    c->current_unifa_is_ubo == !is_ssbo &&
3249                    c->current_unifa_index == index &&
3250                    c->current_unifa_offset <= const_offset &&
3251                    c->current_unifa_offset + max_unifa_skip_dist >= const_offset) {
3252                 skip_unifa = true;
3253                 ldunifa_skips = (const_offset - c->current_unifa_offset) / 4;
3254         } else {
3255                 c->current_unifa_block = c->cur_block;
3256                 c->current_unifa_is_ubo = !is_ssbo;
3257                 c->current_unifa_index = index;
3258                 c->current_unifa_offset = const_offset;
3259         }
3260 
3261         if (!skip_unifa) {
3262                 struct qreg base_offset = !is_ssbo ?
3263                         vir_uniform(c, QUNIFORM_UBO_ADDR,
3264                                     v3d_unit_data_create(index, const_offset)) :
3265                         vir_uniform(c, QUNIFORM_SSBO_OFFSET, index);
3266 
3267                 struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA);
3268                 if (!dynamic_src) {
3269                         if (!is_ssbo) {
3270                                 /* Avoid the extra MOV to UNIFA by making
3271                                  * ldunif load directly into it. We can't
3272                                  * do this if we have not actually emitted
3273                                  * ldunif and are instead reusing a previous
3274                                  * one.
3275                                  */
3276                                 struct qinst *inst =
3277                                         (struct qinst *)c->cur_block->instructions.prev;
3278                                 if (inst == c->defs[base_offset.index]) {
3279                                    inst->dst = unifa;
3280                                    c->defs[base_offset.index] = NULL;
3281                                 } else {
3282                                    vir_MOV_dest(c, unifa, base_offset);
3283                                 }
3284                         } else {
3285                                if (const_offset != 0) {
3286                                         vir_ADD_dest(c, unifa, base_offset,
3287                                                      vir_uniform_ui(c, const_offset));
3288                                } else {
3289                                         vir_MOV_dest(c, unifa, base_offset);
3290                                }
3291                         }
3292                 } else {
3293                         vir_ADD_dest(c, unifa, base_offset,
3294                                      ntq_get_src(c, offset, 0));
3295                 }
3296         } else {
3297                 for (int i = 0; i < ldunifa_skips; i++)
3298                         emit_ldunifa(c, NULL);
3299         }
3300 
3301         uint32_t num_components = nir_intrinsic_dest_components(instr);
3302         for (uint32_t i = 0; i < num_components; ) {
3303                 struct qreg data;
3304                 emit_ldunifa(c, &data);
3305 
3306                 if (bit_size == 32) {
3307                         assert(value_skips == 0);
3308                         ntq_store_def(c, &instr->def, i, vir_MOV(c, data));
3309                         i++;
3310                 } else {
3311                         assert((bit_size == 16 && value_skips <= 1) ||
3312                                (bit_size ==  8 && value_skips <= 3));
3313 
3314                         /* If we have any values to skip, shift to the first
3315                          * valid value in the ldunifa result.
3316                          */
3317                         if (value_skips > 0) {
3318                                 data = vir_SHR(c, data,
3319                                                vir_uniform_ui(c, bit_size *
3320                                                                  value_skips));
3321                         }
3322 
3323                         /* Check how many valid components we have discounting
3324                          * read components to skip.
3325                          */
3326                         uint32_t valid_count = (32 / bit_size) - value_skips;
3327                         assert((bit_size == 16 && valid_count <= 2) ||
3328                                (bit_size ==  8 && valid_count <= 4));
3329                         assert(valid_count > 0);
3330 
3331                         /* Process the valid components */
3332                         do {
3333                                 struct qreg tmp;
3334                                 uint32_t mask = (1 << bit_size) - 1;
3335                                 tmp = vir_AND(c, vir_MOV(c, data),
3336                                               vir_uniform_ui(c, mask));
3337                                 ntq_store_def(c, &instr->def, i,
3338                                               vir_MOV(c, tmp));
3339                                 i++;
3340                                 valid_count--;
3341 
3342                                 /* Shift to next component */
3343                                 if (i < num_components && valid_count > 0) {
3344                                         data = vir_SHR(c, data,
3345                                                        vir_uniform_ui(c, bit_size));
3346                                 }
3347                         } while (i < num_components && valid_count > 0);
3348                 }
3349         }
3350 
3351         return true;
3352 }
3353 
3354 static inline struct qreg
3355 emit_load_local_invocation_index(struct v3d_compile *c)
3356 {
3357         return vir_SHR(c, c->cs_payload[1],
3358                        vir_uniform_ui(c, 32 - c->local_invocation_index_bits));
3359 }
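/* For illustration: with c->local_invocation_index_bits == 8 (hypothetical),
 * the local invocation index occupies the top 8 bits of cs_payload[1] and is
 * recovered above with a shift right by 32 - 8 = 24.
 */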
3360 
3361 /* For the purposes of reduction operations (ballot, alleq, allfeq, bcastf) in
3362  * fragment shaders a lane is considered active if any sample flags are set
3363  * for *any* lane in the same quad, however, we still need to ensure that
3364  * terminated lanes (OpTerminate) are not included. Further, we also need to
3365  * disable lanes that may be disabled because of non-uniform control
3366  * flow.
3367  */
3368 static enum v3d_qpu_cond
3369 setup_subgroup_control_flow_condition(struct v3d_compile *c)
3370 {
3371         assert(c->s->info.stage == MESA_SHADER_FRAGMENT ||
3372                c->s->info.stage == MESA_SHADER_COMPUTE);
3373 
3374         enum v3d_qpu_cond cond = V3D_QPU_COND_NONE;
3375 
3376         /* We need to make sure that terminated lanes in fragment shaders are
3377          * not included. We can identify these lanes by comparing the initial
3378          * sample mask with the current. This fixes:
3379          * dEQP-VK.spirv_assembly.instruction.terminate_invocation.terminate.subgroup_*
3380          */
3381         if (c->s->info.stage == MESA_SHADER_FRAGMENT && c->emitted_discard) {
3382                 vir_set_pf(c, vir_AND_dest(c, vir_nop_reg(), c->start_msf,
3383                                            vir_NOT(c, vir_XOR(c, c->start_msf,
3384                                                               vir_MSF(c)))),
3385                            V3D_QPU_PF_PUSHZ);
3386                 cond = V3D_QPU_COND_IFNA;
3387         }
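        /* For illustration (hypothetical lane): a lane that started with
         * start_msf = 0xf and has since executed OpTerminate now reads
         * MSF = 0, so 0xf & ~(0xf ^ 0) = 0, PUSHZ sets the flag and IFNA
         * drops the lane; a lane whose sample flags are unchanged yields
         * 0xf & ~0 = 0xf and stays included.
         */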
3388 
3389         /* If we are in non-uniform control-flow update the condition to
3390          * also limit lanes to those in the current execution mask.
3391          */
3392         if (vir_in_nonuniform_control_flow(c)) {
3393                 if (cond == V3D_QPU_COND_IFNA) {
3394                         vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3395                                    V3D_QPU_UF_NORNZ);
3396                 } else {
3397                         assert(cond == V3D_QPU_COND_NONE);
3398                         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3399                                    V3D_QPU_PF_PUSHZ);
3400                 }
3401                 cond = V3D_QPU_COND_IFA;
3402         }
3403 
3404         return cond;
3405 }
3406 
3407 static void
3408 emit_compute_barrier(struct v3d_compile *c)
3409 {
3410         /* Ensure we flag the use of the control barrier. NIR's
3411          * gather info pass usually takes care of this, but that
3412          * requires that we call that pass after any other pass
3413          * may emit a control barrier, so this is safer.
3414          */
3415         c->s->info.uses_control_barrier = true;
3416 
3417         /* Emit a TSY op to get all invocations in the workgroup
3418          * (actually supergroup) to block until the last
3419          * invocation reaches the TSY op.
3420          */
3421         vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB));
3422 }
3423 
3424 static void
3425 emit_barrier(struct v3d_compile *c)
3426 {
3427         struct qreg eidx = vir_EIDX(c);
3428 
3429         /* The config for the TSY op should be setup like this:
3430          * - Lane 0: Quorum
3431          * - Lane 2: TSO id
3432          * - Lane 3: TSY opcode
3433          */
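        /* For illustration: after the per-lane writes below, tsy_conf holds
         * 1 in lane 0 (a quorum of one subgroup), the TSO id in lane 2, the
         * opcode 16 (set_quorum_wait_inc_check) in lane 3, and the
         * unconditional initial value 1 in every other lane.
         */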
3434 
3435         /* Lane 0: we want to synchronize across one subgroup. Here we write to
3436          * all lanes unconditionally and will overwrite other lanes below.
3437          */
3438         struct qreg tsy_conf = vir_uniform_ui(c, 1);
3439 
3440         /* Lane 2: TSO id. We choose a general purpose TSO (id=0..64) using the
3441          * current QPU index and thread index to ensure we get a unique one for
3442          * this group of invocations in this core.
3443          */
3444         struct qreg tso_id =
3445                 vir_AND(c, vir_TIDX(c), vir_uniform_ui(c, 0x0000003f));
3446         vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 2)),
3447                    V3D_QPU_PF_PUSHZ);
3448         vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tso_id);
3449 
3450         /* Lane 3: TSY opcode (set_quorum_wait_inc_check) */
3451         struct qreg tsy_op = vir_uniform_ui(c, 16);
3452         vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 3)),
3453                    V3D_QPU_PF_PUSHZ);
3454         vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tsy_op);
3455 
3456         /* Emit TSY sync */
3457         vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB), tsy_conf);
3458 }
3459 
3460 static void
3461 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
3462 {
3463         switch (instr->intrinsic) {
3464         case nir_intrinsic_decl_reg:
3465         case nir_intrinsic_load_reg:
3466         case nir_intrinsic_store_reg:
3467                 break; /* Ignore these */
3468 
3469         case nir_intrinsic_load_uniform:
3470                 ntq_emit_load_uniform(c, instr);
3471                 break;
3472 
3473         case nir_intrinsic_load_global:
3474         case nir_intrinsic_load_global_constant:
3475                 ntq_emit_tmu_general(c, instr, false, true);
3476                 c->has_general_tmu_load = true;
3477                 break;
3478 
3479         case nir_intrinsic_load_ubo:
3480            if (ntq_emit_inline_ubo_load(c, instr))
3481                    break;
3482            FALLTHROUGH;
3483         case nir_intrinsic_load_ssbo:
3484                 if (!ntq_emit_load_unifa(c, instr)) {
3485                         ntq_emit_tmu_general(c, instr, false, false);
3486                         c->has_general_tmu_load = true;
3487                 }
3488                 break;
3489 
3490         case nir_intrinsic_store_ssbo:
3491         case nir_intrinsic_ssbo_atomic:
3492         case nir_intrinsic_ssbo_atomic_swap:
3493                 ntq_emit_tmu_general(c, instr, false, false);
3494                 break;
3495 
3496         case nir_intrinsic_store_global:
3497         case nir_intrinsic_global_atomic:
3498         case nir_intrinsic_global_atomic_swap:
3499                 ntq_emit_tmu_general(c, instr, false, true);
3500                 break;
3501 
3502         case nir_intrinsic_shared_atomic:
3503         case nir_intrinsic_shared_atomic_swap:
3504         case nir_intrinsic_store_shared:
3505         case nir_intrinsic_store_scratch:
3506                 ntq_emit_tmu_general(c, instr, true, false);
3507                 break;
3508 
3509         case nir_intrinsic_load_scratch:
3510         case nir_intrinsic_load_shared:
3511                 ntq_emit_tmu_general(c, instr, true, false);
3512                 c->has_general_tmu_load = true;
3513                 break;
3514 
3515         case nir_intrinsic_image_store:
3516         case nir_intrinsic_image_atomic:
3517         case nir_intrinsic_image_atomic_swap:
3518                 v3d_vir_emit_image_load_store(c, instr);
3519                 break;
3520 
3521         case nir_intrinsic_image_load:
3522                 v3d_vir_emit_image_load_store(c, instr);
3523                 /* Not really a general TMU load, but we only use this flag
3524                  * for NIR scheduling and we do schedule these under the same
3525                  * policy as general TMU.
3526                  */
3527                 c->has_general_tmu_load = true;
3528                 break;
3529 
3530         case nir_intrinsic_get_ssbo_size:
3531                 ntq_store_def(c, &instr->def, 0,
3532                               vir_uniform(c, QUNIFORM_GET_SSBO_SIZE,
3533                                           nir_src_comp_as_uint(instr->src[0], 0)));
3534                 break;
3535 
3536         case nir_intrinsic_get_ubo_size:
3537                 ntq_store_def(c, &instr->def, 0,
3538                               vir_uniform(c, QUNIFORM_GET_UBO_SIZE,
3539                                           nir_src_comp_as_uint(instr->src[0], 0)));
3540                 break;
3541 
3542         case nir_intrinsic_load_user_clip_plane:
3543                 for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
3544                         ntq_store_def(c, &instr->def, i,
3545                                       vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
3546                                                   nir_intrinsic_ucp_id(instr) *
3547                                                   4 + i));
3548                 }
3549                 break;
3550 
3551         case nir_intrinsic_load_viewport_x_scale:
3552                 ntq_store_def(c, &instr->def, 0,
3553                               vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0));
3554                 break;
3555 
3556         case nir_intrinsic_load_viewport_y_scale:
3557                 ntq_store_def(c, &instr->def, 0,
3558                               vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0));
3559                 break;
3560 
3561         case nir_intrinsic_load_viewport_z_scale:
3562                 ntq_store_def(c, &instr->def, 0,
3563                               vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0));
3564                 break;
3565 
3566         case nir_intrinsic_load_viewport_z_offset:
3567                 ntq_store_def(c, &instr->def, 0,
3568                               vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0));
3569                 break;
3570 
3571         case nir_intrinsic_load_line_coord:
3572                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->line_x));
3573                 break;
3574 
3575         case nir_intrinsic_load_line_width:
3576                 ntq_store_def(c, &instr->def, 0,
3577                               vir_uniform(c, QUNIFORM_LINE_WIDTH, 0));
3578                 break;
3579 
3580         case nir_intrinsic_load_aa_line_width:
3581                 ntq_store_def(c, &instr->def, 0,
3582                               vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0));
3583                 break;
3584 
3585         case nir_intrinsic_load_sample_mask_in:
3586                 ntq_store_def(c, &instr->def, 0, vir_MSF(c));
3587                 break;
3588 
3589         case nir_intrinsic_load_helper_invocation:
3590                 vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
3591                 struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
3592                 ntq_store_def(c, &instr->def, 0, qdest);
3593                 break;
3594 
3595         case nir_intrinsic_load_front_face:
3596                 /* The register contains 0 (front) or 1 (back), and we need to
3597                  * turn it into a NIR bool where true means front.
3598                  */
3599                 ntq_store_def(c, &instr->def, 0,
3600                               vir_ADD(c,
3601                                       vir_uniform_ui(c, -1),
3602                                       vir_REVF(c)));
3603                 break;
3604 
3605         case nir_intrinsic_load_base_instance:
3606                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->biid));
3607                 break;
3608 
3609         case nir_intrinsic_load_instance_id:
3610                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->iid));
3611                 break;
3612 
3613         case nir_intrinsic_load_vertex_id:
3614                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->vid));
3615                 break;
3616 
3617         case nir_intrinsic_load_draw_id:
3618                 ntq_store_def(c, &instr->def, 0, vir_uniform(c, QUNIFORM_DRAW_ID, 0));
3619                 break;
3620 
3621         case nir_intrinsic_load_tlb_color_brcm:
3622                 vir_emit_tlb_color_read(c, instr);
3623                 break;
3624 
3625         case nir_intrinsic_load_fep_w_v3d:
3626                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->payload_w));
3627                 break;
3628 
3629         case nir_intrinsic_load_input:
3630                 ntq_emit_load_input(c, instr);
3631                 break;
3632 
3633         case nir_intrinsic_store_tlb_sample_color_v3d:
3634                ntq_emit_per_sample_color_write(c, instr);
3635                break;
3636 
3637        case nir_intrinsic_store_output:
3638                 ntq_emit_store_output(c, instr);
3639                 break;
3640 
3641         case nir_intrinsic_image_size:
3642                 ntq_emit_image_size(c, instr);
3643                 break;
3644 
3645         /* FIXME: the Vulkan and SPIR-V specs specify that OpTerminate (which
3646          * is intended to match the semantics of GLSL's discard) should
3647          * terminate the invocation immediately. Our implementation doesn't
3648          * do that. What we do is actually a demote by removing the invocations
3649          * from the sample mask. Maybe we could be more strict and force an
3650          * early termination by emitting a (maybe conditional) jump to the
3651          * end section of the fragment shader for affected invocations.
3652          */
3653         case nir_intrinsic_terminate:
3654                 c->emitted_discard = true;
3655                 FALLTHROUGH;
3656         case nir_intrinsic_demote:
3657                 ntq_flush_tmu(c);
3658 
3659                 if (vir_in_nonuniform_control_flow(c)) {
3660                         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3661                                    V3D_QPU_PF_PUSHZ);
3662                         vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
3663                                                      vir_uniform_ui(c, 0)),
3664                                 V3D_QPU_COND_IFA);
3665                 } else {
3666                         vir_SETMSF_dest(c, vir_nop_reg(),
3667                                         vir_uniform_ui(c, 0));
3668                 }
3669                 break;
3670 
3671         case nir_intrinsic_terminate_if:
3672                 c->emitted_discard = true;
3673                 FALLTHROUGH;
3674         case nir_intrinsic_demote_if: {
3675                 ntq_flush_tmu(c);
3676 
3677                 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]);
3678 
3679                 if (vir_in_nonuniform_control_flow(c)) {
3680                         struct qinst *exec_flag = vir_MOV_dest(c, vir_nop_reg(),
3681                                                                c->execute);
3682                         if (cond == V3D_QPU_COND_IFA) {
3683                                 vir_set_uf(c, exec_flag, V3D_QPU_UF_ANDZ);
3684                         } else {
3685                                 vir_set_uf(c, exec_flag, V3D_QPU_UF_NORNZ);
3686                                 cond = V3D_QPU_COND_IFA;
3687                         }
3688                 }
3689 
3690                 vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
3691                                              vir_uniform_ui(c, 0)), cond);
3692                 break;
3693         }
3694 
3695         case nir_intrinsic_barrier:
3696                 /* Ensure that the TMU operations before the barrier are flushed
3697                  * before the ones after the barrier.
3698                  */
3699                 ntq_flush_tmu(c);
3700 
3701                 if (nir_intrinsic_execution_scope(instr) != SCOPE_NONE) {
3702                         if (c->s->info.stage == MESA_SHADER_COMPUTE)
3703                                 emit_compute_barrier(c);
3704                         else
3705                                 emit_barrier(c);
3706 
3707                         /* The blocking of a TSY op only happens at the next
3708                          * thread switch. No texturing may be outstanding at the
3709                          * time of a TSY blocking operation.
3710                          */
3711                         vir_emit_thrsw(c);
3712                 }
3713                 break;
3714 
3715         case nir_intrinsic_load_num_workgroups:
3716                 for (int i = 0; i < 3; i++) {
3717                         ntq_store_def(c, &instr->def, i,
3718                                       vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS,
3719                                                   i));
3720                 }
3721                 break;
3722 
3723         case nir_intrinsic_load_workgroup_id: {
3724                 struct qreg x = vir_AND(c, c->cs_payload[0],
3725                                          vir_uniform_ui(c, 0xffff));
3726                 ntq_store_def(c, &instr->def, 0, x);
3727 
3728                 struct qreg y = vir_SHR(c, c->cs_payload[0],
3729                                          vir_uniform_ui(c, 16));
3730                 ntq_store_def(c, &instr->def, 1, y);
3731 
3732                 struct qreg z = vir_AND(c, c->cs_payload[1],
3733                                          vir_uniform_ui(c, 0xffff));
3734                 ntq_store_def(c, &instr->def, 2, z);
3735                 break;
3736         }
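        /* A sketch of the payload layout implied by the masks and shifts
         * above (illustrative values, not taken from the hardware docs):
         *
         *   cs_payload[0] = (workgroup_id.y << 16) | workgroup_id.x
         *   cs_payload[1] = (other payload bits << 16) | workgroup_id.z
         *
         * so a cs_payload[0] of 0x00050003 decodes to an XY id of (3, 5).
         */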
3737 
3738         case nir_intrinsic_load_base_workgroup_id: {
3739                 struct qreg x = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0);
3740                 ntq_store_def(c, &instr->def, 0, x);
3741 
3742                 struct qreg y = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1);
3743                 ntq_store_def(c, &instr->def, 1, y);
3744 
3745                 struct qreg z = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2);
3746                 ntq_store_def(c, &instr->def, 2, z);
3747                 break;
3748         }
3749 
3750         case nir_intrinsic_load_workgroup_size: {
3751                 struct qreg x = vir_uniform(c, QUNIFORM_WORK_GROUP_SIZE, 0);
3752                 ntq_store_def(c, &instr->def, 0, x);
3753 
3754                 struct qreg y = vir_uniform(c, QUNIFORM_WORK_GROUP_SIZE, 1);
3755                 ntq_store_def(c, &instr->def, 1, y);
3756 
3757                 struct qreg z = vir_uniform(c, QUNIFORM_WORK_GROUP_SIZE, 2);
3758                 ntq_store_def(c, &instr->def, 2, z);
3759                 break;
3760         }
3761 
3762         case nir_intrinsic_load_local_invocation_index:
3763                 ntq_store_def(c, &instr->def, 0,
3764                               emit_load_local_invocation_index(c));
3765                 break;
3766 
3767         case nir_intrinsic_load_subgroup_id: {
3768                 /* This is basically the batch index, which is the Local
3769                  * Invocation Index divided by the SIMD width.
3770                  */
3771                 STATIC_ASSERT(IS_POT(V3D_CHANNELS) && V3D_CHANNELS > 0);
3772                 const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1;
3773                 struct qreg lii = emit_load_local_invocation_index(c);
3774                 ntq_store_def(c, &instr->def, 0,
3775                               vir_SHR(c, lii,
3776                                       vir_uniform_ui(c, divide_shift)));
3777                 break;
3778         }
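        /* Worked example for the shift above, assuming V3D_CHANNELS == 16:
         * divide_shift = ffs(16) - 1 = 4, so a local invocation index of 35
         * lands in subgroup 35 >> 4 = 2.
         */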
3779 
3780         case nir_intrinsic_load_per_vertex_input: {
3781                 /* The vertex shader writes all its used outputs into
3782                  * consecutive VPM offsets, so if any output component is
3783                  * unused, its VPM offset is used by the next used
3784                  * component. This means that we can't assume that each
3785                  * location will use 4 consecutive scalar offsets in the VPM
3786                  * and we need to compute the VPM offset for each input by
3787                  * going through the inputs and finding the one that matches
3788                  * our location and component.
3789                  *
3790                  * col = vertex index, row = varying index
3791                  */
3792                 assert(nir_src_is_const(instr->src[1]));
3793                 uint32_t location =
3794                         nir_intrinsic_io_semantics(instr).location +
3795                         nir_src_as_uint(instr->src[1]);
3796                 uint32_t component = nir_intrinsic_component(instr);
3797 
3798                 int32_t row_idx = -1;
3799                 for (int i = 0; i < c->num_inputs; i++) {
3800                         struct v3d_varying_slot slot = c->input_slots[i];
3801                         if (v3d_slot_get_slot(slot) == location &&
3802                             v3d_slot_get_component(slot) == component) {
3803                                 row_idx = i;
3804                                 break;
3805                         }
3806                 }
3807 
3808                 assert(row_idx != -1);
3809 
3810                 struct qreg col = ntq_get_src(c, instr->src[0], 0);
3811                 for (int i = 0; i < instr->num_components; i++) {
3812                         struct qreg row = vir_uniform_ui(c, row_idx++);
3813                         ntq_store_def(c, &instr->def, i,
3814                                       vir_LDVPMG_IN(c, row, col));
3815                 }
3816                 break;
3817         }
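        /* Hypothetical example of the row lookup above: if VAR0.x, VAR0.y and
         * VAR1.x were the only slots recorded in c->input_slots[], a load of
         * VAR1.x would resolve to row_idx 2 rather than to the
         * 4-scalars-per-location offset of 4.
         */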
3818 
3819         case nir_intrinsic_emit_vertex:
3820         case nir_intrinsic_end_primitive:
3821                 unreachable("Should have been lowered in v3d_nir_lower_io");
3822                 break;
3823 
3824         case nir_intrinsic_load_primitive_id: {
3825                 /* gl_PrimitiveIdIn is written by the GBG in the first word of
3826                  * the VPM output header. According to the docs, we should read this
3827                  * using ldvpm(v,d)_in (See Table 71).
3828                  */
3829                 assert(c->s->info.stage == MESA_SHADER_GEOMETRY);
3830                 ntq_store_def(c, &instr->def, 0,
3831                               vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));
3832                 break;
3833         }
3834 
3835         case nir_intrinsic_load_invocation_id:
3836                 ntq_store_def(c, &instr->def, 0, vir_IID(c));
3837                 break;
3838 
3839         case nir_intrinsic_load_fb_layers_v3d:
3840                 ntq_store_def(c, &instr->def, 0,
3841                               vir_uniform(c, QUNIFORM_FB_LAYERS, 0));
3842                 break;
3843 
3844         case nir_intrinsic_load_sample_id:
3845                 ntq_store_def(c, &instr->def, 0, vir_SAMPID(c));
3846                 break;
3847 
3848         case nir_intrinsic_load_sample_pos:
3849                 ntq_store_def(c, &instr->def, 0,
3850                               vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))));
3851                 ntq_store_def(c, &instr->def, 1,
3852                               vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))));
3853                 break;
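        /* The subtractions above keep just the fractional part of the
         * per-sample coordinates, e.g. an FXCD of 12.75 in a pixel whose
         * XCD is 12 yields a gl_SamplePosition.x of 0.75, inside the
         * expected [0, 1) range.
         */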
3854 
3855         case nir_intrinsic_load_barycentric_at_offset:
3856                 ntq_store_def(c, &instr->def, 0,
3857                               vir_MOV(c, ntq_get_src(c, instr->src[0], 0)));
3858                 ntq_store_def(c, &instr->def, 1,
3859                               vir_MOV(c, ntq_get_src(c, instr->src[0], 1)));
3860                 break;
3861 
3862         case nir_intrinsic_load_barycentric_pixel:
3863                 ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f));
3864                 ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f));
3865                 break;
3866 
3867         case nir_intrinsic_load_barycentric_at_sample: {
3868                 if (!c->fs_key->msaa) {
3869                         ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f));
3870                         ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f));
3871                         return;
3872                 }
3873 
3874                 struct qreg offset_x, offset_y;
3875                 struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0);
3876                 ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
3877 
3878                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x));
3879                 ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y));
3880                 break;
3881         }
3882 
3883         case nir_intrinsic_load_barycentric_sample: {
3884                 struct qreg offset_x =
3885                         vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));
3886                 struct qreg offset_y =
3887                         vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
3888 
3889                 ntq_store_def(c, &instr->def, 0,
3890                              vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f)));
3891                 ntq_store_def(c, &instr->def, 1,
3892                               vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f)));
3893                 break;
3894         }
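        /* Same fractional computation as load_sample_pos above, but
         * re-centered around the pixel center: a sample at fraction 0.75
         * becomes a barycentric offset of +0.25 and one at 0.25 becomes
         * -0.25.
         */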
3895 
3896         case nir_intrinsic_load_barycentric_centroid: {
3897                 struct qreg offset_x, offset_y;
3898                 ntq_get_barycentric_centroid(c, &offset_x, &offset_y);
3899                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x));
3900                 ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y));
3901                 break;
3902         }
3903 
3904         case nir_intrinsic_load_interpolated_input: {
3905                 assert(nir_src_is_const(instr->src[1]));
3906                 const uint32_t offset = nir_src_as_uint(instr->src[1]);
3907 
3908                 for (int i = 0; i < instr->num_components; i++) {
3909                         const uint32_t input_idx =
3910                                 (nir_intrinsic_base(instr) + offset) * 4 +
3911                                 nir_intrinsic_component(instr) + i;
3912 
3913                         /* If we are not in MSAA or if we are not interpolating
3914                          * a user varying, just return the pre-computed
3915                          * interpolated input.
3916                          */
3917                         if (!c->fs_key->msaa ||
3918                             c->interp[input_idx].vp.file == QFILE_NULL) {
3919                                 ntq_store_def(c, &instr->def, i,
3920                                               vir_MOV(c, c->inputs[input_idx]));
3921                                 continue;
3922                         }
3923 
3924                         /* Otherwise compute interpolation at the specified
3925                          * offset.
3926                          */
3927                         struct qreg p = c->interp[input_idx].vp;
3928                         struct qreg C = c->interp[input_idx].C;
3929                         unsigned interp_mode =  c->interp[input_idx].mode;
3930 
3931                         struct qreg offset_x = ntq_get_src(c, instr->src[0], 0);
3932                         struct qreg offset_y = ntq_get_src(c, instr->src[0], 1);
3933 
3934                         struct qreg result =
3935                               ntq_emit_load_interpolated_input(c, p, C,
3936                                                                offset_x, offset_y,
3937                                                                interp_mode);
3938                         ntq_store_def(c, &instr->def, i, result);
3939                 }
3940                 break;
3941         }
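        /* Worked example of the input_idx computation above: with base 2, a
         * constant offset of 1 and component 1, scalar i of the load indexes
         * c->inputs[]/c->interp[] at (2 + 1) * 4 + 1 + i, i.e. entries
         * 13, 14, ... of the flat 4-scalars-per-location arrays.
         */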
3942 
3943         case nir_intrinsic_load_subgroup_size:
3944                 ntq_store_def(c, &instr->def, 0,
3945                               vir_uniform_ui(c, V3D_CHANNELS));
3946                 break;
3947 
3948         case nir_intrinsic_load_subgroup_invocation:
3949                 ntq_store_def(c, &instr->def, 0, vir_EIDX(c));
3950                 break;
3951 
3952         case nir_intrinsic_ddx:
3953         case nir_intrinsic_ddx_coarse:
3954         case nir_intrinsic_ddx_fine: {
3955                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3956                 ntq_store_def(c, &instr->def, 0, vir_FDX(c, value));
3957                 break;
3958         }
3959 
3960         case nir_intrinsic_ddy:
3961         case nir_intrinsic_ddy_coarse:
3962         case nir_intrinsic_ddy_fine: {
3963                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3964                 ntq_store_def(c, &instr->def, 0, vir_FDY(c, value));
3965                 break;
3966         }
3967 
3968         case nir_intrinsic_elect: {
3969                 struct qreg first;
3970                 if (vir_in_nonuniform_control_flow(c)) {
3971                         /* Sets A=1 for lanes enabled in the execution mask */
3972                         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3973                                    V3D_QPU_PF_PUSHZ);
3974                         /* Updates A ANDing with lanes enabled in MSF */
3975                         vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()),
3976                                    V3D_QPU_UF_ANDNZ);
3977                         first = vir_FLAFIRST(c);
3978                 } else {
3979                         /* Sets A=1 for inactive lanes */
3980                         vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()),
3981                                    V3D_QPU_PF_PUSHZ);
3982                         first = vir_FLNAFIRST(c);
3983                 }
3984 
3985                 /* Produce a boolean result */
3986                 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
3987                                            first, vir_uniform_ui(c, 1)),
3988                                            V3D_QPU_PF_PUSHZ);
3989                 struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
3990                 ntq_store_def(c, &instr->def, 0, result);
3991                 break;
3992         }
3993 
3994         case nir_intrinsic_ballot: {
3995                 assert(c->devinfo->ver >= 71);
3996                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3997                 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
3998                 struct qreg res = vir_get_temp(c);
3999                 vir_set_cond(vir_BALLOT_dest(c, res, value), cond);
4000                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
4001                 break;
4002         }
4003 
4004         case nir_intrinsic_read_invocation: {
4005                 assert(c->devinfo->ver >= 71);
4006                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
4007                 struct qreg index = ntq_get_src(c, instr->src[1], 0);
4008                 struct qreg res = vir_SHUFFLE(c, value, index);
4009                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
4010                 break;
4011         }
4012 
4013         case nir_intrinsic_read_first_invocation: {
4014                 assert(c->devinfo->ver >= 71);
4015                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
4016                 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
4017                 struct qreg res = vir_get_temp(c);
4018                 vir_set_cond(vir_BCASTF_dest(c, res, value), cond);
4019                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
4020                 break;
4021         }
4022 
4023         case nir_intrinsic_shuffle: {
4024                 assert(c->devinfo->ver >= 71);
4025                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
4026                 struct qreg indices = ntq_get_src(c, instr->src[1], 0);
4027                 struct qreg res = vir_SHUFFLE(c, value, indices);
4028                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
4029                 break;
4030         }
4031 
4032         case nir_intrinsic_vote_feq:
4033         case nir_intrinsic_vote_ieq: {
4034                 assert(c->devinfo->ver >= 71);
4035                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
4036                 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
4037                 struct qreg res = vir_get_temp(c);
4038                 vir_set_cond(instr->intrinsic == nir_intrinsic_vote_ieq ?
4039                              vir_ALLEQ_dest(c, res, value) :
4040                              vir_ALLFEQ_dest(c, res, value),
4041                              cond);
4042 
4043                 /* Produce boolean result */
4044                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
4045                            V3D_QPU_PF_PUSHZ);
4046                 struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
4047                 ntq_store_def(c, &instr->def, 0, result);
4048                 break;
4049         }
4050 
4051         case nir_intrinsic_vote_all: {
4052                 assert(c->devinfo->ver >= 71);
4053                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
4054                 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
4055                 struct qreg res = vir_get_temp(c);
4056                 vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
4057 
4058                 /* We want to check if 'all lanes are equal (alleq != 0) and
4059                  * their value is True (value != 0)'.
4060                  *
4061                  * The first MOV.pushz generates predicate for 'alleq == 0'.
4062                  * The second MOV.NORZ generates predicate for:
4063                  * '!(alleq == 0) & !(value == 0)'.
4064                  */
4065                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
4066                            V3D_QPU_PF_PUSHZ);
4067                 vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
4068                            V3D_QPU_UF_NORZ);
4069                 struct qreg result =
4070                         ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
4071                 ntq_store_def(c, &instr->def, 0, result);
4072                 break;
4073         }
4074 
4075         case nir_intrinsic_vote_any: {
4076                 assert(c->devinfo->ver >= 71);
4077                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
4078                 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
4079                 struct qreg res = vir_get_temp(c);
4080                 vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
4081 
4082                 /* We want to check 'not (all lanes are equal (alleq != 0)
4083                  * and their value is False (value == 0))'.
4084                  *
4085                  * The first MOV.pushz generates predicate for 'alleq == 0'.
4086                  * The second MOV.NORNZ generates predicate for:
4087                  * '!(alleq == 0) & (value == 0)'.
4088                  * The IFNA condition negates the predicate when evaluated:
4089                  * '!(!(alleq == 0) & (value == 0))'.
4090                  */
4091                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
4092                            V3D_QPU_PF_PUSHZ);
4093                 vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
4094                            V3D_QPU_UF_NORNZ);
4095                 struct qreg result =
4096                         ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
4097                 ntq_store_def(c, &instr->def, 0, result);
4098                 break;
4099         }
4100 
4101         case nir_intrinsic_load_num_subgroups:
4102                 unreachable("Should have been lowered");
4103                 break;
4104 
4105         case nir_intrinsic_load_view_index:
4106                 ntq_store_def(c, &instr->def, 0,
4107                               vir_uniform(c, QUNIFORM_VIEW_INDEX, 0));
4108                 break;
4109 
4110         default:
4111                 fprintf(stderr, "Unknown intrinsic: ");
4112                 nir_print_instr(&instr->instr, stderr);
4113                 fprintf(stderr, "\n");
4114                 abort();
4115         }
4116 }
4117 
4118 /* Clears (activates) the execute flags for any channels whose jump target
4119  * matches this block.
4120  *
4121  * XXX perf: Could we be using flpush/flpop somehow for our execution channel
4122  * enabling?
4123  *
4124  */
4125 static void
4126 ntq_activate_execute_for_block(struct v3d_compile *c)
4127 {
4128         vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
4129                                 c->execute, vir_uniform_ui(c, c->cur_block->index)),
4130                    V3D_QPU_PF_PUSHZ);
4131 
4132         vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
4133 }
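
/* A minimal sketch of how the execute mask is used here (values are made up):
 * if channel 0 jumped to block 7 and channel 1 is waiting for block 9, then
 * c->execute = {7, 9, ...}. On entering block 7 the XOR against the block
 * index pushes Z only for channel 0, and the conditional MOV sets its execute
 * value back to 0, i.e. "active", while channel 1 keeps waiting.
 */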
4134 
4135 static bool
4136 is_cheap_block(nir_block *block)
4137 {
4138         int32_t cost = 3;
4139         nir_foreach_instr(instr, block) {
4140                 switch (instr->type) {
4141                 case nir_instr_type_alu:
4142                 case nir_instr_type_undef:
4143                 case nir_instr_type_load_const:
4144                         if (--cost <= 0)
4145                                 return false;
4146                 break;
4147                 case nir_instr_type_intrinsic: {
4148                         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
4149                         switch (intr->intrinsic) {
4150                         case nir_intrinsic_decl_reg:
4151                         case nir_intrinsic_load_reg:
4152                         case nir_intrinsic_store_reg:
4153                                 continue;
4154                         default:
4155                                 return false;
4156                         }
4157                 }
4158                 default:
4159                         return false;
4160                 }
4161         }
4162         return true;
4163 }
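
/* Under the cost model above, a block counts as cheap only if it holds at
 * most two ALU/undef/load_const instructions and nothing else: decl_reg/
 * load_reg/store_reg intrinsics are free, and any other instruction type
 * (textures, memory access, jumps, ...) immediately disqualifies it.
 */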
4164 
4165 static void
4166 ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
4167 {
4168         nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
4169         bool empty_else_block =
4170                 (nir_else_block == nir_if_last_else_block(if_stmt) &&
4171                  exec_list_is_empty(&nir_else_block->instr_list));
4172 
4173         struct qblock *then_block = vir_new_block(c);
4174         struct qblock *after_block = vir_new_block(c);
4175         struct qblock *else_block;
4176         if (empty_else_block)
4177                 else_block = after_block;
4178         else
4179                 else_block = vir_new_block(c);
4180 
4181         /* Check if this if statement is really just a conditional jump with
4182          * the form:
4183          *
4184          * if (cond) {
4185          *    break/continue;
4186          * } else {
4187          * }
4188          *
4189          * In which case we can skip the jump to ELSE we emit before the THEN
4190          * block and instead just emit the break/continue directly.
4191          */
4192         nir_jump_instr *conditional_jump = NULL;
4193         if (empty_else_block) {
4194                 nir_block *nir_then_block = nir_if_first_then_block(if_stmt);
4195                 struct nir_instr *inst = nir_block_first_instr(nir_then_block);
4196                 if (inst && inst->type == nir_instr_type_jump)
4197                         conditional_jump = nir_instr_as_jump(inst);
4198         }
4199 
4200         /* Set up the flags for the IF condition (taking the THEN branch). */
4201         enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
4202 
4203         if (!conditional_jump) {
4204                 /* Jump to ELSE. */
4205                 struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
4206                            V3D_QPU_BRANCH_COND_ANYNA :
4207                            V3D_QPU_BRANCH_COND_ANYA);
4208                 /* Pixels that were not dispatched or have been discarded
4209                  * should not contribute to the ANYA/ANYNA condition.
4210                  */
4211                 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
4212 
4213                 vir_link_blocks(c->cur_block, else_block);
4214                 vir_link_blocks(c->cur_block, then_block);
4215 
4216                 /* Process the THEN block. */
4217                 vir_set_emit_block(c, then_block);
4218                 ntq_emit_cf_list(c, &if_stmt->then_list);
4219 
4220                 if (!empty_else_block) {
4221                         /* At the end of the THEN block, jump to ENDIF, unless
4222                          * the block ended in a break or continue.
4223                          */
4224                         if (!c->cur_block->branch_emitted) {
4225                                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4226                                 vir_link_blocks(c->cur_block, after_block);
4227                         }
4228 
4229                         /* Emit the else block. */
4230                         vir_set_emit_block(c, else_block);
4231                         ntq_emit_cf_list(c, &if_stmt->else_list);
4232                 }
4233         } else {
4234                 /* Emit the conditional jump directly.
4235                  *
4236                  * Use ALL with breaks and ANY with continues to ensure that
4237                  * we always break and never continue when all lanes have been
4238                  * disabled (for example because of discards) to prevent
4239                  * infinite loops.
4240                  */
4241                 assert(conditional_jump &&
4242                        (conditional_jump->type == nir_jump_continue ||
4243                         conditional_jump->type == nir_jump_break));
4244 
4245                 struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
4246                            (conditional_jump->type == nir_jump_break ?
4247                             V3D_QPU_BRANCH_COND_ALLA :
4248                             V3D_QPU_BRANCH_COND_ANYA) :
4249                            (conditional_jump->type == nir_jump_break ?
4250                             V3D_QPU_BRANCH_COND_ALLNA :
4251                             V3D_QPU_BRANCH_COND_ANYNA));
4252                 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
4253 
4254                 vir_link_blocks(c->cur_block,
4255                                 conditional_jump->type == nir_jump_break ?
4256                                         c->loop_break_block :
4257                                         c->loop_cont_block);
4258         }
4259 
4260         vir_link_blocks(c->cur_block, after_block);
4261 
4262         vir_set_emit_block(c, after_block);
4263 }
4264 
4265 static void
4266 ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
4267 {
4268         nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
4269         bool empty_else_block =
4270                 (nir_else_block == nir_if_last_else_block(if_stmt) &&
4271                  exec_list_is_empty(&nir_else_block->instr_list));
4272 
4273         struct qblock *then_block = vir_new_block(c);
4274         struct qblock *after_block = vir_new_block(c);
4275         struct qblock *else_block;
4276         if (empty_else_block)
4277                 else_block = after_block;
4278         else
4279                 else_block = vir_new_block(c);
4280 
4281         bool was_uniform_control_flow = false;
4282         if (!vir_in_nonuniform_control_flow(c)) {
4283                 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
4284                 was_uniform_control_flow = true;
4285         }
4286 
4287         /* Set up the flags for the IF condition (taking the THEN branch). */
4288         enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
4289 
4290         /* Update the flags+cond to mean "Taking the ELSE branch (!cond) and
4291          * was previously active (execute Z)" for updating the exec flags.
4292          */
4293         if (was_uniform_control_flow) {
4294                 cond = v3d_qpu_cond_invert(cond);
4295         } else {
4296                 struct qinst *inst = vir_MOV_dest(c, vir_nop_reg(), c->execute);
4297                 if (cond == V3D_QPU_COND_IFA) {
4298                         vir_set_uf(c, inst, V3D_QPU_UF_NORNZ);
4299                 } else {
4300                         vir_set_uf(c, inst, V3D_QPU_UF_ANDZ);
4301                         cond = V3D_QPU_COND_IFA;
4302                 }
4303         }
4304 
4305         vir_MOV_cond(c, cond,
4306                      c->execute,
4307                      vir_uniform_ui(c, else_block->index));
4308 
4309         /* Set the flags for taking the THEN block */
4310         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4311                    V3D_QPU_PF_PUSHZ);
4312 
4313         /* Jump to ELSE if nothing is active for THEN (unless THEN block is
4314          * so small it won't pay off), otherwise fall through.
4315          */
4316         bool is_cheap = exec_list_is_singular(&if_stmt->then_list) &&
4317                         is_cheap_block(nir_if_first_then_block(if_stmt));
4318         if (!is_cheap) {
4319                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
4320                 vir_link_blocks(c->cur_block, else_block);
4321         }
4322         vir_link_blocks(c->cur_block, then_block);
4323 
4324         /* Process the THEN block.
4325          *
4326          * Notice we don't call ntq_activate_execute_for_block here on purpose:
4327          * c->execute is already set up to be 0 for lanes that must take the
4328          * THEN block.
4329          */
4330         vir_set_emit_block(c, then_block);
4331         ntq_emit_cf_list(c, &if_stmt->then_list);
4332 
4333         if (!empty_else_block) {
4334                 /* Handle the end of the THEN block.  First, all currently
4335                  * active channels update their execute flags to point to
4336                  * ENDIF
4337                  */
4338                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4339                            V3D_QPU_PF_PUSHZ);
4340                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
4341                              vir_uniform_ui(c, after_block->index));
4342 
4343                 /* If everything points at ENDIF, then jump there immediately
4344                  * (unless ELSE block is so small it won't pay off).
4345                  */
4346                 bool is_cheap = exec_list_is_singular(&if_stmt->else_list) &&
4347                                 is_cheap_block(nir_else_block);
4348                 if (!is_cheap) {
4349                         vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
4350                                                    c->execute,
4351                                                    vir_uniform_ui(c, after_block->index)),
4352                                    V3D_QPU_PF_PUSHZ);
4353                         vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
4354                         vir_link_blocks(c->cur_block, after_block);
4355                 }
4356                 vir_link_blocks(c->cur_block, else_block);
4357 
4358                 vir_set_emit_block(c, else_block);
4359                 ntq_activate_execute_for_block(c);
4360                 ntq_emit_cf_list(c, &if_stmt->else_list);
4361         }
4362 
4363         vir_link_blocks(c->cur_block, after_block);
4364 
4365         vir_set_emit_block(c, after_block);
4366         if (was_uniform_control_flow)
4367                 c->execute = c->undef;
4368         else
4369                 ntq_activate_execute_for_block(c);
4370 }
4371 
4372 static void
4373 ntq_emit_if(struct v3d_compile *c, nir_if *nif)
4374 {
4375         bool was_in_control_flow = c->in_control_flow;
4376         c->in_control_flow = true;
4377         if (!vir_in_nonuniform_control_flow(c) &&
4378             !nir_src_is_divergent(&nif->condition)) {
4379                 ntq_emit_uniform_if(c, nif);
4380         } else {
4381                 ntq_emit_nonuniform_if(c, nif);
4382         }
4383         c->in_control_flow = was_in_control_flow;
4384 }
4385 
4386 static void
4387 ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump)
4388 {
4389         switch (jump->type) {
4390         case nir_jump_break:
4391                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4392                            V3D_QPU_PF_PUSHZ);
4393                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
4394                              vir_uniform_ui(c, c->loop_break_block->index));
4395                 break;
4396 
4397         case nir_jump_continue:
4398                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4399                            V3D_QPU_PF_PUSHZ);
4400                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
4401                              vir_uniform_ui(c, c->loop_cont_block->index));
4402                 break;
4403 
4404         case nir_jump_return:
4405                 unreachable("All returns should be lowered\n");
4406                 break;
4407 
4408         case nir_jump_halt:
4409         case nir_jump_goto:
4410         case nir_jump_goto_if:
4411                 unreachable("not supported\n");
4412                 break;
4413         }
4414 }
4415 
4416 static void
4417 ntq_emit_uniform_jump(struct v3d_compile *c, nir_jump_instr *jump)
4418 {
4419         switch (jump->type) {
4420         case nir_jump_break:
4421                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4422                 vir_link_blocks(c->cur_block, c->loop_break_block);
4423                 c->cur_block->branch_emitted = true;
4424                 break;
4425         case nir_jump_continue:
4426                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4427                 vir_link_blocks(c->cur_block, c->loop_cont_block);
4428                 c->cur_block->branch_emitted = true;
4429                 break;
4430 
4431         case nir_jump_return:
4432                 unreachable("All returns should be lowered\n");
4433                 break;
4434 
4435         case nir_jump_halt:
4436         case nir_jump_goto:
4437         case nir_jump_goto_if:
4438                 unreachable("not supported\n");
4439                 break;
4440         }
4441 }
4442 
4443 static void
4444 ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)
4445 {
4446         switch (instr->type) {
4447         case nir_instr_type_alu:
4448                 ntq_emit_alu(c, nir_instr_as_alu(instr));
4449                 break;
4450 
4451         case nir_instr_type_intrinsic:
4452                 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
4453                 break;
4454 
4455         case nir_instr_type_load_const:
4456                 ntq_emit_load_const(c, nir_instr_as_load_const(instr));
4457                 break;
4458 
4459         case nir_instr_type_undef:
4460                 unreachable("Should've been lowered by nir_lower_undef_to_zero");
4461                 break;
4462 
4463         case nir_instr_type_tex:
4464                 ntq_emit_tex(c, nir_instr_as_tex(instr));
4465                 break;
4466 
4467         case nir_instr_type_jump:
4468                 /* Always flush TMU before jumping to another block, for the
4469                  * same reasons as in ntq_emit_block.
4470                  */
4471                 ntq_flush_tmu(c);
4472                 if (vir_in_nonuniform_control_flow(c))
4473                         ntq_emit_jump(c, nir_instr_as_jump(instr));
4474                 else
4475                         ntq_emit_uniform_jump(c, nir_instr_as_jump(instr));
4476                 break;
4477 
4478         default:
4479                 fprintf(stderr, "Unknown NIR instr type: ");
4480                 nir_print_instr(instr, stderr);
4481                 fprintf(stderr, "\n");
4482                 abort();
4483         }
4484 }
4485 
4486 static void
4487 ntq_emit_block(struct v3d_compile *c, nir_block *block)
4488 {
4489         nir_foreach_instr(instr, block) {
4490                 ntq_emit_instr(c, instr);
4491         }
4492 
4493         /* Always process pending TMU operations in the same block they were
4494          * emitted: we can't emit TMU operations in a block and then emit a
4495          * thread switch and LDTMU/TMUWT for them in another block, possibly
4496          * under control flow.
4497          */
4498         ntq_flush_tmu(c);
4499 }
4500 
4501 static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
4502 
4503 static void
4504 ntq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop)
4505 {
4506         bool was_uniform_control_flow = false;
4507         if (!vir_in_nonuniform_control_flow(c)) {
4508                 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
4509                 was_uniform_control_flow = true;
4510         }
4511 
4512         c->loop_cont_block = vir_new_block(c);
4513         c->loop_break_block = vir_new_block(c);
4514 
4515         vir_link_blocks(c->cur_block, c->loop_cont_block);
4516         vir_set_emit_block(c, c->loop_cont_block);
4517         ntq_activate_execute_for_block(c);
4518 
4519         ntq_emit_cf_list(c, &loop->body);
4520 
4521         /* Re-enable any previous continues now, so our ANYA check below
4522          * works.
4523          *
4524          * XXX: Use the .ORZ flags update, instead.
4525          */
4526         vir_set_pf(c, vir_XOR_dest(c,
4527                                 vir_nop_reg(),
4528                                 c->execute,
4529                                 vir_uniform_ui(c, c->loop_cont_block->index)),
4530                    V3D_QPU_PF_PUSHZ);
4531         vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
4532 
4533         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
4534 
4535         struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA);
4536         /* Pixels that were not dispatched or have been discarded should not
4537          * contribute to looping again.
4538          */
4539         branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
4540         vir_link_blocks(c->cur_block, c->loop_cont_block);
4541         vir_link_blocks(c->cur_block, c->loop_break_block);
4542 
4543         vir_set_emit_block(c, c->loop_break_block);
4544         if (was_uniform_control_flow)
4545                 c->execute = c->undef;
4546         else
4547                 ntq_activate_execute_for_block(c);
4548 }
4549 
4550 static void
4551 ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop)
4552 {
4553         c->loop_cont_block = vir_new_block(c);
4554         c->loop_break_block = vir_new_block(c);
4555 
4556         vir_link_blocks(c->cur_block, c->loop_cont_block);
4557         vir_set_emit_block(c, c->loop_cont_block);
4558 
4559         ntq_emit_cf_list(c, &loop->body);
4560 
4561         if (!c->cur_block->branch_emitted) {
4562                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4563                 vir_link_blocks(c->cur_block, c->loop_cont_block);
4564         }
4565 
4566         vir_set_emit_block(c, c->loop_break_block);
4567 }
4568 
4569 static void
4570 ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
4571 {
4572         assert(!nir_loop_has_continue_construct(loop));
4573 
4574         /* Disable flags optimization for loop conditions. The problem here is
4575          * that we can have code like this:
4576          *
4577          *  // block_0
4578          *  vec1 32 con ssa_9 = ine32 ssa_8, ssa_2
4579          *  loop {
4580          *     // block_1
4581          *     if ssa_9 {
4582          *
4583          * In this example we emit flags to compute ssa_9 and the optimization
4584          * will skip regenerating them again for the loop condition in the
4585          * loop continue block (block_1). However, this is not safe after the
4586          * first iteration because the loop body can stomp the flags if it has
4587          * any conditionals.
4588          */
4589         c->flags_temp = -1;
4590 
4591         bool was_in_control_flow = c->in_control_flow;
4592         c->in_control_flow = true;
4593 
4594         struct qblock *save_loop_cont_block = c->loop_cont_block;
4595         struct qblock *save_loop_break_block = c->loop_break_block;
4596 
4597         if (vir_in_nonuniform_control_flow(c) || nir_loop_is_divergent(loop)) {
4598                 ntq_emit_nonuniform_loop(c, loop);
4599         } else {
4600                 ntq_emit_uniform_loop(c, loop);
4601         }
4602 
4603         c->loop_break_block = save_loop_break_block;
4604         c->loop_cont_block = save_loop_cont_block;
4605 
4606         c->loops++;
4607 
4608         c->in_control_flow = was_in_control_flow;
4609 }
4610 
4611 static void
4612 ntq_emit_function(struct v3d_compile *c, nir_function_impl *func)
4613 {
4614         fprintf(stderr, "FUNCTIONS not handled.\n");
4615         abort();
4616 }
4617 
4618 static void
4619 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list)
4620 {
4621         foreach_list_typed(nir_cf_node, node, node, list) {
4622                 switch (node->type) {
4623                 case nir_cf_node_block:
4624                         ntq_emit_block(c, nir_cf_node_as_block(node));
4625                         break;
4626 
4627                 case nir_cf_node_if:
4628                         ntq_emit_if(c, nir_cf_node_as_if(node));
4629                         break;
4630 
4631                 case nir_cf_node_loop:
4632                         ntq_emit_loop(c, nir_cf_node_as_loop(node));
4633                         break;
4634 
4635                 case nir_cf_node_function:
4636                         ntq_emit_function(c, nir_cf_node_as_function(node));
4637                         break;
4638 
4639                 default:
4640                         fprintf(stderr, "Unknown NIR node type\n");
4641                         abort();
4642                 }
4643         }
4644 }
4645 
4646 static void
4647 ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
4648 {
4649         ntq_setup_registers(c, impl);
4650         ntq_emit_cf_list(c, &impl->body);
4651 }
4652 
4653 static bool
4654 vir_inst_reads_reg(struct qinst *inst, struct qreg r)
4655 {
4656    for (int i = 0; i < vir_get_nsrc(inst); i++) {
4657            if (inst->src[i].file == r.file && inst->src[i].index == r.index)
4658                    return true;
4659    }
4660    return false;
4661 }
4662 
4663 static void
4664 sched_flags_in_block(struct v3d_compile *c, struct qblock *block)
4665 {
4666         struct qinst *flags_inst = NULL;
4667         list_for_each_entry_safe_rev(struct qinst, inst, &block->instructions, link) {
4668                 /* Check for cases that would prevent us from moving a flags
4669                  * instruction any earlier than this instruction:
4670                  *
4671                  * - The flags instruction reads the result of this instr.
4672                  * - The instruction reads or writes flags.
4673                  */
4674                 if (flags_inst) {
4675                         if (vir_inst_reads_reg(flags_inst, inst->dst) ||
4676                             v3d_qpu_writes_flags(&inst->qpu) ||
4677                             v3d_qpu_reads_flags(&inst->qpu)) {
4678                                 list_move_to(&flags_inst->link, &inst->link);
4679                                 flags_inst = NULL;
4680                         }
4681                 }
4682 
4683                 /* Skip if this instruction does more than just write flags */
4684                 if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
4685                     inst->dst.file != QFILE_NULL ||
4686                     !v3d_qpu_writes_flags(&inst->qpu)) {
4687                         continue;
4688                 }
4689 
4690                 /* If we already had a flags_inst we should've moved it after
4691                  * this instruction in the if (flags_inst) above.
4692                  */
4693                 assert(!flags_inst);
4694                 flags_inst = inst;
4695         }
4696 
4697         /* If we reached the beginning of the block and we still have a flags
4698          * instruction selected we can put it at the top of the block.
4699          */
4700         if (flags_inst) {
4701                 list_move_to(&flags_inst->link, &block->instructions);
4702                 flags_inst = NULL;
4703         }
4704 }
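
/* Illustrative before/after for the reverse walk above, in VIR-like
 * pseudo-code (not actual compiler output):
 *
 *     before:                after:
 *       t0 = add t1, t2        nop.pushz t3     <- flags-only, hoisted
 *       t4 = mul t0, t5        t0 = add t1, t2
 *       nop.pushz t3           t4 = mul t0, t5
 *       mov.ifa t6, t7         mov.ifa t6, t7
 *
 * The flags-only instruction keeps moving up until it would cross an
 * instruction whose result it reads or that itself reads or writes flags.
 */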
4705 
4706 /**
4707  * The purpose of this pass is to emit instructions that are only concerned
4708  * with producing flags as early as possible to hopefully reduce liveness
4709  * of their source arguments.
4710  */
4711 static void
4712 sched_flags(struct v3d_compile *c)
4713 {
4714         vir_for_each_block(block, c)
4715                 sched_flags_in_block(c, block);
4716 }
4717 
4718 static void
4719 nir_to_vir(struct v3d_compile *c)
4720 {
4721         switch (c->s->info.stage) {
4722         case MESA_SHADER_FRAGMENT:
4723                 c->start_msf = vir_MSF(c);
4724                 if (c->devinfo->ver < 71)
4725                         c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
4726                 else
4727                         c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3));
4728 
4729                 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
4730                 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
4731 
4732                 /* V3D 4.x can disable implicit varyings if they are not used */
4733                 c->fs_uses_primitive_id =
4734                         nir_find_variable_with_location(c->s, nir_var_shader_in,
4735                                                         VARYING_SLOT_PRIMITIVE_ID);
4736                 if (c->fs_uses_primitive_id && !c->fs_key->has_gs) {
4737                        c->primitive_id =
4738                                emit_fragment_varying(c, NULL, -1, 0, 0);
4739                 }
4740 
4741                 if (c->fs_key->is_points && program_reads_point_coord(c)) {
4742                         c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0);
4743                         c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0);
4744                         c->uses_implicit_point_line_varyings = true;
4745                 } else if (c->fs_key->is_lines &&
4746                            (BITSET_TEST(c->s->info.system_values_read,
4747                                         SYSTEM_VALUE_LINE_COORD))) {
4748                         c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0);
4749                         c->uses_implicit_point_line_varyings = true;
4750                 }
4751                 break;
4752         case MESA_SHADER_COMPUTE:
4753                 /* Set up the TSO for barriers, assuming we do some. */
4754                 if (c->devinfo->ver < 42) {
4755                         vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
4756                                                       V3D_QPU_WADDR_SYNC));
4757                 }
4758 
4759                 if (c->devinfo->ver == 42) {
4760                         c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
4761                         c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
4762                 } else if (c->devinfo->ver >= 71) {
4763                         c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3));
4764                         c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
4765                 }
4766 
4767                 /* Set up the division between gl_LocalInvocationIndex and
4768                  * wg_in_mem in the payload reg.
4769                  */
4770                 int wg_size = (c->s->info.workgroup_size[0] *
4771                                c->s->info.workgroup_size[1] *
4772                                c->s->info.workgroup_size[2]);
4773                 c->local_invocation_index_bits =
4774                         ffs(util_next_power_of_two(MAX2(wg_size, 64))) - 1;
4775                 assert(c->local_invocation_index_bits <= 8);
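                /* Worked example for the computation above: a 12x8x1
                 * workgroup gives wg_size = 96, MAX2(96, 64) = 96,
                 * util_next_power_of_two(96) = 128 and ffs(128) - 1 = 7, so
                 * 7 bits of the payload field are reserved for the local
                 * invocation index and the remaining bits for wg_in_mem.
                 */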
4776 
4777                 if (c->s->info.shared_size || c->s->info.cs.has_variable_shared_mem) {
4778                         struct qreg wg_in_mem = vir_SHR(c, c->cs_payload[1],
4779                                                         vir_uniform_ui(c, 16));
4780                         if (c->s->info.workgroup_size[0] != 1 ||
4781                             c->s->info.workgroup_size[1] != 1 ||
4782                             c->s->info.workgroup_size[2] != 1) {
4783                                 int wg_bits = (16 -
4784                                                c->local_invocation_index_bits);
4785                                 int wg_mask = (1 << wg_bits) - 1;
4786                                 wg_in_mem = vir_AND(c, wg_in_mem,
4787                                                     vir_uniform_ui(c, wg_mask));
4788                         }
4789 
4790                         struct qreg shared_per_wg;
4791                         if (c->s->info.cs.has_variable_shared_mem) {
4792                                 shared_per_wg = vir_uniform(c, QUNIFORM_SHARED_SIZE, 0);
4793                         } else {
4794                                 shared_per_wg = vir_uniform_ui(c, c->s->info.shared_size);
4795                         }
4796 
4797                         c->cs_shared_offset =
4798                                 vir_ADD(c,
4799                                         vir_uniform(c, QUNIFORM_SHARED_OFFSET,0),
4800                                         vir_UMUL(c, wg_in_mem, shared_per_wg));
4801                 }
4802                 break;
4803         default:
4804                 break;
4805         }
4806 
4807         if (c->s->scratch_size) {
4808                 v3d_setup_spill_base(c);
4809                 c->spill_size += V3D_CHANNELS * c->s->scratch_size;
4810         }
4811 
4812         switch (c->s->info.stage) {
4813         case MESA_SHADER_VERTEX:
4814                 ntq_setup_vs_inputs(c);
4815                 break;
4816         case MESA_SHADER_GEOMETRY:
4817                 ntq_setup_gs_inputs(c);
4818                 break;
4819         case MESA_SHADER_FRAGMENT:
4820                 ntq_setup_fs_inputs(c);
4821                 break;
4822         case MESA_SHADER_COMPUTE:
4823                 break;
4824         default:
4825                 unreachable("unsupported shader stage");
4826         }
4827 
4828         ntq_setup_outputs(c);
4829 
4830         /* Find the main function and emit the body. */
4831         nir_foreach_function(function, c->s) {
4832                 assert(function->is_entrypoint);
4833                 assert(function->impl);
4834                 ntq_emit_impl(c, function->impl);
4835         }
4836 }
4837 
4838 /**
4839  * When demoting a shader down to single-threaded, removes the THRSW
4840  * instructions (one will still be inserted at v3d_vir_to_qpu() for the
4841  * program end).
4842  */
4843 static void
4844 vir_remove_thrsw(struct v3d_compile *c)
4845 {
4846         vir_for_each_block(block, c) {
4847                 vir_for_each_inst_safe(inst, block) {
4848                         if (inst->qpu.sig.thrsw)
4849                                 vir_remove_instruction(c, inst);
4850                 }
4851         }
4852 
4853         c->last_thrsw = NULL;
4854 }
4855 
4856 /**
4857  * This makes sure we have a top-level last thread switch which signals the
4858  * start of the last thread section, which may include adding a new thrsw
4859  * instruction if needed. We don't allow spilling in the last thread section, so
4860  * if we need to do any spills that inject additional thread switches later on,
4861  * we ensure this thread switch will still be the last thread switch in the
4862  * program, which makes last thread switch signalling a lot easier when we have
4863  * spilling. If in the end we don't need to spill to compile the program and we
4864  * injected a new thread switch instruction here only for that, we will
4865  * eventually restore the previous last thread switch and remove the one we
4866  * added here.
4867  */
4868 static void
4869 vir_emit_last_thrsw(struct v3d_compile *c,
4870                     struct qinst **restore_last_thrsw,
4871                     bool *restore_scoreboard_lock)
4872 {
4873         *restore_last_thrsw = c->last_thrsw;
4874 
4875         /* If we're threaded and the last THRSW was in conditional code, then
4876          * we need to emit another one so that we can flag it as the last
4877          * thrsw.
4878          */
4879         if (c->last_thrsw && !c->last_thrsw_at_top_level)
4880                 vir_emit_thrsw(c);
4881 
4882         /* If we're threaded, then we need to mark the last THRSW instruction
4883          * so we can emit a pair of them at QPU emit time.
4884          *
4885          * For V3D 4.x, we can spawn the non-fragment shaders already in the
4886          * post-last-THRSW state, so we can skip this.
4887          */
4888         if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT)
4889                 vir_emit_thrsw(c);
4890 
4891         /* If we have not inserted a last thread switch yet, do it now to ensure
4892          * any potential spilling we do happens before this. If we don't spill
4893          * in the end, we will restore the previous one.
4894          */
4895         if (*restore_last_thrsw == c->last_thrsw) {
4896                 if (*restore_last_thrsw)
4897                         (*restore_last_thrsw)->is_last_thrsw = false;
4898                 *restore_scoreboard_lock = c->lock_scoreboard_on_first_thrsw;
4899                 vir_emit_thrsw(c);
4900         } else {
4901                 *restore_last_thrsw = c->last_thrsw;
4902         }
4903 
4904         assert(c->last_thrsw);
4905         c->last_thrsw->is_last_thrsw = true;
4906 }
4907 
4908 static void
4909 vir_restore_last_thrsw(struct v3d_compile *c,
4910                        struct qinst *thrsw,
4911                        bool scoreboard_lock)
4912 {
4913         assert(c->last_thrsw);
4914         vir_remove_instruction(c, c->last_thrsw);
4915         c->last_thrsw = thrsw;
4916         if (c->last_thrsw)
4917                 c->last_thrsw->is_last_thrsw = true;
4918         c->lock_scoreboard_on_first_thrsw = scoreboard_lock;
4919 }
4920 
4921 /* There's a flag in the shader for "center W is needed for reasons other than
4922  * non-centroid varyings", so we just walk the program after VIR optimization
4923  * to see if it's used.  It should be harmless to set even if we only use
4924  * center W for varyings.
4925  */
4926 static void
vir_check_payload_w(struct v3d_compile * c)4927 vir_check_payload_w(struct v3d_compile *c)
4928 {
4929         if (c->s->info.stage != MESA_SHADER_FRAGMENT)
4930                 return;
4931 
4932         vir_for_each_inst_inorder(inst, c) {
4933                 for (int i = 0; i < vir_get_nsrc(inst); i++) {
4934                         if (inst->src[i].file == c->payload_w.file &&
4935                             inst->src[i].index == c->payload_w.index) {
4936                                 c->uses_center_w = true;
4937                                 return;
4938                         }
4939                 }
4940         }
4941 }
4942 
4943 void
v3d_nir_to_vir(struct v3d_compile * c)4944 v3d_nir_to_vir(struct v3d_compile *c)
4945 {
4946         if (V3D_DBG(NIR) ||
4947             v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
4948                 fprintf(stderr, "%s prog %d/%d NIR:\n",
4949                         vir_get_stage_name(c),
4950                         c->program_id, c->variant_id);
4951                 nir_print_shader(c->s, stderr);
4952         }
4953 
4954         nir_to_vir(c);
4955 
4956         bool restore_scoreboard_lock = false;
4957         struct qinst *restore_last_thrsw;
4958 
4959         /* Emit the last THRSW before STVPM and TLB writes. */
4960         vir_emit_last_thrsw(c,
4961                             &restore_last_thrsw,
4962                             &restore_scoreboard_lock);
4963 
4964 
4965         switch (c->s->info.stage) {
4966         case MESA_SHADER_FRAGMENT:
4967                 emit_frag_end(c);
4968                 break;
4969         case MESA_SHADER_GEOMETRY:
4970                 emit_geom_end(c);
4971                 break;
4972         case MESA_SHADER_VERTEX:
4973                 emit_vert_end(c);
4974                 break;
4975         case MESA_SHADER_COMPUTE:
4976                 break;
4977         default:
4978                 unreachable("bad stage");
4979         }
4980 
4981         if (V3D_DBG(VIR) ||
4982             v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
4983                 fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
4984                         vir_get_stage_name(c),
4985                         c->program_id, c->variant_id);
4986                 vir_dump(c);
4987                 fprintf(stderr, "\n");
4988         }
4989 
4990         vir_optimize(c);
4991         sched_flags(c);
4992 
4993         vir_check_payload_w(c);
4994 
        /* XXX perf: On VC4, we do VIR-level instruction scheduling here.
         * We used that on that platform to pipeline TMU writes and reduce the
         * number of thread switches, as well as try (mostly successfully) to
         * reduce maximum register pressure to allow more threads.  We should
         * do something of that sort for V3D -- either instruction scheduling
         * here, or delay the THRSW and LDTMUs from our texture
         * instructions until the results are needed.
         */

        if (V3D_DBG(VIR) ||
            v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
                fprintf(stderr, "%s prog %d/%d VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        /* Attempt to allocate registers for the temporaries.  If we fail,
         * reduce thread count and try again.
         */
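        /* The floor of two threads presumably reflects that V3D 4.x and
         * later do not run shaders single-threaded.
         */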
        int min_threads = 2;
        struct qpu_reg *temp_registers;
        while (true) {
                temp_registers = v3d_register_allocate(c);
                if (temp_registers) {
                        assert(c->spills + c->fills <= c->max_tmu_spills);
                        break;
                }

                if (c->threads == min_threads &&
                    V3D_DBG(RA)) {
                        fprintf(stderr,
                                "Failed to register allocate using %s\n",
                                c->fallback_scheduler ? "the fallback scheduler" :
                                "the normal scheduler");

                        vir_dump(c);

                        char *shaderdb;
                        int ret = v3d_shaderdb_dump(c, &shaderdb);
                        if (ret > 0) {
                                fprintf(stderr, "%s\n", shaderdb);
                                free(shaderdb);
                        }
                }

                if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) {
                        if (V3D_DBG(PERF)) {
                                fprintf(stderr,
                                        "Failed to register allocate %s "
                                        "prog %d/%d at %d threads.\n",
                                        vir_get_stage_name(c),
                                        c->program_id, c->variant_id, c->threads);
                        }
                        c->compilation_result =
                                V3D_COMPILATION_FAILED_REGISTER_ALLOCATION;
                        return;
                }

                c->spills = 0;
                c->fills = 0;
                c->threads /= 2;

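                /* A single-threaded program can't contain thread switches,
                 * so strip the ones we emitted before retrying register
                 * allocation.
                 */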
                if (c->threads == 1)
                        vir_remove_thrsw(c);
        }

        /* If we didn't spill, then remove the last thread switch we injected
         * artificially (if any) and restore the previous one.
         */
        if (!c->spills && c->last_thrsw != restore_last_thrsw)
                vir_restore_last_thrsw(c, restore_last_thrsw, restore_scoreboard_lock);

        if (c->spills &&
            (V3D_DBG(VIR) ||
             v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d spilled VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        v3d_vir_to_qpu(c, temp_registers);
}