/*
 * Copyright © 2016 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>
#include "util/format/u_format.h"
#include "util/u_helpers.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/ralloc.h"
#include "util/hash_table.h"
#include "compiler/nir/nir.h"
#include "compiler/nir/nir_builder.h"
#include "common/v3d_device_info.h"
#include "v3d_compiler.h"

/* We don't do any address packing. */
#define __gen_user_data void
#define __gen_address_type uint32_t
#define __gen_address_offset(reloc) (*reloc)
#define __gen_emit_reloc(cl, reloc)
#include "cle/v3d_packet_v42_pack.h"

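/* Fields of the TMU general lookup/config byte: bit 7 selects per-quad vs.
 * per-pixel lookups, and bits 0-2 encode the returned data type.
 */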
#define GENERAL_TMU_LOOKUP_PER_QUAD                 (0 << 7)
#define GENERAL_TMU_LOOKUP_PER_PIXEL                (1 << 7)
#define GENERAL_TMU_LOOKUP_TYPE_8BIT_I              (0 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_16BIT_I             (1 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_VEC2                (2 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_VEC3                (3 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_VEC4                (4 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_8BIT_UI             (5 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI            (6 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI            (7 << 0)

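/* Operation codes for the TSY (thread synchronization) unit, used to
 * implement barriers and semaphores.
 */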
#define V3D_TSY_SET_QUORUM          0
#define V3D_TSY_INC_WAITERS         1
#define V3D_TSY_DEC_WAITERS         2
#define V3D_TSY_INC_QUORUM          3
#define V3D_TSY_DEC_QUORUM          4
#define V3D_TSY_FREE_ALL            5
#define V3D_TSY_RELEASE             6
#define V3D_TSY_ACQUIRE             7
#define V3D_TSY_WAIT                8
#define V3D_TSY_WAIT_INC            9
#define V3D_TSY_WAIT_CHECK          10
#define V3D_TSY_WAIT_INC_CHECK      11
#define V3D_TSY_WAIT_CV             12
#define V3D_TSY_INC_SEMAPHORE       13
#define V3D_TSY_DEC_SEMAPHORE       14
#define V3D_TSY_SET_QUORUM_FREE_ALL 15

enum v3d_tmu_op_type
{
        V3D_TMU_OP_TYPE_REGULAR,
        V3D_TMU_OP_TYPE_ATOMIC,
        V3D_TMU_OP_TYPE_CACHE
};

static enum v3d_tmu_op_type
v3d_tmu_get_type_from_op(uint32_t tmu_op, bool is_write)
{
        switch(tmu_op) {
        case V3D_TMU_OP_WRITE_ADD_READ_PREFETCH:
        case V3D_TMU_OP_WRITE_SUB_READ_CLEAR:
        case V3D_TMU_OP_WRITE_XCHG_READ_FLUSH:
        case V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH:
        case V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR:
                return is_write ? V3D_TMU_OP_TYPE_ATOMIC : V3D_TMU_OP_TYPE_CACHE;
        case V3D_TMU_OP_WRITE_UMAX:
        case V3D_TMU_OP_WRITE_SMIN:
        case V3D_TMU_OP_WRITE_SMAX:
                assert(is_write);
                FALLTHROUGH;
        case V3D_TMU_OP_WRITE_AND_READ_INC:
        case V3D_TMU_OP_WRITE_OR_READ_DEC:
        case V3D_TMU_OP_WRITE_XOR_READ_NOT:
                return V3D_TMU_OP_TYPE_ATOMIC;
        case V3D_TMU_OP_REGULAR:
                return V3D_TMU_OP_TYPE_REGULAR;

        default:
                unreachable("Unknown tmu_op\n");
        }
}

static void
ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);

static void
resize_qreg_array(struct v3d_compile *c,
                  struct qreg **regs,
                  uint32_t *size,
                  uint32_t decl_size)
{
        if (*size >= decl_size)
                return;

        uint32_t old_size = *size;
        *size = MAX2(*size * 2, decl_size);
        *regs = reralloc(c, *regs, struct qreg, *size);
        if (!*regs) {
                fprintf(stderr, "Malloc failure\n");
                abort();
        }

        for (uint32_t i = old_size; i < *size; i++)
                (*regs)[i] = c->undef;
}

static void
resize_interp_array(struct v3d_compile *c,
                    struct v3d_interp_input **regs,
                    uint32_t *size,
                    uint32_t decl_size)
{
        if (*size >= decl_size)
                return;

        uint32_t old_size = *size;
        *size = MAX2(*size * 2, decl_size);
        *regs = reralloc(c, *regs, struct v3d_interp_input, *size);
        if (!*regs) {
                fprintf(stderr, "Malloc failure\n");
                abort();
        }

        for (uint32_t i = old_size; i < *size; i++) {
                (*regs)[i].vp = c->undef;
                (*regs)[i].C = c->undef;
        }
}

void
vir_emit_thrsw(struct v3d_compile *c)
{
        if (c->threads == 1)
                return;

        /* Always thread switch after each texture operation for now.
         *
         * We could do better by batching a bunch of texture fetches up and
         * then doing one thread switch and collecting all their results
         * afterward.
         */
        c->last_thrsw = vir_NOP(c);
        c->last_thrsw->qpu.sig.thrsw = true;
        c->last_thrsw_at_top_level = !c->in_control_flow;

        /* We need to lock the scoreboard before any tlb access happens. If this
         * thread switch comes after we have emitted a tlb load, then it means
         * that we can't lock on the last thread switch any more.
         */
        if (c->emitted_tlb_load)
                c->lock_scoreboard_on_first_thrsw = true;
}

uint32_t
v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
{
        if (nir_src_is_const(instr->src[src])) {
                int64_t add_val = nir_src_as_int(instr->src[src]);
                if (add_val == 1)
                        return V3D_TMU_OP_WRITE_AND_READ_INC;
                else if (add_val == -1)
                        return V3D_TMU_OP_WRITE_OR_READ_DEC;
        }

        return V3D_TMU_OP_WRITE_ADD_READ_PREFETCH;
}

static uint32_t
v3d_general_tmu_op_for_atomic(nir_intrinsic_instr *instr)
{
        nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr);
        switch (atomic_op) {
        case nir_atomic_op_iadd:
                return instr->intrinsic == nir_intrinsic_ssbo_atomic ?
                        v3d_get_op_for_atomic_add(instr, 2) :
                        v3d_get_op_for_atomic_add(instr, 1);
        case nir_atomic_op_imin:    return V3D_TMU_OP_WRITE_SMIN;
        case nir_atomic_op_umin:    return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
        case nir_atomic_op_imax:    return V3D_TMU_OP_WRITE_SMAX;
        case nir_atomic_op_umax:    return V3D_TMU_OP_WRITE_UMAX;
        case nir_atomic_op_iand:    return V3D_TMU_OP_WRITE_AND_READ_INC;
        case nir_atomic_op_ior:     return V3D_TMU_OP_WRITE_OR_READ_DEC;
        case nir_atomic_op_ixor:    return V3D_TMU_OP_WRITE_XOR_READ_NOT;
        case nir_atomic_op_xchg:    return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
        case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
        default:                    unreachable("unknown atomic op");
        }
}

static uint32_t
v3d_general_tmu_op(nir_intrinsic_instr *instr)
{
        switch (instr->intrinsic) {
        case nir_intrinsic_load_ssbo:
        case nir_intrinsic_load_ubo:
        case nir_intrinsic_load_uniform:
        case nir_intrinsic_load_shared:
        case nir_intrinsic_load_scratch:
        case nir_intrinsic_load_global_2x32:
        case nir_intrinsic_store_ssbo:
        case nir_intrinsic_store_shared:
        case nir_intrinsic_store_scratch:
        case nir_intrinsic_store_global_2x32:
                return V3D_TMU_OP_REGULAR;

        case nir_intrinsic_ssbo_atomic:
        case nir_intrinsic_ssbo_atomic_swap:
        case nir_intrinsic_shared_atomic:
        case nir_intrinsic_shared_atomic_swap:
        case nir_intrinsic_global_atomic_2x32:
        case nir_intrinsic_global_atomic_swap_2x32:
                return v3d_general_tmu_op_for_atomic(instr);

        default:
                unreachable("unknown intrinsic op");
        }
}

/**
 * Checks if pipelining a new TMU operation requiring 'components' LDTMUs
 * would overflow the Output TMU fifo.
 *
 * Overflowing the Output fifo is not allowed; however, we can overflow the
 * Input and Config fifos. Doing that makes the shader stall, but only for as
 * long as it needs to be able to continue, so for pipelining it is better to
 * let the QPU stall on these if needed than to try to emit TMU flushes in the
 * driver.
 */
bool
ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components)
{
        if (c->tmu.flush_count >= MAX_TMU_QUEUE_SIZE)
                return true;

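        /* The Output fifo has 16 entries that are shared between all the
         * threads running on the QPU, so each thread gets 16 / c->threads of
         * them (see also the matching thread count adjustment in
         * ntq_emit_tmu_general()).
         */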
        return components > 0 &&
               c->tmu.output_fifo_size + components > 16 / c->threads;
}

/**
 * Emits the thread switch and LDTMU/TMUWT for all outstanding TMU operations,
 * popping all TMU fifo entries.
 */
void
ntq_flush_tmu(struct v3d_compile *c)
{
        if (c->tmu.flush_count == 0)
                return;

        vir_emit_thrsw(c);

        bool emitted_tmuwt = false;
        for (int i = 0; i < c->tmu.flush_count; i++) {
                if (c->tmu.flush[i].component_mask > 0) {
                        nir_def *def = c->tmu.flush[i].def;
                        assert(def);

                        for (int j = 0; j < 4; j++) {
                                if (c->tmu.flush[i].component_mask & (1 << j)) {
                                        ntq_store_def(c, def, j,
                                                      vir_MOV(c, vir_LDTMU(c)));
                                }
                        }
                } else if (!emitted_tmuwt) {
                        vir_TMUWT(c);
                        emitted_tmuwt = true;
                }
        }

        c->tmu.output_fifo_size = 0;
        c->tmu.flush_count = 0;
        _mesa_set_clear(c->tmu.outstanding_regs, NULL);
}

/**
 * Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. The caller
 * is responsible for ensuring that doing this doesn't overflow the TMU fifos,
 * and more specifically, the output fifo, since that can't stall.
 */
void
ntq_add_pending_tmu_flush(struct v3d_compile *c,
                          nir_def *def,
                          uint32_t component_mask)
{
        const uint32_t num_components = util_bitcount(component_mask);
        assert(!ntq_tmu_fifo_overflow(c, num_components));

        if (num_components > 0) {
                c->tmu.output_fifo_size += num_components;

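                /* If the result goes to a NIR register rather than an SSA
                 * def, remember the register so that a later read of it knows
                 * to flush the outstanding TMU operations first (see
                 * ntq_get_src()).
                 */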
                nir_intrinsic_instr *store = nir_store_reg_for_def(def);
                if (store != NULL) {
                        nir_def *reg = store->src[1].ssa;
                        _mesa_set_add(c->tmu.outstanding_regs, reg);
                }
        }

        c->tmu.flush[c->tmu.flush_count].def = def;
        c->tmu.flush[c->tmu.flush_count].component_mask = component_mask;
        c->tmu.flush_count++;
        c->tmu.total_count++;

        if (c->disable_tmu_pipelining)
                ntq_flush_tmu(c);
        else if (c->tmu.flush_count > 1)
                c->pipelined_any_tmu = true;
}

enum emit_mode {
    MODE_COUNT = 0,
    MODE_EMIT,
    MODE_LAST,
};

/**
 * For a TMU general store instruction:
 *
 * In MODE_COUNT mode, records the number of TMU writes required and flushes
 * any outstanding TMU operations the instruction depends on, but it doesn't
 * emit any actual register writes.
 *
 * In MODE_EMIT mode, emits the data register writes required by the
 * instruction.
 */
static void
emit_tmu_general_store_writes(struct v3d_compile *c,
                              enum emit_mode mode,
                              nir_intrinsic_instr *instr,
                              uint32_t base_const_offset,
                              uint32_t *writemask,
                              uint32_t *const_offset,
                              uint32_t *type_size,
                              uint32_t *tmu_writes)
{
        struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);

        /* Find the first set of consecutive components that
         * are enabled in the writemask and emit the TMUD
         * instructions for them.
         */
        assert(*writemask != 0);
        uint32_t first_component = ffs(*writemask) - 1;
        uint32_t last_component = first_component;
        while (*writemask & BITFIELD_BIT(last_component + 1))
                last_component++;

        assert(first_component <= last_component &&
               last_component < instr->num_components);

        for (int i = first_component; i <= last_component; i++) {
                struct qreg data = ntq_get_src(c, instr->src[0], i);
                if (mode == MODE_COUNT)
                        (*tmu_writes)++;
                else
                        vir_MOV_dest(c, tmud, data);
        }

        if (mode == MODE_EMIT) {
                /* Update the offset for the TMU write based on the
                 * first component we are writing.
                 */
                *type_size = nir_src_bit_size(instr->src[0]) / 8;
                *const_offset =
                        base_const_offset + first_component * (*type_size);

                /* Clear these components from the writemask */
                uint32_t written_mask =
                        BITFIELD_RANGE(first_component, *tmu_writes);
                (*writemask) &= ~written_mask;
        }
}

/**
 * For a TMU general atomic instruction:
 *
 * In MODE_COUNT mode, records the number of TMU writes required and flushes
 * any outstanding TMU operations the instruction depends on, but it doesn't
 * emit any actual register writes.
 *
 * In MODE_EMIT mode, emits the data register writes required by the
 * instruction.
 */
static void
emit_tmu_general_atomic_writes(struct v3d_compile *c,
                               enum emit_mode mode,
                               nir_intrinsic_instr *instr,
                               uint32_t tmu_op,
                               bool has_index,
                               uint32_t *tmu_writes)
{
        struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);

        struct qreg data = ntq_get_src(c, instr->src[1 + has_index], 0);
        if (mode == MODE_COUNT)
                (*tmu_writes)++;
        else
                vir_MOV_dest(c, tmud, data);

        if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
                data = ntq_get_src(c, instr->src[2 + has_index], 0);
                if (mode == MODE_COUNT)
                        (*tmu_writes)++;
                else
                        vir_MOV_dest(c, tmud, data);
        }
}

/**
 * For any TMU general instruction:
 *
 * In MODE_COUNT mode, records the number of TMU writes required to emit the
 * address parameter and flushes any outstanding TMU operations the instruction
 * depends on, but it doesn't emit any actual register writes.
 *
 * In MODE_EMIT mode, emits register writes required to emit the address.
 */
static void
emit_tmu_general_address_write(struct v3d_compile *c,
                               enum emit_mode mode,
                               nir_intrinsic_instr *instr,
                               uint32_t config,
                               bool dynamic_src,
                               int offset_src,
                               struct qreg base_offset,
                               uint32_t const_offset,
                               uint32_t dest_components,
                               uint32_t *tmu_writes)
{
        if (mode == MODE_COUNT) {
                (*tmu_writes)++;
                if (dynamic_src)
                        ntq_get_src(c, instr->src[offset_src], 0);
                return;
        }

        if (vir_in_nonuniform_control_flow(c)) {
                vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
                           V3D_QPU_PF_PUSHZ);
        }

        struct qreg tmua;
        if (config == ~0)
                tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
        else
                tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);

        struct qinst *tmu;
        if (dynamic_src) {
                struct qreg offset = base_offset;
                if (const_offset != 0) {
                        offset = vir_ADD(c, offset,
                                         vir_uniform_ui(c, const_offset));
                }
                struct qreg data = ntq_get_src(c, instr->src[offset_src], 0);
                tmu = vir_ADD_dest(c, tmua, offset, data);
        } else {
                if (const_offset != 0) {
                        tmu = vir_ADD_dest(c, tmua, base_offset,
                                           vir_uniform_ui(c, const_offset));
                } else {
                        tmu = vir_MOV_dest(c, tmua, base_offset);
                }
        }

        if (config != ~0) {
                tmu->uniform =
                        vir_get_uniform_index(c, QUNIFORM_CONSTANT, config);
        }

        if (vir_in_nonuniform_control_flow(c))
                vir_set_cond(tmu, V3D_QPU_COND_IFA);

        tmu->ldtmu_count = dest_components;
}

/**
 * Implements indirect uniform loads and SSBO accesses through the TMU general
 * memory access interface.
 */
static void
ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                     bool is_shared_or_scratch, bool is_global)
{
        uint32_t tmu_op = v3d_general_tmu_op(instr);

        /* If we were able to replace the atomic_add with an inc/dec, then we
         * can (and need to) do things slightly differently, like not loading
         * the amount to add/sub, as that is implicit.
         */
        bool atomic_add_replaced =
                (instr->intrinsic == nir_intrinsic_ssbo_atomic ||
                 instr->intrinsic == nir_intrinsic_shared_atomic ||
                 instr->intrinsic == nir_intrinsic_global_atomic_2x32) &&
                nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd &&
                (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
                 tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC);

        bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
                         instr->intrinsic == nir_intrinsic_store_scratch ||
                         instr->intrinsic == nir_intrinsic_store_shared ||
                         instr->intrinsic == nir_intrinsic_store_global_2x32);

        bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
                        instr->intrinsic == nir_intrinsic_load_ubo ||
                        instr->intrinsic == nir_intrinsic_load_ssbo ||
                        instr->intrinsic == nir_intrinsic_load_scratch ||
                        instr->intrinsic == nir_intrinsic_load_shared ||
                        instr->intrinsic == nir_intrinsic_load_global_2x32);

        if (!is_load)
                c->tmu_dirty_rcl = true;

        if (is_global)
                c->has_global_address = true;

        bool has_index = !is_shared_or_scratch && !is_global;

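        /* Figure out which source holds the offset/address: loads and atomics
         * take it right after the buffer index (if any), while stores take
         * the value to write in src[0], so their offset comes after that.
         */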
        int offset_src;
        if (instr->intrinsic == nir_intrinsic_load_uniform) {
                offset_src = 0;
        } else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
                   instr->intrinsic == nir_intrinsic_load_ubo ||
                   instr->intrinsic == nir_intrinsic_load_scratch ||
                   instr->intrinsic == nir_intrinsic_load_shared ||
                   instr->intrinsic == nir_intrinsic_load_global_2x32 ||
                   atomic_add_replaced) {
                offset_src = 0 + has_index;
        } else if (is_store) {
                offset_src = 1 + has_index;
        } else {
                offset_src = 0 + has_index;
        }

        bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);
        uint32_t const_offset = 0;
        if (!dynamic_src)
                const_offset = nir_src_as_uint(instr->src[offset_src]);

        struct qreg base_offset;
        if (instr->intrinsic == nir_intrinsic_load_uniform) {
                const_offset += nir_intrinsic_base(instr);
                base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
                                          v3d_unit_data_create(0, const_offset));
                const_offset = 0;
        } else if (instr->intrinsic == nir_intrinsic_load_ubo) {
                /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 (0
                 * is gallium's constant buffer 0 in GL and push constants
                 * in Vulkan).
                 */
                uint32_t index = nir_src_as_uint(instr->src[0]) + 1;
                base_offset =
                        vir_uniform(c, QUNIFORM_UBO_ADDR,
                                    v3d_unit_data_create(index, const_offset));
                const_offset = 0;
        } else if (is_shared_or_scratch) {
                /* Shared and scratch variables have no buffer index, and all
                 * start from a common base that we set up at the start of
                 * dispatch.
                 */
                if (instr->intrinsic == nir_intrinsic_load_scratch ||
                    instr->intrinsic == nir_intrinsic_store_scratch) {
                        base_offset = c->spill_base;
                } else {
                        base_offset = c->cs_shared_offset;
                        const_offset += nir_intrinsic_base(instr);
                }
        } else if (is_global) {
                /* Global load/store intrinsics use global addresses, so the
                 * offset is the target address and we don't need to add it
                 * to a base offset.
                 */
                base_offset = vir_uniform_ui(c, 0);
        } else {
                uint32_t idx = is_store ? 1 : 0;
                base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
                                          nir_src_comp_as_uint(instr->src[idx], 0));
        }

        /* We are ready to emit TMU register writes now, but before we actually
         * emit them we need to flush outstanding TMU operations if any of our
         * writes reads from the result of an outstanding TMU operation, since
         * flushing in the middle of the TMU sequence we are about to emit
         * would be illegal. To do this we run this logic twice: the first pass
         * counts the required register writes and flushes pending TMU requests
         * if necessary due to a dependency, and the second pass emits the
         * actual TMU writes.
         */
        const uint32_t dest_components = nir_intrinsic_dest_components(instr);
        uint32_t base_const_offset = const_offset;
        uint32_t writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
        uint32_t tmu_writes = 0;
        for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) {
                assert(mode == MODE_COUNT || tmu_writes > 0);

                uint32_t type_size = 4;

                if (is_store) {
                        emit_tmu_general_store_writes(c, mode, instr,
                                                      base_const_offset,
                                                      &writemask,
                                                      &const_offset,
                                                      &type_size,
                                                      &tmu_writes);
                } else if (!is_load && !atomic_add_replaced) {
                        emit_tmu_general_atomic_writes(c, mode, instr,
                                                       tmu_op, has_index,
                                                       &tmu_writes);
                } else if (is_load) {
                        type_size = instr->def.bit_size / 8;
                }

                /* For atomics we use 32bit, except for CMPXCHG, for which we
                 * need to use VEC2. For the rest of the cases we use the
                 * number of tmud writes we did to decide the type. For cache
                 * operations the type is ignored.
                 */
                uint32_t config = 0;
                if (mode == MODE_EMIT) {
                        uint32_t num_components;
                        if (is_load || atomic_add_replaced) {
                                num_components = instr->num_components;
                        } else {
                                assert(tmu_writes > 0);
                                num_components = tmu_writes - 1;
                        }
                        bool is_atomic =
                                v3d_tmu_get_type_from_op(tmu_op, !is_load) ==
                                V3D_TMU_OP_TYPE_ATOMIC;

                        /* Only load per-quad if we can be certain that all
                         * lines in the quad are active. Notice that demoted
                         * invocations, unlike terminated ones, are still
                         * active: we want to skip memory writes for them but
                         * loads should still work.
                         */
                        uint32_t perquad =
                                is_load && !vir_in_nonuniform_control_flow(c) &&
                                !c->emitted_discard ?
                                GENERAL_TMU_LOOKUP_PER_QUAD :
                                GENERAL_TMU_LOOKUP_PER_PIXEL;
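
                        /* All bytes of the config word other than the one
                         * holding the op/type/per-quad settings are left as
                         * 0xff (defaults); if the whole word ends up as ~0,
                         * emit_tmu_general_address_write() can use plain TMUA
                         * and skip the config uniform altogether.
                         */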
                        config = 0xffffff00 | tmu_op << 3 | perquad;

                        if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
                                config |= GENERAL_TMU_LOOKUP_TYPE_VEC2;
                        } else if (is_atomic || num_components == 1) {
                                switch (type_size) {
                                case 4:
                                        config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
                                        break;
                                case 2:
                                        config |= GENERAL_TMU_LOOKUP_TYPE_16BIT_UI;
                                        break;
                                case 1:
                                        config |= GENERAL_TMU_LOOKUP_TYPE_8BIT_UI;
                                        break;
                                default:
                                        unreachable("Unsupported bitsize");
                                }
                        } else {
                                assert(type_size == 4);
                                config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
                                          num_components - 2;
                        }
                }

                emit_tmu_general_address_write(c, mode, instr, config,
                                               dynamic_src, offset_src,
                                               base_offset, const_offset,
                                               dest_components, &tmu_writes);

                assert(tmu_writes > 0);
                if (mode == MODE_COUNT) {
                        /* Make sure we won't exceed the 16-entry TMU
                         * fifo if each thread is storing at the same
                         * time.
                         */
                        while (tmu_writes > 16 / c->threads)
                                c->threads /= 2;

                        /* If pipelining this TMU operation would
                         * overflow TMU fifos, we need to flush.
                         */
                        if (ntq_tmu_fifo_overflow(c, dest_components))
                                ntq_flush_tmu(c);
                } else {
                        /* Delay emission of the thread switch and
                         * LDTMU/TMUWT until we really need to do it to
                         * improve pipelining.
                         */
                        const uint32_t component_mask =
                                (1 << dest_components) - 1;
                        ntq_add_pending_tmu_flush(c, &instr->def,
                                                  component_mask);
                }
        }

        /* nir_lower_wrmasks should've ensured that any writemask on a store
         * operation only has consecutive bits set, in which case we should've
         * processed the full writemask above.
         */
        assert(writemask == 0);
}

static struct qreg *
ntq_init_ssa_def(struct v3d_compile *c, nir_def *def)
{
        struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
                                          def->num_components);
        _mesa_hash_table_insert(c->def_ht, def, qregs);
        return qregs;
}

static bool
is_ld_signal(const struct v3d_qpu_sig *sig)
{
        return (sig->ldunif ||
                sig->ldunifa ||
                sig->ldunifrf ||
                sig->ldunifarf ||
                sig->ldtmu ||
                sig->ldvary ||
                sig->ldvpm ||
                sig->ldtlb ||
                sig->ldtlbu);
}

static inline bool
is_ldunif_signal(const struct v3d_qpu_sig *sig)
{
        return sig->ldunif || sig->ldunifrf;
}

/**
 * This function is responsible for getting VIR results into the associated
 * storage for a NIR instruction.
 *
 * If it's a NIR SSA def, then we just set the associated hash table entry to
 * the new result.
 *
 * If it's a NIR reg, then we need to update the existing qreg assigned to the
 * NIR destination with the incoming value.  To do that without introducing
 * new MOVs, we require that the incoming qreg either be a uniform, or be
 * SSA-defined by the previous VIR instruction in the block and rewritable by
 * this function.  That lets us sneak ahead and insert the SF flag beforehand
 * (knowing that the previous instruction doesn't depend on flags) and rewrite
 * its destination to be the NIR reg's destination.
 */
void
ntq_store_def(struct v3d_compile *c, nir_def *def, int chan,
              struct qreg result)
{
        struct qinst *last_inst = NULL;
        if (!list_is_empty(&c->cur_block->instructions))
                last_inst = (struct qinst *)c->cur_block->instructions.prev;

        bool is_reused_uniform =
                is_ldunif_signal(&c->defs[result.index]->qpu.sig) &&
                last_inst != c->defs[result.index];

        assert(result.file == QFILE_TEMP && last_inst &&
               (last_inst == c->defs[result.index] || is_reused_uniform));

        nir_intrinsic_instr *store = nir_store_reg_for_def(def);
        if (store == NULL) {
                assert(chan < def->num_components);

                struct qreg *qregs;
                struct hash_entry *entry =
                        _mesa_hash_table_search(c->def_ht, def);

                if (entry)
                        qregs = entry->data;
                else
                        qregs = ntq_init_ssa_def(c, def);

                qregs[chan] = result;
        } else {
                nir_def *reg = store->src[1].ssa;
                ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
                assert(nir_intrinsic_base(store) == 0);
                assert(nir_intrinsic_num_array_elems(decl) == 0);
                struct hash_entry *entry =
                        _mesa_hash_table_search(c->def_ht, reg);
                struct qreg *qregs = entry->data;

                /* If the previous instruction can't be predicated for
                 * the store into the nir_register, then emit a MOV
                 * that can be.
                 */
                if (is_reused_uniform ||
                    (vir_in_nonuniform_control_flow(c) &&
                     is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig))) {
                        result = vir_MOV(c, result);
                        last_inst = c->defs[result.index];
                }

                /* We know they're both temps, so just rewrite index. */
                c->defs[last_inst->dst.index] = NULL;
                last_inst->dst.index = qregs[chan].index;

                /* If we're in control flow, then make this update of the reg
                 * conditional on the execution mask.
                 */
                if (vir_in_nonuniform_control_flow(c)) {
                        last_inst->dst.index = qregs[chan].index;

                        /* Set the flags to the current exec mask.
                         */
                        c->cursor = vir_before_inst(last_inst);
                        vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
                                   V3D_QPU_PF_PUSHZ);
                        c->cursor = vir_after_inst(last_inst);

                        vir_set_cond(last_inst, V3D_QPU_COND_IFA);
                }
        }
}

/**
 * This looks up the qreg associated with a particular ssa/reg used as a source
 * in any instruction.
 *
 * It is expected that the definition for any NIR value read as a source has
 * been emitted by a previous instruction. However, in the case of TMU
 * operations we may have postponed emission of the thread switch and LDTMUs
 * required to read the TMU results until the results are actually used, to
 * improve pipelining. That would lead to us not finding the value here (for
 * SSA defs) or finding it in the list of registers awaiting a TMU flush (for
 * registers), in which case we need to flush outstanding TMU operations to
 * read the correct value.
 */
struct qreg
ntq_get_src(struct v3d_compile *c, nir_src src, int i)
{
        struct hash_entry *entry;

        nir_intrinsic_instr *load = nir_load_reg_for_def(src.ssa);
        if (load == NULL) {
                assert(i < src.ssa->num_components);

                entry = _mesa_hash_table_search(c->def_ht, src.ssa);
                if (!entry) {
                        ntq_flush_tmu(c);
                        entry = _mesa_hash_table_search(c->def_ht, src.ssa);
                }
        } else {
                nir_def *reg = load->src[0].ssa;
                ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
                assert(nir_intrinsic_base(load) == 0);
                assert(nir_intrinsic_num_array_elems(decl) == 0);
                assert(i < nir_intrinsic_num_components(decl));

                if (_mesa_set_search(c->tmu.outstanding_regs, reg))
                        ntq_flush_tmu(c);
                entry = _mesa_hash_table_search(c->def_ht, reg);
        }
        assert(entry);

        struct qreg *qregs = entry->data;
        return qregs[i];
}

static struct qreg
ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
                unsigned src)
{
        struct qreg r = ntq_get_src(c, instr->src[src].src,
                                    instr->src[src].swizzle[0]);

        return r;
}

static struct qreg
ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)
{
        return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1));
}

static void
ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
{
        unsigned unit = instr->texture_index;
        int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod);
        int dest_size = nir_tex_instr_dest_size(instr);

        struct qreg lod = c->undef;
        if (lod_index != -1)
                lod = ntq_get_src(c, instr->src[lod_index].src, 0);

        for (int i = 0; i < dest_size; i++) {
                assert(i < 3);
                enum quniform_contents contents;

                if (instr->is_array && i == dest_size - 1)
                        contents = QUNIFORM_TEXTURE_ARRAY_SIZE;
                else
                        contents = QUNIFORM_TEXTURE_WIDTH + i;

                struct qreg size = vir_uniform(c, contents, unit);

                switch (instr->sampler_dim) {
                case GLSL_SAMPLER_DIM_1D:
                case GLSL_SAMPLER_DIM_2D:
                case GLSL_SAMPLER_DIM_MS:
                case GLSL_SAMPLER_DIM_3D:
                case GLSL_SAMPLER_DIM_CUBE:
                case GLSL_SAMPLER_DIM_BUF:
                case GLSL_SAMPLER_DIM_EXTERNAL:
                        /* Don't minify the array size. */
                        if (!(instr->is_array && i == dest_size - 1)) {
                                size = ntq_minify(c, size, lod);
                        }
                        break;

                case GLSL_SAMPLER_DIM_RECT:
                        /* There's no LOD field for rects */
                        break;

                default:
                        unreachable("Bad sampler type");
                }

                ntq_store_def(c, &instr->def, i, size);
        }
}

static void
ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
{
        unsigned unit = instr->texture_index;

        /* Since each texture sampling op requires uploading uniforms to
         * reference the texture, there's no HW support for texture size
         * queries; we just upload uniforms containing the size instead.
         */
        switch (instr->op) {
        case nir_texop_query_levels:
                ntq_store_def(c, &instr->def, 0,
                              vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
                return;
        case nir_texop_texture_samples:
                ntq_store_def(c, &instr->def, 0,
                              vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit));
                return;
        case nir_texop_txs:
                ntq_emit_txs(c, instr);
                return;
        default:
                break;
        }

        v3d_vir_emit_tex(c, instr);
}

static struct qreg
ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)
{
        struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI));
        if (is_cos)
                input = vir_FADD(c, input, vir_uniform_f(c, 0.5));

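        /* Range-reduce the argument: 'periods' is the number of half-turns
         * rounded to the nearest integer, and the SIN unit evaluates
         * sin(pi * x) on the remaining fraction. The XOR below flips the sign
         * of the result for odd period counts by shifting the integer period
         * count so its low bit effectively lands in the float sign-bit
         * position (the -1 shift amount acts as a shift of 31).
         */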
        struct qreg periods = vir_FROUND(c, input);
        struct qreg sin_output = vir_SIN(c, vir_FSUB(c, input, periods));
        return vir_XOR(c, sin_output, vir_SHL(c,
                                              vir_FTOIN(c, periods),
                                              vir_uniform_ui(c, -1)));
}

static struct qreg
ntq_fsign(struct v3d_compile *c, struct qreg src)
{
        struct qreg t = vir_get_temp(c);

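        /* Build sign(src) with conditional moves: start at 0.0, overwrite
         * with 1.0 if src is non-zero, then with -1.0 if src is negative.
         */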
        vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
        vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHZ);
        vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
        vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHN);
        vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
        return vir_MOV(c, t);
}

static void
emit_fragcoord_input(struct v3d_compile *c, int attr)
{
        c->inputs[attr * 4 + 0] = vir_FXCD(c);
        c->inputs[attr * 4 + 1] = vir_FYCD(c);
        c->inputs[attr * 4 + 2] = c->payload_z;
        c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w);
}

static struct qreg
emit_smooth_varying(struct v3d_compile *c,
                    struct qreg vary, struct qreg w, struct qreg c_reg)
{
        return vir_FADD(c, vir_FMUL(c, vary, w), c_reg);
}

static struct qreg
emit_noperspective_varying(struct v3d_compile *c,
                           struct qreg vary, struct qreg c_reg)
{
        return vir_FADD(c, vir_MOV(c, vary), c_reg);
}

static struct qreg
emit_flat_varying(struct v3d_compile *c,
                  struct qreg vary, struct qreg c_reg)
{
        vir_MOV_dest(c, c->undef, vary);
        return vir_MOV(c, c_reg);
}

static struct qreg
emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
                      int8_t input_idx, uint8_t swizzle, int array_index)
{
        struct qreg c_reg; /* C coefficient */

        if (c->devinfo->has_accumulators)
                c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
        else
                c_reg = vir_reg(QFILE_REG, 0);

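        /* The ldvary signal streams in the varying's per-fragment slope term,
         * while the C coefficient is delivered in r5 (or physical register 0
         * on parts without accumulators); perspective-correct varyings are
         * then computed as vary * W + C.
         */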
        struct qinst *ldvary = NULL;
        struct qreg vary;
        ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
                              c->undef, c->undef);
        ldvary->qpu.sig.ldvary = true;
        vary = vir_emit_def(c, ldvary);

        /* Store the input value before interpolation so we can implement
         * GLSL's interpolateAt functions if the shader uses them.
         */
        if (input_idx >= 0) {
                assert(var);
                c->interp[input_idx].vp = vary;
                c->interp[input_idx].C = vir_MOV(c, c_reg);
                c->interp[input_idx].mode = var->data.interpolation;
        }

        /* For gl_PointCoord input or distance along a line, we'll be called
         * with no nir_variable, and these don't count toward VPM size, so we
         * don't track an input slot.
         */
        if (!var) {
                assert(input_idx < 0);
                return emit_smooth_varying(c, vary, c->payload_w, c_reg);
        }

        int i = c->num_inputs++;
        c->input_slots[i] =
                v3d_slot_from_slot_and_component(var->data.location +
                                                 array_index, swizzle);

        struct qreg result;
        switch (var->data.interpolation) {
        case INTERP_MODE_NONE:
        case INTERP_MODE_SMOOTH:
                if (var->data.centroid) {
                        BITSET_SET(c->centroid_flags, i);
                        result = emit_smooth_varying(c, vary,
                                                     c->payload_w_centroid, c_reg);
                } else {
                        result = emit_smooth_varying(c, vary, c->payload_w, c_reg);
                }
                break;

        case INTERP_MODE_NOPERSPECTIVE:
                BITSET_SET(c->noperspective_flags, i);
                result = emit_noperspective_varying(c, vary, c_reg);
                break;

        case INTERP_MODE_FLAT:
                BITSET_SET(c->flat_shade_flags, i);
                result = emit_flat_varying(c, vary, c_reg);
                break;

        default:
                unreachable("Bad interp mode");
        }

        if (input_idx >= 0)
                c->inputs[input_idx] = result;
        return result;
}

static void
emit_fragment_input(struct v3d_compile *c, int base_attr, nir_variable *var,
                    int array_index, unsigned nelem)
{
        for (int i = 0; i < nelem; i++) {
                int chan = var->data.location_frac + i;
                int input_idx = (base_attr + array_index) * 4 + chan;
                emit_fragment_varying(c, var, input_idx, chan, array_index);
        }
}

static void
emit_compact_fragment_input(struct v3d_compile *c, int attr, nir_variable *var,
                            int array_index)
{
        /* Compact variables are scalar arrays where each set of 4 elements
         * consumes a single location.
         */
        int loc_offset = array_index / 4;
        int chan = var->data.location_frac + array_index % 4;
        int input_idx = (attr + loc_offset) * 4 + chan;
        emit_fragment_varying(c, var, input_idx, chan, loc_offset);
}

static void
add_output(struct v3d_compile *c,
           uint32_t decl_offset,
           uint8_t slot,
           uint8_t swizzle)
{
        uint32_t old_array_size = c->outputs_array_size;
        resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
                          decl_offset + 1);

        if (old_array_size != c->outputs_array_size) {
                c->output_slots = reralloc(c,
                                           c->output_slots,
                                           struct v3d_varying_slot,
                                           c->outputs_array_size);
        }

        c->output_slots[decl_offset] =
                v3d_slot_from_slot_and_component(slot, swizzle);
}

/**
 * If compare_instr is a valid comparison instruction, emits the comparison
 * and returns true, setting *out_cond to the condition code the caller should
 * use to predicate on its result.
 */
static bool
ntq_emit_comparison(struct v3d_compile *c,
                    nir_alu_instr *compare_instr,
                    enum v3d_qpu_cond *out_cond)
{
        struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
        struct qreg src1;
        if (nir_op_infos[compare_instr->op].num_inputs > 1)
                src1 = ntq_get_alu_src(c, compare_instr, 1);
        bool cond_invert = false;
        struct qreg nop = vir_nop_reg();

        switch (compare_instr->op) {
        case nir_op_feq32:
        case nir_op_seq:
                vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                break;
        case nir_op_ieq32:
                vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                break;

        case nir_op_fneu32:
        case nir_op_sne:
                vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                cond_invert = true;
                break;
        case nir_op_ine32:
                vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                cond_invert = true;
                break;

        case nir_op_fge32:
        case nir_op_sge:
                vir_set_pf(c, vir_FCMP_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
                break;
        case nir_op_ige32:
                vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
                cond_invert = true;
                break;
        case nir_op_uge32:
                vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
                cond_invert = true;
                break;

        case nir_op_slt:
        case nir_op_flt32:
                vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHN);
                break;
        case nir_op_ilt32:
                vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
                break;
        case nir_op_ult32:
                vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
                break;

        default:
                return false;
        }

        *out_cond = cond_invert ? V3D_QPU_COND_IFNA : V3D_QPU_COND_IFA;

        return true;
}

/* Finds an ALU instruction that generates our src value that could
 * (potentially) be greedily emitted in the consuming instruction.
 */
static struct nir_alu_instr *
ntq_get_alu_parent(nir_src src)
{
        if (src.ssa->parent_instr->type != nir_instr_type_alu)
                return NULL;
        nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr);
        if (!instr)
                return NULL;

        /* If the ALU instr's srcs are non-SSA, then we would have to avoid
         * moving emission of the ALU instr down past another write of the
         * src.
         */
        for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
                if (nir_load_reg_for_def(instr->src[i].src.ssa))
                        return NULL;
        }

        return instr;
}

/* Turns a NIR bool into a condition code to predicate on. */
static enum v3d_qpu_cond
ntq_emit_bool_to_cond(struct v3d_compile *c, nir_src src)
{
        struct qreg qsrc = ntq_get_src(c, src, 0);
        /* skip if we already have src in the flags */
        if (qsrc.file == QFILE_TEMP && c->flags_temp == qsrc.index)
                return c->flags_cond;

        nir_alu_instr *compare = ntq_get_alu_parent(src);
        if (!compare)
                goto out;

        enum v3d_qpu_cond cond;
        if (ntq_emit_comparison(c, compare, &cond))
                return cond;

out:

        vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), ntq_get_src(c, src, 0)),
                   V3D_QPU_PF_PUSHZ);
        return V3D_QPU_COND_IFNA;
}

1270 static struct qreg
ntq_emit_cond_to_bool(struct v3d_compile * c,enum v3d_qpu_cond cond)1271 ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond)
1272 {
1273         struct qreg result =
1274                 vir_MOV(c, vir_SEL(c, cond,
1275                                    vir_uniform_ui(c, ~0),
1276                                    vir_uniform_ui(c, 0)));
1277         c->flags_temp = result.index;
1278         c->flags_cond = cond;
1279         return result;
1280 }
1281 
1282 static struct qreg
1283 ntq_emit_cond_to_int(struct v3d_compile *c, enum v3d_qpu_cond cond)
1284 {
1285         struct qreg result =
1286                 vir_MOV(c, vir_SEL(c, cond,
1287                                    vir_uniform_ui(c, 1),
1288                                    vir_uniform_ui(c, 0)));
1289         c->flags_temp = result.index;
1290         c->flags_cond = cond;
1291         return result;
1292 }
1293 
1294 static struct qreg
1295 f2f16_rtz(struct v3d_compile *c, struct qreg f32)
1296 {
1297    /* The GPU doesn't provide a mechanism to modify the f32->f16 rounding
1298     * method and seems to be using RTE by default, so we need to implement
1299     * RTZ rounding in software.
1300     */
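   /* The approach used below: pack to f16 (which rounds to nearest even),
    * unpack back to f32 and compare magnitudes. If the round-trip value is
    * larger in magnitude than the original, RTE rounded away from zero, so
    * subtracting 1 from the f16 bit pattern steps it one ULP back toward
    * zero, which is the RTZ result.
    */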
1301    struct qreg rf16 = vir_FMOV(c, f32);
1302    vir_set_pack(c->defs[rf16.index], V3D_QPU_PACK_L);
1303 
1304    struct qreg rf32 = vir_FMOV(c, rf16);
1305    vir_set_unpack(c->defs[rf32.index], 0, V3D_QPU_UNPACK_L);
1306 
1307    struct qreg f32_abs = vir_FMOV(c, f32);
1308    vir_set_unpack(c->defs[f32_abs.index], 0, V3D_QPU_UNPACK_ABS);
1309 
1310    struct qreg rf32_abs = vir_FMOV(c, rf32);
1311    vir_set_unpack(c->defs[rf32_abs.index], 0, V3D_QPU_UNPACK_ABS);
1312 
1313    vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), f32_abs, rf32_abs),
1314               V3D_QPU_PF_PUSHN);
1315    return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
1316                   vir_SUB(c, rf16, vir_uniform_ui(c, 1)), rf16));
1317 }
1318 
1319 /**
1320  * Takes the result value of a signed integer width conversion from a smaller
1321  * type to a larger type and, if needed, applies sign extension to it.
1322  */
1323 static struct qreg
1324 sign_extend(struct v3d_compile *c,
1325             struct qreg value,
1326             uint32_t src_bit_size,
1327             uint32_t dst_bit_size)
1328 {
1329         assert(src_bit_size < dst_bit_size);
1330 
1331         struct qreg tmp = vir_MOV(c, value);
1332 
1333         /* Do we need to sign-extend? */
1334         uint32_t sign_mask = 1 << (src_bit_size - 1);
1335         struct qinst *sign_check =
1336                 vir_AND_dest(c, vir_nop_reg(),
1337                              tmp, vir_uniform_ui(c, sign_mask));
1338         vir_set_pf(c, sign_check, V3D_QPU_PF_PUSHZ);
1339 
1340         /* If so, fill in leading sign bits */
1341         uint32_t extend_bits = ~(((1 << src_bit_size) - 1)) &
1342                                ((1ull << dst_bit_size) - 1);
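        /* For example, an 8 -> 32 bit extension uses sign_mask = 0x80 and
         * extend_bits = 0xffffff00, so an input of 0x85 (-123) becomes
         * 0xffffff85 while 0x7f (+127) is left untouched.
         */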
1343         struct qinst *extend_inst =
1344                 vir_OR_dest(c, tmp, tmp,
1345                             vir_uniform_ui(c, extend_bits));
1346         vir_set_cond(extend_inst, V3D_QPU_COND_IFNA);
1347 
1348         return tmp;
1349 }
1350 
1351 static void
1352 ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
1353 {
1354         /* Vectors are special in that they have non-scalarized writemasks,
1355          * and just take the first swizzle channel for each argument in order
1356          * into each writemask channel.
1357          */
1358         if (instr->op == nir_op_vec2 ||
1359             instr->op == nir_op_vec3 ||
1360             instr->op == nir_op_vec4) {
1361                 struct qreg srcs[4];
1362                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1363                         srcs[i] = ntq_get_src(c, instr->src[i].src,
1364                                               instr->src[i].swizzle[0]);
1365                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1366                         ntq_store_def(c, &instr->def, i,
1367                                       vir_MOV(c, srcs[i]));
1368                 return;
1369         }
1370 
1371         /* General case: We can just grab the one used channel per src. */
1372         struct qreg src[nir_op_infos[instr->op].num_inputs];
1373         for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1374                 src[i] = ntq_get_alu_src(c, instr, i);
1375         }
1376 
1377         struct qreg result;
1378 
1379         switch (instr->op) {
1380         case nir_op_mov:
1381                 result = vir_MOV(c, src[0]);
1382                 break;
1383 
1384         case nir_op_fneg:
1385                 result = vir_XOR(c, src[0], vir_uniform_ui(c, 1 << 31));
1386                 break;
1387         case nir_op_ineg:
1388                 result = vir_NEG(c, src[0]);
1389                 break;
1390 
1391         case nir_op_fmul:
1392                 result = vir_FMUL(c, src[0], src[1]);
1393                 break;
1394         case nir_op_fadd:
1395                 result = vir_FADD(c, src[0], src[1]);
1396                 break;
1397         case nir_op_fsub:
1398                 result = vir_FSUB(c, src[0], src[1]);
1399                 break;
1400         case nir_op_fmin:
1401                 result = vir_FMIN(c, src[0], src[1]);
1402                 break;
1403         case nir_op_fmax:
1404                 result = vir_FMAX(c, src[0], src[1]);
1405                 break;
1406 
1407         case nir_op_f2i32: {
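                /* If the source is a round-to-nearest-even, fold it into a
                 * single FTOIN (convert with round-to-nearest) instead of
                 * emitting the round followed by a truncating FTOIZ.
                 */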
1408                 nir_alu_instr *src0_alu = ntq_get_alu_parent(instr->src[0].src);
1409                 if (src0_alu && src0_alu->op == nir_op_fround_even) {
1410                         result = vir_FTOIN(c, ntq_get_alu_src(c, src0_alu, 0));
1411                 } else {
1412                         result = vir_FTOIZ(c, src[0]);
1413                 }
1414                 break;
1415         }
1416 
1417         case nir_op_f2u32:
1418                 result = vir_FTOUZ(c, src[0]);
1419                 break;
1420         case nir_op_i2f32:
1421                 result = vir_ITOF(c, src[0]);
1422                 break;
1423         case nir_op_u2f32:
1424                 result = vir_UTOF(c, src[0]);
1425                 break;
1426         case nir_op_b2f32:
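                /* NIR booleans are 0 or ~0, so ANDing with the bit pattern
                 * of 1.0f yields exactly 0.0f or 1.0f.
                 */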
1427                 result = vir_AND(c, src[0], vir_uniform_f(c, 1.0));
1428                 break;
1429         case nir_op_b2i32:
1430                 result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
1431                 break;
1432 
1433         case nir_op_f2f16:
1434         case nir_op_f2f16_rtne:
1435                 assert(nir_src_bit_size(instr->src[0].src) == 32);
1436                 result = vir_FMOV(c, src[0]);
1437                 vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L);
1438                 break;
1439 
1440         case nir_op_f2f16_rtz:
1441                 assert(nir_src_bit_size(instr->src[0].src) == 32);
1442                 result = f2f16_rtz(c, src[0]);
1443                 break;
1444 
1445         case nir_op_f2f32:
1446                 assert(nir_src_bit_size(instr->src[0].src) == 16);
1447                 result = vir_FMOV(c, src[0]);
1448                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1449                 break;
1450 
1451         case nir_op_i2i16: {
1452                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1453                 assert(bit_size == 32 || bit_size == 8);
1454                 if (bit_size == 32) {
1455                         /* We don't have integer pack/unpack methods for
1456                          * converting between 16-bit and 32-bit, so we implement
1457                          * the conversion manually by truncating the src.
1458                          */
1459                         result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff));
1460                 } else {
1461                         struct qreg tmp = vir_AND(c, src[0],
1462                                                   vir_uniform_ui(c, 0xff));
1463                         result = vir_MOV(c, sign_extend(c, tmp, bit_size, 16));
1464                 }
1465                 break;
1466         }
1467 
1468         case nir_op_u2u16: {
1469                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1470                 assert(bit_size == 32 || bit_size == 8);
1471 
1472                 /* We don't have integer pack/unpack methods for converting
1473                  * between 16-bit and 32-bit, so we implement the conversion
1474                  * manually by truncating the src. For the 8-bit case, we
1475                  * want to make sure we don't copy garbage from any of the
1476                  * 24 MSB bits.
1477                  */
1478                 if (bit_size == 32)
1479                         result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff));
1480                 else
1481                         result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff));
1482                 break;
1483         }
1484 
1485         case nir_op_i2i8:
1486         case nir_op_u2u8:
1487                 assert(nir_src_bit_size(instr->src[0].src) == 32 ||
1488                        nir_src_bit_size(instr->src[0].src) == 16);
1489                 /* We don't have integer pack/unpack methods for converting
1490                  * between 8-bit and 32-bit, so we implement the conversion
1491                  * manually by truncating the src.
1492                  */
1493                 result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff));
1494                 break;
1495 
1496         case nir_op_u2u32: {
1497                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1498                 assert(bit_size == 16 || bit_size == 8);
1499 
1500                 /* We don't have a native 8-bit/16-bit MOV, so we copy all 32 bits
1501                  * from the src but make sure to clear any garbage bits that
1502                  * may be present in the invalid src bits.
1503                  */
1504                 uint32_t mask = (1 << bit_size) - 1;
1505                 result = vir_AND(c, src[0], vir_uniform_ui(c, mask));
1506                 break;
1507         }
1508 
1509         case nir_op_i2i32: {
1510                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1511                 assert(bit_size == 16 || bit_size == 8);
1512 
1513                 uint32_t mask = (1 << bit_size) - 1;
1514                 struct qreg tmp = vir_AND(c, src[0],
1515                                           vir_uniform_ui(c, mask));
1516 
1517                 result = vir_MOV(c, sign_extend(c, tmp, bit_size, 32));
1518                 break;
1519         }
1520 
1521         case nir_op_iadd:
1522                 result = vir_ADD(c, src[0], src[1]);
1523                 break;
1524         case nir_op_ushr:
1525                 result = vir_SHR(c, src[0], src[1]);
1526                 break;
1527         case nir_op_isub:
1528                 result = vir_SUB(c, src[0], src[1]);
1529                 break;
1530         case nir_op_ishr:
1531                 result = vir_ASR(c, src[0], src[1]);
1532                 break;
1533         case nir_op_ishl:
1534                 result = vir_SHL(c, src[0], src[1]);
1535                 break;
1536         case nir_op_imin:
1537                 result = vir_MIN(c, src[0], src[1]);
1538                 break;
1539         case nir_op_umin:
1540                 result = vir_UMIN(c, src[0], src[1]);
1541                 break;
1542         case nir_op_imax:
1543                 result = vir_MAX(c, src[0], src[1]);
1544                 break;
1545         case nir_op_umax:
1546                 result = vir_UMAX(c, src[0], src[1]);
1547                 break;
1548         case nir_op_iand:
1549                 result = vir_AND(c, src[0], src[1]);
1550                 break;
1551         case nir_op_ior:
1552                 result = vir_OR(c, src[0], src[1]);
1553                 break;
1554         case nir_op_ixor:
1555                 result = vir_XOR(c, src[0], src[1]);
1556                 break;
1557         case nir_op_inot:
1558                 result = vir_NOT(c, src[0]);
1559                 break;
1560 
1561         case nir_op_ufind_msb:
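                /* The index of the most significant set bit is 31 minus the
                 * leading-zero count.
                 */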
1562                 result = vir_SUB(c, vir_uniform_ui(c, 31), vir_CLZ(c, src[0]));
1563                 break;
1564 
1565         case nir_op_imul:
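                /* The low 32 bits of the product are the same for signed and
                 * unsigned operands, so UMUL covers imul.
                 */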
1566                 result = vir_UMUL(c, src[0], src[1]);
1567                 break;
1568 
1569         case nir_op_seq:
1570         case nir_op_sne:
1571         case nir_op_sge:
1572         case nir_op_slt: {
1573                 enum v3d_qpu_cond cond;
1574                 ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
1575                 assert(ok);
1576                 result = vir_MOV(c, vir_SEL(c, cond,
1577                                             vir_uniform_f(c, 1.0),
1578                                             vir_uniform_f(c, 0.0)));
1579                 c->flags_temp = result.index;
1580                 c->flags_cond = cond;
1581                 break;
1582         }
1583 
1584         case nir_op_feq32:
1585         case nir_op_fneu32:
1586         case nir_op_fge32:
1587         case nir_op_flt32:
1588         case nir_op_ieq32:
1589         case nir_op_ine32:
1590         case nir_op_ige32:
1591         case nir_op_uge32:
1592         case nir_op_ilt32:
1593         case nir_op_ult32: {
1594                 enum v3d_qpu_cond cond;
1595                 ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
1596                 assert(ok);
1597                 result = ntq_emit_cond_to_bool(c, cond);
1598                 break;
1599         }
1600 
1601         case nir_op_b32csel:
1602                 result = vir_MOV(c,
1603                                  vir_SEL(c,
1604                                          ntq_emit_bool_to_cond(c, instr->src[0].src),
1605                                          src[1], src[2]));
1606                 break;
1607 
1608         case nir_op_fcsel:
1609                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), src[0]),
1610                            V3D_QPU_PF_PUSHZ);
1611                 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
1612                                             src[1], src[2]));
1613                 break;
1614 
1615         case nir_op_frcp:
1616                 result = vir_RECIP(c, src[0]);
1617                 break;
1618         case nir_op_frsq:
1619                 result = vir_RSQRT(c, src[0]);
1620                 break;
1621         case nir_op_fexp2:
1622                 result = vir_EXP(c, src[0]);
1623                 break;
1624         case nir_op_flog2:
1625                 result = vir_LOG(c, src[0]);
1626                 break;
1627 
1628         case nir_op_fceil:
1629                 result = vir_FCEIL(c, src[0]);
1630                 break;
1631         case nir_op_ffloor:
1632                 result = vir_FFLOOR(c, src[0]);
1633                 break;
1634         case nir_op_fround_even:
1635                 result = vir_FROUND(c, src[0]);
1636                 break;
1637         case nir_op_ftrunc:
1638                 result = vir_FTRUNC(c, src[0]);
1639                 break;
1640 
1641         case nir_op_fsin:
1642                 result = ntq_fsincos(c, src[0], false);
1643                 break;
1644         case nir_op_fcos:
1645                 result = ntq_fsincos(c, src[0], true);
1646                 break;
1647 
1648         case nir_op_fsign:
1649                 result = ntq_fsign(c, src[0]);
1650                 break;
1651 
1652         case nir_op_fabs: {
1653                 result = vir_FMOV(c, src[0]);
1654                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS);
1655                 break;
1656         }
1657 
1658         case nir_op_iabs:
1659                 result = vir_MAX(c, src[0], vir_NEG(c, src[0]));
1660                 break;
1661 
1662         case nir_op_fddx:
1663         case nir_op_fddx_coarse:
1664         case nir_op_fddx_fine:
1665                 result = vir_FDX(c, src[0]);
1666                 break;
1667 
1668         case nir_op_fddy:
1669         case nir_op_fddy_coarse:
1670         case nir_op_fddy_fine:
1671                 result = vir_FDY(c, src[0]);
1672                 break;
1673 
1674         case nir_op_uadd_carry:
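                /* PUSHC pushes the carry out of the 32-bit add, which is then
                 * materialized as 0 or 1.
                 */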
1675                 vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]),
1676                            V3D_QPU_PF_PUSHC);
1677                 result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA);
1678                 break;
1679 
1680         case nir_op_usub_borrow:
1681                 vir_set_pf(c, vir_SUB_dest(c, vir_nop_reg(), src[0], src[1]),
1682                            V3D_QPU_PF_PUSHC);
1683                 result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA);
1684                 break;
1685 
1686         case nir_op_pack_half_2x16_split:
1687                 result = vir_VFPACK(c, src[0], src[1]);
1688                 break;
1689 
1690         case nir_op_pack_2x32_to_2x16_v3d:
1691                 result = vir_VPACK(c, src[0], src[1]);
1692                 break;
1693 
1694         case nir_op_pack_32_to_r11g11b10_v3d:
1695                 result = vir_V11FPACK(c, src[0], src[1]);
1696                 break;
1697 
1698         case nir_op_pack_uint_32_to_r10g10b10a2_v3d:
1699                 result = vir_V10PACK(c, src[0], src[1]);
1700                 break;
1701 
1702         case nir_op_pack_4x16_to_4x8_v3d:
1703                 result = vir_V8PACK(c, src[0], src[1]);
1704                 break;
1705 
1706         case nir_op_unpack_half_2x16_split_x:
1707                 result = vir_FMOV(c, src[0]);
1708                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1709                 break;
1710 
1711         case nir_op_unpack_half_2x16_split_y:
1712                 result = vir_FMOV(c, src[0]);
1713                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H);
1714                 break;
1715 
1716         case nir_op_pack_2x16_to_unorm_2x8_v3d:
1717                 result = vir_VFTOUNORM8(c, src[0]);
1718                 break;
1719 
1720         case nir_op_pack_2x16_to_snorm_2x8_v3d:
1721                 result = vir_VFTOSNORM8(c, src[0]);
1722                 break;
1723 
1724         case nir_op_pack_2x16_to_unorm_2x10_v3d:
1725                 result = vir_VFTOUNORM10LO(c, src[0]);
1726                 break;
1727 
1728         case nir_op_pack_2x16_to_unorm_10_2_v3d:
1729                 result = vir_VFTOUNORM10HI(c, src[0]);
1730                 break;
1731 
1732         case nir_op_f2unorm_16_v3d:
1733                 result = vir_FTOUNORM16(c, src[0]);
1734                 break;
1735 
1736         case nir_op_f2snorm_16_v3d:
1737                 result = vir_FTOSNORM16(c, src[0]);
1738                 break;
1739 
1740         default:
1741                 fprintf(stderr, "unknown NIR ALU inst: ");
1742                 nir_print_instr(&instr->instr, stderr);
1743                 fprintf(stderr, "\n");
1744                 abort();
1745         }
1746 
1747         ntq_store_def(c, &instr->def, 0, result);
1748 }
1749 
1750 /* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
1751  * specifier.  They come from a register that's preloaded with 0xffffffff
1752  * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low
1753  * 8 bits are shifted off the bottom and 0xff shifted in from the top.
1754  */
1755 #define TLB_TYPE_F16_COLOR         (3 << 6)
1756 #define TLB_TYPE_I32_COLOR         (1 << 6)
1757 #define TLB_TYPE_F32_COLOR         (0 << 6)
1758 #define TLB_RENDER_TARGET_SHIFT    3 /* Reversed!  7 = RT 0, 0 = RT 7. */
1759 #define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2)
1760 #define TLB_SAMPLE_MODE_PER_PIXEL  (1 << 2)
1761 #define TLB_F16_SWAP_HI_LO         (1 << 1)
1762 #define TLB_VEC_SIZE_4_F16         (1 << 0)
1763 #define TLB_VEC_SIZE_2_F16         (0 << 0)
1764 #define TLB_VEC_SIZE_MINUS_1_SHIFT 0
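/* As an example of how these compose: the default 0xff specifier mentioned
 * above is TLB_TYPE_F16_COLOR (0xc0) | render target 0 ((7 - 0) << 3 = 0x38) |
 * TLB_SAMPLE_MODE_PER_PIXEL (0x04) | TLB_F16_SWAP_HI_LO (0x02) |
 * TLB_VEC_SIZE_4_F16 (0x01).
 */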
1765 
1766 /* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z"
1767  * flag is set.
1768  */
1769 #define TLB_TYPE_DEPTH             ((2 << 6) | (0 << 4))
1770 #define TLB_DEPTH_TYPE_INVARIANT   (0 << 2) /* Unmodified sideband input used */
1771 #define TLB_DEPTH_TYPE_PER_PIXEL   (1 << 2) /* QPU result used */
1772 #define TLB_V42_DEPTH_TYPE_INVARIANT   (0 << 3) /* Unmodified sideband input used */
1773 #define TLB_V42_DEPTH_TYPE_PER_PIXEL   (1 << 3) /* QPU result used */
1774 
1775 /* Stencil is a single 32-bit write. */
1776 #define TLB_TYPE_STENCIL_ALPHA     ((2 << 6) | (1 << 4))
1777 
1778 static void
1779 vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)
1780 {
1781         if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt])
1782                 return;
1783 
1784         struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB);
1785         struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
1786 
1787         nir_variable *var = c->output_color_var[rt];
1788         int num_components = glsl_get_vector_elements(var->type);
1789         uint32_t conf = 0xffffff00;
1790         struct qinst *inst;
1791 
1792         conf |= c->msaa_per_sample_output ? TLB_SAMPLE_MODE_PER_SAMPLE :
1793                                             TLB_SAMPLE_MODE_PER_PIXEL;
1794         conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
1795 
1796         if (c->fs_key->swap_color_rb & (1 << rt))
1797                 num_components = MAX2(num_components, 3);
1798         assert(num_components != 0);
1799 
1800         enum glsl_base_type type = glsl_get_base_type(var->type);
1801         bool is_int_format = type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT;
1802         bool is_32b_tlb_format = is_int_format ||
1803                                  (c->fs_key->f32_color_rb & (1 << rt));
1804 
1805         if (is_int_format) {
1806                 /* The F32 vs I32 distinction was dropped in 4.2. */
1807                 if (c->devinfo->ver < 42)
1808                         conf |= TLB_TYPE_I32_COLOR;
1809                 else
1810                         conf |= TLB_TYPE_F32_COLOR;
1811                 conf |= ((num_components - 1) << TLB_VEC_SIZE_MINUS_1_SHIFT);
1812         } else {
1813                 if (c->fs_key->f32_color_rb & (1 << rt)) {
1814                         conf |= TLB_TYPE_F32_COLOR;
1815                         conf |= ((num_components - 1) <<
1816                                 TLB_VEC_SIZE_MINUS_1_SHIFT);
1817                 } else {
1818                         conf |= TLB_TYPE_F16_COLOR;
1819                         conf |= TLB_F16_SWAP_HI_LO;
1820                         if (num_components >= 3)
1821                                 conf |= TLB_VEC_SIZE_4_F16;
1822                         else
1823                                 conf |= TLB_VEC_SIZE_2_F16;
1824                 }
1825         }
1826 
1827         int num_samples = c->msaa_per_sample_output ? V3D_MAX_SAMPLES : 1;
1828         for (int i = 0; i < num_samples; i++) {
1829                 struct qreg *color = c->msaa_per_sample_output ?
1830                         &c->sample_colors[(rt * V3D_MAX_SAMPLES + i) * 4] :
1831                         &c->outputs[var->data.driver_location * 4];
1832 
1833                 struct qreg r = color[0];
1834                 struct qreg g = color[1];
1835                 struct qreg b = color[2];
1836                 struct qreg a = color[3];
1837 
1838                 if (c->fs_key->swap_color_rb & (1 << rt))  {
1839                         r = color[2];
1840                         b = color[0];
1841                 }
1842 
1843                 if (c->fs_key->sample_alpha_to_one)
1844                         a = vir_uniform_f(c, 1.0);
1845 
1846                 if (is_32b_tlb_format) {
1847                         if (i == 0) {
1848                                 inst = vir_MOV_dest(c, tlbu_reg, r);
1849                                 inst->uniform =
1850                                         vir_get_uniform_index(c,
1851                                                               QUNIFORM_CONSTANT,
1852                                                               conf);
1853                         } else {
1854                                 vir_MOV_dest(c, tlb_reg, r);
1855                         }
1856 
1857                         if (num_components >= 2)
1858                                 vir_MOV_dest(c, tlb_reg, g);
1859                         if (num_components >= 3)
1860                                 vir_MOV_dest(c, tlb_reg, b);
1861                         if (num_components >= 4)
1862                                 vir_MOV_dest(c, tlb_reg, a);
1863                 } else {
1864                         inst = vir_VFPACK_dest(c, tlb_reg, r, g);
1865                         if (conf != ~0 && i == 0) {
1866                                 inst->dst = tlbu_reg;
1867                                 inst->uniform =
1868                                         vir_get_uniform_index(c,
1869                                                               QUNIFORM_CONSTANT,
1870                                                               conf);
1871                         }
1872 
1873                         if (num_components >= 3)
1874                                 vir_VFPACK_dest(c, tlb_reg, b, a);
1875                 }
1876         }
1877 }
1878 
1879 static void
1880 emit_frag_end(struct v3d_compile *c)
1881 {
1882         if (c->output_sample_mask_index != -1) {
1883                 vir_SETMSF_dest(c, vir_nop_reg(),
1884                                 vir_AND(c,
1885                                         vir_MSF(c),
1886                                         c->outputs[c->output_sample_mask_index]));
1887         }
1888 
1889         bool has_any_tlb_color_write = false;
1890         for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) {
1891                 if (c->fs_key->cbufs & (1 << rt) && c->output_color_var[rt])
1892                         has_any_tlb_color_write = true;
1893         }
1894 
1895         if (c->fs_key->sample_alpha_to_coverage && c->output_color_var[0]) {
1896                 struct nir_variable *var = c->output_color_var[0];
1897                 struct qreg *color = &c->outputs[var->data.driver_location * 4];
1898 
1899                 vir_SETMSF_dest(c, vir_nop_reg(),
1900                                 vir_AND(c,
1901                                         vir_MSF(c),
1902                                         vir_FTOC(c, color[3])));
1903         }
1904 
1905         struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
1906 
1907         /* If the shader has no non-TLB side effects and doesn't write Z
1908          * we can promote it to enabling early_fragment_tests even
1909          * if the user didn't.
1910          */
1911         if (c->output_position_index == -1 &&
1912             !(c->s->info.num_images || c->s->info.num_ssbos) &&
1913             !c->s->info.fs.uses_discard &&
1914             !c->s->info.fs.uses_demote &&
1915             !c->fs_key->sample_alpha_to_coverage &&
1916             c->output_sample_mask_index == -1 &&
1917             has_any_tlb_color_write) {
1918                 c->s->info.fs.early_fragment_tests = true;
1919         }
1920 
1921         /* By default, Z buffer writes are implicit using the Z values produced
1922          * from FEP (Z value produced from rasterization). When this is not
1923          * desirable (shader writes Z explicitly, has discards, etc.) we need
1924          * to let the hardware know by setting c->writes_z to true, in which
1925          * case we always need to write a Z value from the QPU, even if it is
1926          * just the passthrough Z value produced from FEP.
1927          *
1928          * Also, from the V3D 4.2 spec:
1929          *
1930          * "If a shader performs a Z read the “Fragment shader does Z writes”
1931          *  bit in the shader record must be enabled to ensure deterministic
1932          *  results"
1933          *
1934          * So if c->reads_z is set we always need to write Z, even if it is
1935          * a passthrough from the Z value produced from FEP.
1936          */
1937         if (!c->s->info.fs.early_fragment_tests || c->reads_z) {
1938                 c->writes_z = true;
1939                 uint8_t tlb_specifier = TLB_TYPE_DEPTH;
1940                 struct qinst *inst;
1941 
1942                 if (c->output_position_index != -1) {
1943                         /* Shader writes to gl_FragDepth, use that */
1944                         inst = vir_MOV_dest(c, tlbu_reg,
1945                                             c->outputs[c->output_position_index]);
1946 
1947                         tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL |
1948                                           TLB_SAMPLE_MODE_PER_PIXEL);
1949                 } else {
1950                         /* Shader doesn't write to gl_FragDepth, take Z from
1951                          * FEP.
1952                          */
1953                         c->writes_z_from_fep = true;
1954                         inst = vir_MOV_dest(c, tlbu_reg, vir_nop_reg());
1955 
1956                         /* The spec says the PER_PIXEL flag is ignored for
1957                          * invariant writes, but the simulator demands it.
1958                          */
1959                         tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT |
1960                                           TLB_SAMPLE_MODE_PER_PIXEL);
1961 
1962                         /* Since (single-threaded) fragment shaders always need
1963                          * a TLB write, if we don't have any we emit a
1964                          * passthrough Z and flag ourselves as potentially discarding,
1965                          * so that we can use Z as the required TLB write.
1966                          */
1967                         if (!has_any_tlb_color_write)
1968                                 c->s->info.fs.uses_discard = true;
1969                 }
1970 
1971                 inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
1972                                                       tlb_specifier |
1973                                                       0xffffff00);
1974                 inst->is_tlb_z_write = true;
1975         }
1976 
1977         /* XXX: Performance improvement: Merge Z write and color writes TLB
1978          * uniform setup
1979          */
1980         for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++)
1981                 vir_emit_tlb_color_write(c, rt);
1982 }
1983 
1984 static inline void
1985 vir_VPM_WRITE_indirect(struct v3d_compile *c,
1986                        struct qreg val,
1987                        struct qreg vpm_index,
1988                        bool uniform_vpm_index)
1989 {
1990         if (uniform_vpm_index)
1991                 vir_STVPMV(c, vpm_index, val);
1992         else
1993                 vir_STVPMD(c, vpm_index, val);
1994 }
1995 
1996 static void
1997 vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
1998 {
1999         vir_VPM_WRITE_indirect(c, val,
2000                                vir_uniform_ui(c, vpm_index), true);
2001 }
2002 
2003 static void
2004 emit_vert_end(struct v3d_compile *c)
2005 {
2006         /* GFXH-1684: VPM writes need to be complete by the end of the shader.
2007          */
2008         if (c->devinfo->ver == 42)
2009                 vir_VPMWT(c);
2010 }
2011 
2012 static void
2013 emit_geom_end(struct v3d_compile *c)
2014 {
2015         /* GFXH-1684: VPM writes need to be complete by the end of the shader.
2016          */
2017         if (c->devinfo->ver == 42)
2018                 vir_VPMWT(c);
2019 }
2020 
2021 static bool
2022 mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
2023                        unsigned bit_size,
2024                        unsigned num_components,
2025                        nir_intrinsic_instr *low,
2026                        nir_intrinsic_instr *high,
2027                        void *data)
2028 {
2029         /* TMU general access only supports 32-bit vectors */
2030         if (bit_size > 32)
2031                 return false;
2032 
2033         if ((bit_size == 8 || bit_size == 16) && num_components > 1)
2034                 return false;
2035 
2036         if (align_mul % 4 != 0 || align_offset % 4 != 0)
2037                 return false;
2038 
2039         /* Vector accesses wrap at 16-byte boundaries so we can't vectorize
2040          * if the resulting vector crosses a 16-byte boundary.
2041          */
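        /* For example, a 4-component 32-bit access with align_mul = 16 and
         * align_offset = 4 would cover bytes 4..19 of a 16-byte window and is
         * rejected, while the same access at align_offset = 0 fits exactly
         * and is allowed.
         */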
2042         assert(util_is_power_of_two_nonzero(align_mul));
2043         align_mul = MIN2(align_mul, 16);
2044         align_offset &= 0xf;
2045         if (16 - align_mul + align_offset + num_components * 4 > 16)
2046                 return false;
2047 
2048         return true;
2049 }
2050 
2051 void
2052 v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
2053 {
2054         bool progress;
2055         unsigned lower_flrp =
2056                 (s->options->lower_flrp16 ? 16 : 0) |
2057                 (s->options->lower_flrp32 ? 32 : 0) |
2058                 (s->options->lower_flrp64 ? 64 : 0);
2059 
2060         do {
2061                 progress = false;
2062 
2063                 NIR_PASS(progress, s, nir_split_array_vars, nir_var_function_temp);
2064                 NIR_PASS(progress, s, nir_shrink_vec_array_vars, nir_var_function_temp);
2065                 NIR_PASS(progress, s, nir_opt_deref);
2066 
2067                 NIR_PASS(progress, s, nir_lower_vars_to_ssa);
2068                 if (!s->info.var_copies_lowered) {
2069                         /* Only run this pass if nir_lower_var_copies was not called
2070                          * yet. That would lower away any copy_deref instructions and we
2071                          * don't want to introduce any more.
2072                          */
2073                         NIR_PASS(progress, s, nir_opt_find_array_copies);
2074                 }
2075 
2076                 NIR_PASS(progress, s, nir_opt_copy_prop_vars);
2077                 NIR_PASS(progress, s, nir_opt_dead_write_vars);
2078                 NIR_PASS(progress, s, nir_opt_combine_stores, nir_var_all);
2079 
2080                 NIR_PASS(progress, s, nir_remove_dead_variables,
2081                          (nir_variable_mode)(nir_var_function_temp |
2082                                              nir_var_shader_temp |
2083                                              nir_var_mem_shared),
2084                          NULL);
2085 
2086                 NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
2087                 NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);
2088                 NIR_PASS(progress, s, nir_copy_prop);
2089                 NIR_PASS(progress, s, nir_opt_remove_phis);
2090                 NIR_PASS(progress, s, nir_opt_dce);
2091                 NIR_PASS(progress, s, nir_opt_dead_cf);
2092                 NIR_PASS(progress, s, nir_opt_cse);
2093                 NIR_PASS(progress, s, nir_opt_peephole_select, 0, false, false);
2094                 NIR_PASS(progress, s, nir_opt_peephole_select, 24, true, true);
2095                 NIR_PASS(progress, s, nir_opt_algebraic);
2096                 NIR_PASS(progress, s, nir_opt_constant_folding);
2097 
2098                 NIR_PASS(progress, s, nir_opt_intrinsics);
2099                 NIR_PASS(progress, s, nir_opt_idiv_const, 32);
2100                 NIR_PASS(progress, s, nir_lower_alu);
2101 
2102                 if (nir_opt_loop(s)) {
2103                    progress = true;
2104                    NIR_PASS(progress, s, nir_copy_prop);
2105                    NIR_PASS(progress, s, nir_opt_dce);
2106                 }
2107 
2108                 NIR_PASS(progress, s, nir_opt_conditional_discard);
2109 
2110                 NIR_PASS(progress, s, nir_opt_remove_phis);
2111                 NIR_PASS(progress, s, nir_opt_if, false);
2112                 if (c && !c->disable_gcm) {
2113                         bool local_progress = false;
2114                         NIR_PASS(local_progress, s, nir_opt_gcm, false);
2115                         c->gcm_progress |= local_progress;
2116                         progress |= local_progress;
2117                 }
2118 
2119                 /* Note that vectorization may undo the load/store scalarization
2120                  * pass we run for non 32-bit TMU general load/store by
2121                  * converting, for example, 2 consecutive 16-bit loads into a
2122                  * single 32-bit load. This is fine (and desirable) as long as
2123                  * the resulting 32-bit load meets 32-bit alignment requirements,
2124                  * which mem_vectorize_callback() should be enforcing.
2125                  */
2126                 nir_load_store_vectorize_options vectorize_opts = {
2127                         .modes = nir_var_mem_ssbo | nir_var_mem_ubo |
2128                                  nir_var_mem_push_const | nir_var_mem_shared |
2129                                  nir_var_mem_global,
2130                         .callback = mem_vectorize_callback,
2131                         .robust_modes = 0,
2132                 };
2133                 bool vectorize_progress = false;
2134 
2135 
2136                 /* This requires that we have called
2137                  * nir_lower_vars_to_explicit_types / nir_lower_explicit_io
2138                  * first, which we may not have done yet if we call here too
2139                  * early during NIR pre-processing. We can detect this because
2140                  * in that case we won't have a compile object.
2141                  */
2142                 if (c) {
2143                         NIR_PASS(vectorize_progress, s, nir_opt_load_store_vectorize,
2144                                  &vectorize_opts);
2145                         if (vectorize_progress) {
2146                                 NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
2147                                 NIR_PASS(progress, s, nir_lower_pack);
2148                                 progress = true;
2149                         }
2150                 }
2151 
2152                 if (lower_flrp != 0) {
2153                         bool lower_flrp_progress = false;
2154 
2155                         NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,
2156                                  lower_flrp,
2157                                  false /* always_precise */);
2158                         if (lower_flrp_progress) {
2159                                 NIR_PASS(progress, s, nir_opt_constant_folding);
2160                                 progress = true;
2161                         }
2162 
2163                         /* Nothing should rematerialize any flrps, so we only
2164                          * need to do this lowering once.
2165                          */
2166                         lower_flrp = 0;
2167                 }
2168 
2169                 NIR_PASS(progress, s, nir_opt_undef);
2170                 NIR_PASS(progress, s, nir_lower_undef_to_zero);
2171 
2172                 if (c && !c->disable_loop_unrolling &&
2173                     s->options->max_unroll_iterations > 0) {
2174                        bool local_progress = false;
2175                        NIR_PASS(local_progress, s, nir_opt_loop_unroll);
2176                        c->unrolled_any_loops |= local_progress;
2177                        progress |= local_progress;
2178                 }
2179         } while (progress);
2180 
2181         nir_move_options sink_opts =
2182                 nir_move_const_undef | nir_move_comparisons | nir_move_copies |
2183                 nir_move_load_ubo | nir_move_load_ssbo | nir_move_load_uniform;
2184         NIR_PASS(progress, s, nir_opt_sink, sink_opts);
2185 }
2186 
2187 static int
2188 driver_location_compare(const nir_variable *a, const nir_variable *b)
2189 {
2190         return a->data.driver_location == b->data.driver_location ?
2191                a->data.location_frac - b->data.location_frac :
2192                a->data.driver_location - b->data.driver_location;
2193 }
2194 
2195 static struct qreg
2196 ntq_emit_vpm_read(struct v3d_compile *c,
2197                   uint32_t *num_components_queued,
2198                   uint32_t *remaining,
2199                   uint32_t vpm_index)
2200 {
2201         return vir_LDVPMV_IN(c,
2202                              vir_uniform_ui(c,
2203                                             (*num_components_queued)++));
2204 }
2205 
2206 static void
2207 ntq_setup_vs_inputs(struct v3d_compile *c)
2208 {
2209         /* Figure out how many components of each vertex attribute the shader
2210          * uses.  Each variable should have been split to individual
2211          * components and unused ones DCEed.  The vertex fetcher will load
2212          * from the start of the attribute to the number of components we
2213          * declare we need in c->vattr_sizes[].
2214          *
2215          * BGRA vertex attributes are a bit special: since we implement these
2216          * as RGBA swapping R/B components we always need at least 3 components
2217          * if component 0 is read.
2218          */
2219         nir_foreach_shader_in_variable(var, c->s) {
2220                 /* No VS attribute array support. */
2221                 assert(MAX2(glsl_get_length(var->type), 1) == 1);
2222 
2223                 unsigned loc = var->data.driver_location;
2224                 int start_component = var->data.location_frac;
2225                 int num_components = glsl_get_components(var->type);
2226 
2227                 c->vattr_sizes[loc] = MAX2(c->vattr_sizes[loc],
2228                                            start_component + num_components);
2229 
2230                 /* Handle BGRA inputs */
2231                 if (start_component == 0 &&
2232                     c->vs_key->va_swap_rb_mask & (1 << var->data.location)) {
2233                         c->vattr_sizes[loc] = MAX2(3, c->vattr_sizes[loc]);
2234                 }
2235         }
2236 
2237         unsigned num_components = 0;
2238         uint32_t vpm_components_queued = 0;
2239         bool uses_iid = BITSET_TEST(c->s->info.system_values_read,
2240                                     SYSTEM_VALUE_INSTANCE_ID) ||
2241                         BITSET_TEST(c->s->info.system_values_read,
2242                                     SYSTEM_VALUE_INSTANCE_INDEX);
2243         bool uses_biid = BITSET_TEST(c->s->info.system_values_read,
2244                                      SYSTEM_VALUE_BASE_INSTANCE);
2245         bool uses_vid = BITSET_TEST(c->s->info.system_values_read,
2246                                     SYSTEM_VALUE_VERTEX_ID) ||
2247                         BITSET_TEST(c->s->info.system_values_read,
2248                                     SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
2249 
2250         num_components += uses_iid;
2251         num_components += uses_biid;
2252         num_components += uses_vid;
2253 
2254         for (int i = 0; i < ARRAY_SIZE(c->vattr_sizes); i++)
2255                 num_components += c->vattr_sizes[i];
2256 
2257         if (uses_iid) {
2258                 c->iid = ntq_emit_vpm_read(c, &vpm_components_queued,
2259                                            &num_components, ~0);
2260         }
2261 
2262         if (uses_biid) {
2263                 c->biid = ntq_emit_vpm_read(c, &vpm_components_queued,
2264                                             &num_components, ~0);
2265         }
2266 
2267         if (uses_vid) {
2268                 c->vid = ntq_emit_vpm_read(c, &vpm_components_queued,
2269                                            &num_components, ~0);
2270         }
2271 
2272         /* The actual loads will happen directly in nir_intrinsic_load_input
2273          */
2274         return;
2275 }
2276 
2277 static bool
2278 program_reads_point_coord(struct v3d_compile *c)
2279 {
2280         nir_foreach_shader_in_variable(var, c->s) {
2281                 if (util_varying_is_point_coord(var->data.location,
2282                                                 c->fs_key->point_sprite_mask)) {
2283                         return true;
2284                 }
2285         }
2286 
2287         return false;
2288 }
2289 
2290 static void
2291 ntq_setup_gs_inputs(struct v3d_compile *c)
2292 {
2293         nir_sort_variables_with_modes(c->s, driver_location_compare,
2294                                       nir_var_shader_in);
2295 
2296         nir_foreach_shader_in_variable(var, c->s) {
2297                 /* All GS inputs are arrays with as many entries as vertices
2298                  * in the input primitive, but here we only care about the
2299                  * per-vertex input type.
2300                  */
2301                 assert(glsl_type_is_array(var->type));
2302                 const struct glsl_type *type = glsl_get_array_element(var->type);
2303                 unsigned var_len = glsl_count_vec4_slots(type, false, false);
2304                 unsigned loc = var->data.driver_location;
2305 
2306                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2307                                   (loc + var_len) * 4);
2308 
2309                 if (var->data.compact) {
2310                         for (unsigned j = 0; j < var_len; j++) {
2311                                 unsigned input_idx = c->num_inputs++;
2312                                 unsigned loc_frac = var->data.location_frac + j;
2313                                 unsigned loc = var->data.location + loc_frac / 4;
2314                                 unsigned comp = loc_frac % 4;
2315                                 c->input_slots[input_idx] =
2316                                         v3d_slot_from_slot_and_component(loc, comp);
2317                         }
2318                        continue;
2319                 }
2320 
2321                 for (unsigned j = 0; j < var_len; j++) {
2322                         unsigned num_elements =
2323                                 glsl_type_is_struct(glsl_without_array(type)) ?
2324                                 4 : glsl_get_vector_elements(type);
2325                         for (unsigned k = 0; k < num_elements; k++) {
2326                                 unsigned chan = var->data.location_frac + k;
2327                                 unsigned input_idx = c->num_inputs++;
2328                                 struct v3d_varying_slot slot =
2329                                         v3d_slot_from_slot_and_component(var->data.location + j, chan);
2330                                 c->input_slots[input_idx] = slot;
2331                         }
2332                 }
2333         }
2334 }
2335 
2336 
2337 static void
2338 ntq_setup_fs_inputs(struct v3d_compile *c)
2339 {
2340         nir_sort_variables_with_modes(c->s, driver_location_compare,
2341                                       nir_var_shader_in);
2342 
2343         nir_foreach_shader_in_variable(var, c->s) {
2344                 unsigned var_len = glsl_count_vec4_slots(var->type, false, false);
2345                 unsigned loc = var->data.driver_location;
2346 
2347                 uint32_t inputs_array_size = c->inputs_array_size;
2348                 uint32_t inputs_array_required_size = (loc + var_len) * 4;
2349                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2350                                   inputs_array_required_size);
2351                 resize_interp_array(c, &c->interp, &inputs_array_size,
2352                                     inputs_array_required_size);
2353 
2354                 if (var->data.location == VARYING_SLOT_POS) {
2355                         emit_fragcoord_input(c, loc);
2356                 } else if (var->data.location == VARYING_SLOT_PRIMITIVE_ID &&
2357                            !c->fs_key->has_gs) {
2358                         /* If the fragment shader reads gl_PrimitiveID and we
2359                          * don't have a geometry shader in the pipeline to write
2360                          * it then we program the hardware to inject it as
2361                          * an implicit varying. Take it from there.
2362                          */
2363                         c->inputs[loc * 4] = c->primitive_id;
2364                 } else if (util_varying_is_point_coord(var->data.location,
2365                                                        c->fs_key->point_sprite_mask)) {
2366                         c->inputs[loc * 4 + 0] = c->point_x;
2367                         c->inputs[loc * 4 + 1] = c->point_y;
2368                 } else if (var->data.compact) {
2369                         for (int j = 0; j < var_len; j++)
2370                                 emit_compact_fragment_input(c, loc, var, j);
2371                 } else if (glsl_type_is_struct(glsl_without_array(var->type))) {
2372                         for (int j = 0; j < var_len; j++) {
2373                            emit_fragment_input(c, loc, var, j, 4);
2374                         }
2375                 } else {
2376                         for (int j = 0; j < var_len; j++) {
2377                                 emit_fragment_input(c, loc, var, j, glsl_get_vector_elements(var->type));
2378                         }
2379                 }
2380         }
2381 }
2382 
2383 static void
2384 ntq_setup_outputs(struct v3d_compile *c)
2385 {
2386         if (c->s->info.stage != MESA_SHADER_FRAGMENT)
2387                 return;
2388 
2389         nir_foreach_shader_out_variable(var, c->s) {
2390                 assert(glsl_type_is_vector_or_scalar(var->type));
2391                 unsigned loc = var->data.driver_location * 4;
2392 
2393                 for (int i = 0; i < 4 - var->data.location_frac; i++) {
2394                         add_output(c, loc + var->data.location_frac + i,
2395                                    var->data.location,
2396                                    var->data.location_frac + i);
2397                 }
2398 
2399                 switch (var->data.location) {
2400                 case FRAG_RESULT_COLOR:
2401                         for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++)
2402                                 c->output_color_var[i] = var;
2403                         break;
2404                 case FRAG_RESULT_DATA0:
2405                 case FRAG_RESULT_DATA1:
2406                 case FRAG_RESULT_DATA2:
2407                 case FRAG_RESULT_DATA3:
2408                 case FRAG_RESULT_DATA4:
2409                 case FRAG_RESULT_DATA5:
2410                 case FRAG_RESULT_DATA6:
2411                 case FRAG_RESULT_DATA7:
2412                         c->output_color_var[var->data.location -
2413                                             FRAG_RESULT_DATA0] = var;
2414                         break;
2415                 case FRAG_RESULT_DEPTH:
2416                         c->output_position_index = loc;
2417                         break;
2418                 case FRAG_RESULT_SAMPLE_MASK:
2419                         c->output_sample_mask_index = loc;
2420                         break;
2421                 }
2422         }
2423 }
2424 
2425 /**
2426  * Sets up the mapping from nir_register to struct qreg *.
2427  *
2428  * Each nir_register gets a struct qreg per 32-bit component being stored.
2429  */
2430 static void
2431 ntq_setup_registers(struct v3d_compile *c, nir_function_impl *impl)
2432 {
2433         nir_foreach_reg_decl(decl, impl) {
2434                 unsigned num_components = nir_intrinsic_num_components(decl);
2435                 unsigned array_len = nir_intrinsic_num_array_elems(decl);
2436                 array_len = MAX2(array_len, 1);
2437                 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
2438                                                   array_len * num_components);
2439 
2440                 nir_def *nir_reg = &decl->def;
2441                 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
2442 
2443                 for (int i = 0; i < array_len * num_components; i++)
2444                         qregs[i] = vir_get_temp(c);
2445         }
2446 }
2447 
2448 static void
2449 ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
2450 {
2451         /* XXX perf: Experiment with using immediate loads to avoid having
2452          * these end up in the uniform stream.  Watch out for breaking the
2453          * small immediates optimization in the process!
2454          */
2455         struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
2456         for (int i = 0; i < instr->def.num_components; i++)
2457                 qregs[i] = vir_uniform_ui(c, instr->value[i].u32);
2458 
2459         _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
2460 }
2461 
2462 static void
2463 ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)
2464 {
2465         unsigned image_index = nir_src_as_uint(instr->src[0]);
2466         bool is_array = nir_intrinsic_image_array(instr);
2467 
2468         assert(nir_src_as_uint(instr->src[1]) == 0);
2469 
2470         ntq_store_def(c, &instr->def, 0,
2471                        vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index));
2472         if (instr->num_components > 1) {
2473                 ntq_store_def(c, &instr->def, 1,
2474                               vir_uniform(c,
2475                                           instr->num_components == 2 && is_array ?
2476                                                   QUNIFORM_IMAGE_ARRAY_SIZE :
2477                                                   QUNIFORM_IMAGE_HEIGHT,
2478                                           image_index));
2479         }
2480         if (instr->num_components > 2) {
2481                 ntq_store_def(c, &instr->def, 2,
2482                               vir_uniform(c,
2483                                           is_array ?
2484                                           QUNIFORM_IMAGE_ARRAY_SIZE :
2485                                           QUNIFORM_IMAGE_DEPTH,
2486                                           image_index));
2487         }
2488 }
2489 
2490 static void
2491 vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
2492 {
2493         assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
2494 
2495         int rt = nir_src_as_uint(instr->src[0]);
2496         assert(rt < V3D_MAX_DRAW_BUFFERS);
2497 
2498         int sample_index = nir_intrinsic_base(instr);
2499         assert(sample_index < V3D_MAX_SAMPLES);
2500 
2501         int component = nir_intrinsic_component(instr);
2502         assert(component < 4);
2503 
2504         /* We need to emit our TLB reads after we have acquired the scoreboard
2505          * lock, or the GPU will hang. Usually, we do our scoreboard locking on
2506          * the last thread switch to improve parallelism; however, that is only
2507          * guaranteed to happen before the tlb color writes.
2508          *
2509          * To fix that, we make sure we always emit a thread switch before the
2510          * first tlb color read. If that happens to be the last thread switch
2511          * we emit, then everything is fine, but otherwise, if any code after
2512          * this point needs to emit additional thread switches, then we will
2513          * switch the strategy to locking the scoreboard on the first thread
2514          * switch instead -- see vir_emit_thrsw().
2515          */
2516         if (!c->emitted_tlb_load) {
2517                 if (!c->last_thrsw_at_top_level)
2518                         vir_emit_thrsw(c);
2519 
2520                 c->emitted_tlb_load = true;
2521         }
2522 
2523         struct qreg *color_reads_for_sample =
2524                 &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4];
2525 
2526         if (color_reads_for_sample[component].file == QFILE_NULL) {
2527                 enum pipe_format rt_format = c->fs_key->color_fmt[rt].format;
2528                 int num_components =
2529                         util_format_get_nr_components(rt_format);
2530 
2531                 const bool swap_rb = c->fs_key->swap_color_rb & (1 << rt);
2532                 if (swap_rb)
2533                         num_components = MAX2(num_components, 3);
2534 
2535                 nir_variable *var = c->output_color_var[rt];
2536                 enum glsl_base_type type = glsl_get_base_type(var->type);
2537 
2538                 bool is_int_format = type == GLSL_TYPE_INT ||
2539                                      type == GLSL_TYPE_UINT;
2540 
2541                 bool is_32b_tlb_format = is_int_format ||
2542                                          (c->fs_key->f32_color_rb & (1 << rt));
2543 
2544                 int num_samples = c->fs_key->msaa ? V3D_MAX_SAMPLES : 1;
2545 
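                /* Build the TLB read configuration word: the low byte
                 * selects per-sample vs. per-pixel mode, encodes the render
                 * target index and the returned data type / vector size,
                 * while the remaining bytes are left as 0xff.
                 */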
2546                 uint32_t conf = 0xffffff00;
2547                 conf |= c->fs_key->msaa ? TLB_SAMPLE_MODE_PER_SAMPLE :
2548                                           TLB_SAMPLE_MODE_PER_PIXEL;
2549                 conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
2550 
2551                 if (is_32b_tlb_format) {
2552                         /* The F32 vs I32 distinction was dropped in 4.2. */
2553                         conf |= (c->devinfo->ver < 42 && is_int_format) ?
2554                                 TLB_TYPE_I32_COLOR : TLB_TYPE_F32_COLOR;
2555 
2556                         conf |= ((num_components - 1) <<
2557                                  TLB_VEC_SIZE_MINUS_1_SHIFT);
2558                 } else {
2559                         conf |= TLB_TYPE_F16_COLOR;
2560                         conf |= TLB_F16_SWAP_HI_LO;
2561 
2562                         if (num_components >= 3)
2563                                 conf |= TLB_VEC_SIZE_4_F16;
2564                         else
2565                                 conf |= TLB_VEC_SIZE_2_F16;
2566                 }
2567 
2568 
2569                 for (int i = 0; i < num_samples; i++) {
2570                         struct qreg r, g, b, a;
2571                         if (is_32b_tlb_format) {
2572                                 r = conf != 0xffffffff && i == 0 ?
2573                                         vir_TLBU_COLOR_READ(c, conf) :
2574                                         vir_TLB_COLOR_READ(c);
2575                                 if (num_components >= 2)
2576                                         g = vir_TLB_COLOR_READ(c);
2577                                 if (num_components >= 3)
2578                                         b = vir_TLB_COLOR_READ(c);
2579                                 if (num_components >= 4)
2580                                         a = vir_TLB_COLOR_READ(c);
2581                         } else {
2582                                 struct qreg rg = conf != 0xffffffff && i == 0 ?
2583                                         vir_TLBU_COLOR_READ(c, conf) :
2584                                         vir_TLB_COLOR_READ(c);
2585                                 r = vir_FMOV(c, rg);
2586                                 vir_set_unpack(c->defs[r.index], 0,
2587                                                V3D_QPU_UNPACK_L);
2588                                 g = vir_FMOV(c, rg);
2589                                 vir_set_unpack(c->defs[g.index], 0,
2590                                                V3D_QPU_UNPACK_H);
2591 
2592                                 if (num_components > 2) {
2593                                     struct qreg ba = vir_TLB_COLOR_READ(c);
2594                                     b = vir_FMOV(c, ba);
2595                                     vir_set_unpack(c->defs[b.index], 0,
2596                                                    V3D_QPU_UNPACK_L);
2597                                     a = vir_FMOV(c, ba);
2598                                     vir_set_unpack(c->defs[a.index], 0,
2599                                                    V3D_QPU_UNPACK_H);
2600                                 }
2601                         }
2602 
2603                         struct qreg *color_reads =
2604                                 &c->color_reads[(rt * V3D_MAX_SAMPLES + i) * 4];
2605 
2606                         color_reads[0] = swap_rb ? b : r;
2607                         if (num_components >= 2)
2608                                 color_reads[1] = g;
2609                         if (num_components >= 3)
2610                                 color_reads[2] = swap_rb ? r : b;
2611                         if (num_components >= 4)
2612                                 color_reads[3] = a;
2613                 }
2614         }
2615 
2616         assert(color_reads_for_sample[component].file != QFILE_NULL);
2617         ntq_store_def(c, &instr->def, 0,
2618                       vir_MOV(c, color_reads_for_sample[component]));
2619 }
2620 
2621 static bool
2622 ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr);
2623 
2624 static bool
2625 try_emit_uniform(struct v3d_compile *c,
2626                  int offset,
2627                  int num_components,
2628                  nir_def *def,
2629                  enum quniform_contents contents)
2630 {
2631         /* Even though ldunif is strictly 32-bit we can still use it
2632          * to load scalar 8-bit/16-bit uniforms so long as their offset
2633          * is 32-bit aligned. In this case, ldunif would still load
2634          * a full 32 bits into the destination with the 8-bit/16-bit uniform
2635          * data in the LSB and garbage in the MSB, but that is fine
2636          * because we should only be accessing the valid bits of the
2637          * destination.
2638          *
2639          * FIXME: if in the future we improve our register allocator to
2640          * pack 2 16-bit variables in the MSB and LSB of the same
2641          * register then this optimization would not be valid as is,
2642          * since the load clobbers the MSB.
2643          */
2644         if (offset % 4 != 0)
2645                 return false;
2646 
2647         /* We need dwords */
2648         offset = offset / 4;
2649 
2650         for (int i = 0; i < num_components; i++) {
2651                 ntq_store_def(c, def, i, vir_uniform(c, contents, offset + i));
2652         }
2653 
2654         return true;
2655 }
2656 
2657 static void
2658 ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
2659 {
2660         /* We scalarize general TMU access for anything that is not 32-bit. */
2661         assert(instr->def.bit_size == 32 ||
2662                instr->num_components == 1);
2663 
2664         /* Try to emit ldunif if possible, otherwise fall back to general TMU */
2665         if (nir_src_is_const(instr->src[0])) {
2666                 int offset = (nir_intrinsic_base(instr) +
2667                              nir_src_as_uint(instr->src[0]));
2668 
2669                 if (try_emit_uniform(c, offset, instr->num_components,
2670                                      &instr->def, QUNIFORM_UNIFORM)) {
2671                         return;
2672                 }
2673         }
2674 
2675         if (!ntq_emit_load_unifa(c, instr)) {
2676                 ntq_emit_tmu_general(c, instr, false, false);
2677                 c->has_general_tmu_load = true;
2678         }
2679 }
2680 
2681 static bool
2682 ntq_emit_inline_ubo_load(struct v3d_compile *c, nir_intrinsic_instr *instr)
2683 {
2684         if (c->compiler->max_inline_uniform_buffers <= 0)
2685                 return false;
2686 
2687         /* Regular UBOs start after inline UBOs */
2688         uint32_t index = nir_src_as_uint(instr->src[0]);
2689         if (index >= c->compiler->max_inline_uniform_buffers)
2690                 return false;
2691 
2692         /* We scalarize general TMU access for anything that is not 32-bit */
2693         assert(instr->def.bit_size == 32 ||
2694                instr->num_components == 1);
2695 
2696         if (nir_src_is_const(instr->src[1])) {
2697                 int offset = nir_src_as_uint(instr->src[1]);
2698                 if (try_emit_uniform(c, offset, instr->num_components,
2699                                      &instr->def,
2700                                      QUNIFORM_INLINE_UBO_0 + index)) {
2701                         return true;
2702                 }
2703         }
2704 
2705         /* Fallback to regular UBO load */
2706         return false;
2707 }
2708 
2709 static void
2710 ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
2711 {
2712         /* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset).
2713          *
2714          * Right now the driver sets PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR even
2715          * if we don't support non-uniform offsets because we also set the
2716          * lower_all_io_to_temps option in the NIR compiler. This ensures that
2717          * any indirect indexing on in/out variables is turned into indirect
2718          * indexing on temporary variables instead, that we handle by lowering
2719          * to scratch. If we implement non-uniform offset here we might be able
2720          * to avoid the temp and scratch lowering, which involves copying from
2721          * the input to the temp variable, possibly making code more optimal.
2722          */
2723         unsigned offset =
2724                 nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]);
2725 
2726         if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
2727                /* Emit the LDVPM directly now, rather than at the top
2728                 * of the shader like we did for V3D 3.x (which needs
2729                 * vpmsetup when not just taking the next offset).
2730                 *
2731                 * Note that delaying like this may introduce stalls,
2732                 * as LDVPMV takes a minimum of 1 instruction but may
2733                 * be slower if the VPM unit is busy with another QPU.
2734                 */
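               /* Compute the scalar VPM offset for this attribute: any
                * system values the shader reads (instance ID, base
                * instance, vertex ID) come first, followed by the
                * components of the preceding attributes.
                */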
2735                int index = 0;
2736                if (BITSET_TEST(c->s->info.system_values_read,
2737                                SYSTEM_VALUE_INSTANCE_ID)) {
2738                       index++;
2739                }
2740                if (BITSET_TEST(c->s->info.system_values_read,
2741                                SYSTEM_VALUE_BASE_INSTANCE)) {
2742                       index++;
2743                }
2744                if (BITSET_TEST(c->s->info.system_values_read,
2745                                SYSTEM_VALUE_VERTEX_ID)) {
2746                       index++;
2747                }
2748                for (int i = 0; i < offset; i++)
2749                       index += c->vattr_sizes[i];
2750                index += nir_intrinsic_component(instr);
2751                for (int i = 0; i < instr->num_components; i++) {
2752                       struct qreg vpm_offset = vir_uniform_ui(c, index++);
2753                       ntq_store_def(c, &instr->def, i,
2754                                     vir_LDVPMV_IN(c, vpm_offset));
2755                 }
2756         } else {
2757                 for (int i = 0; i < instr->num_components; i++) {
2758                         int comp = nir_intrinsic_component(instr) + i;
2759                         struct qreg input = c->inputs[offset * 4 + comp];
2760                         ntq_store_def(c, &instr->def, i, vir_MOV(c, input));
2761 
2762                         if (c->s->info.stage == MESA_SHADER_FRAGMENT &&
2763                             input.file == c->payload_z.file &&
2764                             input.index == c->payload_z.index) {
2765                                 c->reads_z = true;
2766                         }
2767                 }
2768         }
2769 }
2770 
2771 static void
2772 ntq_emit_per_sample_color_write(struct v3d_compile *c,
2773                                 nir_intrinsic_instr *instr)
2774 {
2775         assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d);
2776 
2777         unsigned rt = nir_src_as_uint(instr->src[1]);
2778         assert(rt < V3D_MAX_DRAW_BUFFERS);
2779 
2780         unsigned sample_idx = nir_intrinsic_base(instr);
2781         assert(sample_idx < V3D_MAX_SAMPLES);
2782 
2783         unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4;
2784         for (int i = 0; i < instr->num_components; i++) {
2785                 c->sample_colors[offset + i] =
2786                         vir_MOV(c, ntq_get_src(c, instr->src[0], i));
2787         }
2788 }
2789 
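/* Color writes are only recorded into c->outputs here; as noted in
 * emit_store_output_gs below, FS outputs are emitted once at the end of the
 * program, which is where the actual TLB writes happen.
 */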
2790 static void
2791 ntq_emit_color_write(struct v3d_compile *c,
2792                      nir_intrinsic_instr *instr)
2793 {
2794         unsigned offset = (nir_intrinsic_base(instr) +
2795                            nir_src_as_uint(instr->src[1])) * 4 +
2796                           nir_intrinsic_component(instr);
2797         for (int i = 0; i < instr->num_components; i++) {
2798                 c->outputs[offset + i] =
2799                         vir_MOV(c, ntq_get_src(c, instr->src[0], i));
2800         }
2801 }
2802 
2803 static void
2804 emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr)
2805 {
2806         assert(instr->num_components == 1);
2807 
2808         struct qreg offset = ntq_get_src(c, instr->src[1], 0);
2809 
2810         uint32_t base_offset = nir_intrinsic_base(instr);
2811 
2812         if (base_offset)
2813                 offset = vir_ADD(c, vir_uniform_ui(c, base_offset), offset);
2814 
2815         /* Usually, for VS or FS, we only emit outputs once at program end so
2816          * our VPM writes are never in non-uniform control flow, but this
2817          * is not true for GS, where we are emitting multiple vertices.
2818          */
2819         if (vir_in_nonuniform_control_flow(c)) {
2820                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
2821                            V3D_QPU_PF_PUSHZ);
2822         }
2823 
2824         struct qreg val = ntq_get_src(c, instr->src[0], 0);
2825 
2826         /* The offset isn’t necessarily dynamically uniform for a geometry
2827          * shader. This can happen if the shader sometimes doesn’t emit one of
2828          * the vertices. In that case subsequent vertices will be written to
2829          * different offsets in the VPM and we need to use the scatter write
2830          * instruction to have a different offset for each lane.
2831          */
2832         bool is_uniform_offset =
2833                 !vir_in_nonuniform_control_flow(c) &&
2834                 !nir_src_is_divergent(instr->src[1]);
2835         vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset);
2836 
2837         if (vir_in_nonuniform_control_flow(c)) {
2838                 struct qinst *last_inst =
2839                         (struct qinst *)c->cur_block->instructions.prev;
2840                 vir_set_cond(last_inst, V3D_QPU_COND_IFA);
2841         }
2842 }
2843 
2844 static void
2845 emit_store_output_vs(struct v3d_compile *c, nir_intrinsic_instr *instr)
2846 {
2847         assert(c->s->info.stage == MESA_SHADER_VERTEX);
2848         assert(instr->num_components == 1);
2849 
2850         uint32_t base = nir_intrinsic_base(instr);
2851         struct qreg val = ntq_get_src(c, instr->src[0], 0);
2852 
2853         if (nir_src_is_const(instr->src[1])) {
2854                 vir_VPM_WRITE(c, val,
2855                               base + nir_src_as_uint(instr->src[1]));
2856         } else {
2857                 struct qreg offset = vir_ADD(c,
2858                                              ntq_get_src(c, instr->src[1], 1),
2859                                              vir_uniform_ui(c, base));
2860                 bool is_uniform_offset =
2861                         !vir_in_nonuniform_control_flow(c) &&
2862                         !nir_src_is_divergent(instr->src[1]);
2863                 vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset);
2864         }
2865 }
2866 
2867 static void
2868 ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr)
2869 {
2870         if (c->s->info.stage == MESA_SHADER_FRAGMENT)
2871                ntq_emit_color_write(c, instr);
2872         else if (c->s->info.stage == MESA_SHADER_GEOMETRY)
2873                emit_store_output_gs(c, instr);
2874         else
2875                emit_store_output_vs(c, instr);
2876 }
2877 
2878 /**
2879  * This implementation is based on v3d_sample_{x,y}_offset() from
2880  * v3d_sample_offset.h.
2881  */
2882 static void
2883 ntq_get_sample_offset(struct v3d_compile *c, struct qreg sample_idx,
2884                       struct qreg *sx, struct qreg *sy)
2885 {
2886         sample_idx = vir_ITOF(c, sample_idx);
2887 
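        /* offset_x = -0.125 + 0.5 * sample_idx, wrapped back by 1.25 by the
         * FCMP/SEL pair below; offset_y = -0.375 + 0.25 * sample_idx.
         */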
2888         struct qreg offset_x =
2889                 vir_FADD(c, vir_uniform_f(c, -0.125f),
2890                             vir_FMUL(c, sample_idx,
2891                                         vir_uniform_f(c, 0.5f)));
2892         vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(),
2893                                     vir_uniform_f(c, 2.0f), sample_idx),
2894                    V3D_QPU_PF_PUSHC);
2895         offset_x = vir_SEL(c, V3D_QPU_COND_IFA,
2896                               vir_FSUB(c, offset_x, vir_uniform_f(c, 1.25f)),
2897                               offset_x);
2898 
2899         struct qreg offset_y =
2900                    vir_FADD(c, vir_uniform_f(c, -0.375f),
2901                                vir_FMUL(c, sample_idx,
2902                                            vir_uniform_f(c, 0.25f)));
2903         *sx = offset_x;
2904         *sy = offset_y;
2905 }
2906 
2907 /**
2908  * This implementation is based on get_centroid_offset() from fep.c.
2909  */
2910 static void
2911 ntq_get_barycentric_centroid(struct v3d_compile *c,
2912                              struct qreg *out_x,
2913                              struct qreg *out_y)
2914 {
2915         struct qreg sample_mask;
2916         if (c->output_sample_mask_index != -1)
2917                 sample_mask = c->outputs[c->output_sample_mask_index];
2918         else
2919                 sample_mask = vir_MSF(c);
2920 
2921         struct qreg i0 = vir_uniform_ui(c, 0);
2922         struct qreg i1 = vir_uniform_ui(c, 1);
2923         struct qreg i2 = vir_uniform_ui(c, 2);
2924         struct qreg i3 = vir_uniform_ui(c, 3);
2925         struct qreg i4 = vir_uniform_ui(c, 4);
2926         struct qreg i8 = vir_uniform_ui(c, 8);
2927 
2928         /* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */
2929         struct qreg F = vir_uniform_ui(c, 0);
2930         struct qreg T = vir_uniform_ui(c, ~0);
2931         struct qreg s0 = vir_AND(c, sample_mask, i1);
2932         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
2933         s0 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
2934         struct qreg s1 = vir_AND(c, sample_mask, i2);
2935         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
2936         s1 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
2937         struct qreg s2 = vir_AND(c, sample_mask, i4);
2938         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
2939         s2 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
2940         struct qreg s3 = vir_AND(c, sample_mask, i8);
2941         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ);
2942         s3 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
2943 
2944         /* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */
2945         struct qreg sample_idx = i3;
2946         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
2947         sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i1, sample_idx);
2948         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
2949         sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i2, sample_idx);
2950         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
2951         sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i0, sample_idx);
2952 
2953         /* Get offset at selected sample index */
2954         struct qreg offset_x, offset_y;
2955         ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
2956 
2957         /* Select pixel center [offset=(0,0)] if two opposing samples (or none)
2958          * are selected.
2959          */
2960         struct qreg s0_and_s3 = vir_AND(c, s0, s3);
2961         struct qreg s1_and_s2 = vir_AND(c, s1, s2);
2962 
2963         struct qreg use_center = vir_XOR(c, sample_mask, vir_uniform_ui(c, 0));
2964         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);
2965         use_center = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2966         use_center = vir_OR(c, use_center, s0_and_s3);
2967         use_center = vir_OR(c, use_center, s1_and_s2);
2968 
2969         struct qreg zero = vir_uniform_f(c, 0.0f);
2970         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);
2971         offset_x = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_x);
2972         offset_y = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_y);
2973 
2974         *out_x = offset_x;
2975         *out_y = offset_y;
2976 }
2977 
2978 static struct qreg
2979 ntq_emit_load_interpolated_input(struct v3d_compile *c,
2980                                  struct qreg p,
2981                                  struct qreg C,
2982                                  struct qreg offset_x,
2983                                  struct qreg offset_y,
2984                                  unsigned mode)
2985 {
2986         if (mode == INTERP_MODE_FLAT)
2987                 return C;
2988 
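        /* Re-interpolate at the requested offset using screen-space
         * derivatives: sample_offset_* is the position of the current
         * sample within its pixel, scaleX/Y is the distance from that
         * position to the pixel center plus the requested offset, and
         * pInterp adjusts the payload value by FDX/FDY times that distance.
         * For the default perspective-correct mode, W is adjusted the same
         * way and the result is pInterp * wInterp + C.
         */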
2989         struct qreg sample_offset_x =
2990                 vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));
2991         struct qreg sample_offset_y =
2992                 vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
2993 
2994         struct qreg scaleX =
2995                 vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_x),
2996                             offset_x);
2997         struct qreg scaleY =
2998                 vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_y),
2999                             offset_y);
3000 
3001         struct qreg pInterp =
3002                 vir_FADD(c, p, vir_FADD(c, vir_FMUL(c, vir_FDX(c, p), scaleX),
3003                                            vir_FMUL(c, vir_FDY(c, p), scaleY)));
3004 
3005         if (mode == INTERP_MODE_NOPERSPECTIVE)
3006                 return vir_FADD(c, pInterp, C);
3007 
3008         struct qreg w = c->payload_w;
3009         struct qreg wInterp =
3010                 vir_FADD(c, w, vir_FADD(c, vir_FMUL(c, vir_FDX(c, w), scaleX),
3011                                            vir_FMUL(c, vir_FDY(c, w), scaleY)));
3012 
3013         return vir_FADD(c, vir_FMUL(c, pInterp, wInterp), C);
3014 }
3015 
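/* Emits a NOP carrying the ldunifa signal: each ldunifa returns 32 bits from
 * the current unifa address and advances our tracked offset by 4 bytes.
 */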
3016 static void
3017 emit_ldunifa(struct v3d_compile *c, struct qreg *result)
3018 {
3019         struct qinst *ldunifa =
3020                 vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef);
3021         ldunifa->qpu.sig.ldunifa = true;
3022         if (result)
3023                 *result = vir_emit_def(c, ldunifa);
3024         else
3025                 vir_emit_nondef(c, ldunifa);
3026         c->current_unifa_offset += 4;
3027 }
3028 
3029 /* Checks if the value of a nir src is derived from a nir register */
3030 static bool
3031 nir_src_derived_from_reg(nir_src src)
3032 {
3033         nir_def *def = src.ssa;
3034         if (nir_load_reg_for_def(def))
3035                 return true;
3036 
3037         nir_instr *parent = def->parent_instr;
3038         switch (parent->type) {
3039         case nir_instr_type_alu: {
3040                 nir_alu_instr *alu = nir_instr_as_alu(parent);
3041                 int num_srcs = nir_op_infos[alu->op].num_inputs;
3042                 for (int i = 0; i < num_srcs; i++) {
3043                         if (nir_src_derived_from_reg(alu->src[i].src))
3044                                 return true;
3045                 }
3046                 return false;
3047         }
3048         case nir_instr_type_intrinsic: {
3049                 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
3050                 int num_srcs = nir_intrinsic_infos[intr->intrinsic].num_srcs;
3051                 for (int i = 0; i < num_srcs; i++) {
3052                         if (nir_src_derived_from_reg(intr->src[i]))
3053                                 return true;
3054                 }
3055                 return false;
3056         }
3057         case nir_instr_type_load_const:
3058         case nir_instr_type_undef:
3059                 return false;
3060         default:
3061                 /* By default we assume it may come from a register, the above
3062                  * cases should be able to handle the majority of situations
3063                  * though.
3064                  */
3065                 return true;
3066         };
3067 }
3068 
3069 static bool
3070 ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
3071 {
3072         assert(instr->intrinsic == nir_intrinsic_load_ubo ||
3073                instr->intrinsic == nir_intrinsic_load_ssbo ||
3074                instr->intrinsic == nir_intrinsic_load_uniform);
3075 
3076         bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform;
3077         bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo;
3078         bool is_ssbo = instr->intrinsic == nir_intrinsic_load_ssbo;
3079 
3080         /* Every ldunifa auto-increments the unifa address by 4 bytes, so our
3081          * current unifa offset is 4 bytes ahead of the offset of the last load.
3082          */
3083         static const int32_t max_unifa_skip_dist =
3084                 MAX_UNIFA_SKIP_DISTANCE - 4;
3085 
3086         /* We can only use unifa if the offset is uniform */
3087         nir_src offset = is_uniform ? instr->src[0] : instr->src[1];
3088         if (nir_src_is_divergent(offset))
3089                 return false;
3090 
3091         /* Emitting loads from unifa may not be safe under non-uniform control
3092          * flow. It seems the address that is used to write to the unifa
3093          * register is taken from the first lane and if that lane is disabled
3094          * by control flow then the value we read may be bogus and lead to
3095          * invalid memory accesses on follow-up ldunifa instructions. However,
3096          * ntq_store_def only emits conditional writes for nir registers, so
3097          * as long as we can be certain that the offset isn't derived from a
3098          * load_reg we should be fine.
3099          *
3100          * The following CTS test can be used to trigger the problem, which
3101          * causes a GMP violation in the sim without this check:
3102          * dEQP-VK.subgroups.ballot_broadcast.graphics.subgroupbroadcastfirst_int
3103          */
3104         if (vir_in_nonuniform_control_flow(c) &&
3105             nir_src_derived_from_reg(offset)) {
3106                 return false;
3107         }
3108 
3109         /* We can only use unifa with SSBOs if they are read-only. Otherwise
3110          * ldunifa won't see the shader writes to that address (possibly
3111          * because ldunifa doesn't read from the L2T cache).
3112          */
3113         if (is_ssbo && !(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE))
3114                 return false;
3115 
3116         /* Just as with SSBOs, we can't use ldunifa to read indirect uniforms
3117          * that may have been written to scratch using the TMU.
3118          */
3119         bool dynamic_src = !nir_src_is_const(offset);
3120         if (is_uniform && dynamic_src && c->s->scratch_size > 0)
3121                 return false;
3122 
3123         uint32_t const_offset = dynamic_src ? 0 : nir_src_as_uint(offset);
3124         if (is_uniform)
3125                 const_offset += nir_intrinsic_base(instr);
3126 
3127         /* ldunifa is a 32-bit load instruction so we can only use it with
3128          * 32-bit aligned addresses. We always produce 32-bit aligned addresses
3129          * except for types smaller than 32-bit, so in these cases we can only
3130          * use ldunifa if we can verify alignment, which we can only do for
3131          * loads with a constant offset.
3132          */
3133         uint32_t bit_size = instr->def.bit_size;
3134         uint32_t value_skips = 0;
3135         if (bit_size < 32) {
3136                 if (dynamic_src) {
3137                         return false;
3138                 } else if (const_offset % 4 != 0) {
3139                         /* If we are loading from an unaligned offset, fix
3140                          * alignment and skip over unused elements in result.
3141                          */
3142                         value_skips = (const_offset % 4) / (bit_size / 8);
3143                         const_offset &= ~0x3;
3144                 }
3145         }
3146 
3147         assert((bit_size == 32 && value_skips == 0) ||
3148                (bit_size == 16 && value_skips <= 1) ||
3149                (bit_size == 8  && value_skips <= 3));
3150 
3151         /* Both Vulkan and OpenGL reserve index 0 for uniforms / push
3152          * constants.
3153          */
3154         uint32_t index = is_uniform ? 0 : nir_src_as_uint(instr->src[0]);
3155 
3156         /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 since we use
3157          * index 0 for Gallium's constant buffer (GL) or push constants
3158          * (Vulkan).
3159          */
3160         if (is_ubo)
3161                 index++;
3162 
3163         /* We can only keep track of the last unifa address we used with
3164          * constant offset loads. If the new load targets the same buffer and
3165          * is close enough to the previous load, we can skip the unifa register
3166          * write by emitting dummy ldunifa instructions to update the unifa
3167          * address.
3168          */
3169         bool skip_unifa = false;
3170         uint32_t ldunifa_skips = 0;
3171         if (dynamic_src) {
3172                 c->current_unifa_block = NULL;
3173         } else if (c->cur_block == c->current_unifa_block &&
3174                    c->current_unifa_is_ubo == !is_ssbo &&
3175                    c->current_unifa_index == index &&
3176                    c->current_unifa_offset <= const_offset &&
3177                    c->current_unifa_offset + max_unifa_skip_dist >= const_offset) {
3178                 skip_unifa = true;
3179                 ldunifa_skips = (const_offset - c->current_unifa_offset) / 4;
3180         } else {
3181                 c->current_unifa_block = c->cur_block;
3182                 c->current_unifa_is_ubo = !is_ssbo;
3183                 c->current_unifa_index = index;
3184                 c->current_unifa_offset = const_offset;
3185         }
3186 
3187         if (!skip_unifa) {
3188                 struct qreg base_offset = !is_ssbo ?
3189                         vir_uniform(c, QUNIFORM_UBO_ADDR,
3190                                     v3d_unit_data_create(index, const_offset)) :
3191                         vir_uniform(c, QUNIFORM_SSBO_OFFSET, index);
3192 
3193                 struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA);
3194                 if (!dynamic_src) {
3195                         if (!is_ssbo) {
3196                                 /* Avoid the extra MOV to UNIFA by making
3197                                  * ldunif load directly into it. We can't
3198                                  * do this if we have not actually emitted
3199                                  * ldunif and are instead reusing a previous
3200                                  * one.
3201                                  */
3202                                 struct qinst *inst =
3203                                         (struct qinst *)c->cur_block->instructions.prev;
3204                                 if (inst == c->defs[base_offset.index]) {
3205                                    inst->dst = unifa;
3206                                    c->defs[base_offset.index] = NULL;
3207                                 } else {
3208                                    vir_MOV_dest(c, unifa, base_offset);
3209                                 }
3210                         } else {
3211                                 vir_ADD_dest(c, unifa, base_offset,
3212                                              vir_uniform_ui(c, const_offset));
3213                         }
3214                 } else {
3215                         vir_ADD_dest(c, unifa, base_offset,
3216                                      ntq_get_src(c, offset, 0));
3217                 }
3218         } else {
3219                 for (int i = 0; i < ldunifa_skips; i++)
3220                         emit_ldunifa(c, NULL);
3221         }
3222 
3223         uint32_t num_components = nir_intrinsic_dest_components(instr);
3224         for (uint32_t i = 0; i < num_components; ) {
3225                 struct qreg data;
3226                 emit_ldunifa(c, &data);
3227 
3228                 if (bit_size == 32) {
3229                         assert(value_skips == 0);
3230                         ntq_store_def(c, &instr->def, i, vir_MOV(c, data));
3231                         i++;
3232                 } else {
3233                         assert((bit_size == 16 && value_skips <= 1) ||
3234                                (bit_size ==  8 && value_skips <= 3));
3235 
3236                         /* If we have any values to skip, shift to the first
3237                          * valid value in the ldunifa result.
3238                          */
3239                         if (value_skips > 0) {
3240                                 data = vir_SHR(c, data,
3241                                                vir_uniform_ui(c, bit_size *
3242                                                                  value_skips));
3243                         }
3244 
3245                         /* Check how many valid components we have,
3246                          * discounting the components we had to skip.
3247                          */
3248                         uint32_t valid_count = (32 / bit_size) - value_skips;
3249                         assert((bit_size == 16 && valid_count <= 2) ||
3250                                (bit_size ==  8 && valid_count <= 4));
3251                         assert(valid_count > 0);
3252 
3253                         /* Process the valid components */
3254                         do {
3255                                 struct qreg tmp;
3256                                 uint32_t mask = (1 << bit_size) - 1;
3257                                 tmp = vir_AND(c, vir_MOV(c, data),
3258                                               vir_uniform_ui(c, mask));
3259                                 ntq_store_def(c, &instr->def, i,
3260                                               vir_MOV(c, tmp));
3261                                 i++;
3262                                 valid_count--;
3263 
3264                                 /* Shift to next component */
3265                                 if (i < num_components && valid_count > 0) {
3266                                         data = vir_SHR(c, data,
3267                                                        vir_uniform_ui(c, bit_size));
3268                                 }
3269                         } while (i < num_components && valid_count > 0);
3270                 }
3271         }
3272 
3273         return true;
3274 }
3275 
3276 static inline struct qreg
3277 emit_load_local_invocation_index(struct v3d_compile *c)
3278 {
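        /* The local invocation index is packed in the top
         * local_invocation_index_bits bits of cs_payload[1], so shift it
         * down to the LSBs.
         */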
3279         return vir_SHR(c, c->cs_payload[1],
3280                        vir_uniform_ui(c, 32 - c->local_invocation_index_bits));
3281 }
3282 
3283 /* For the purposes of reduction operations (ballot, alleq, allfeq, bcastf) in
3284  * fragment shaders a lane is considered active if any sample flags are set
3285  * for *any* lane in the same quad; however, we still need to ensure that
3286  * terminated lanes (OpTerminate) are not included. Further, we also need to
3287  * disable lanes that may be disabled because of non-uniform control
3288  * flow.
3289  */
3290 static enum v3d_qpu_cond
3291 setup_subgroup_control_flow_condition(struct v3d_compile *c)
3292 {
3293         assert(c->s->info.stage == MESA_SHADER_FRAGMENT ||
3294                c->s->info.stage == MESA_SHADER_COMPUTE);
3295 
3296         enum v3d_qpu_cond cond = V3D_QPU_COND_NONE;
3297 
3298         /* We need to make sure that terminated lanes in fragment shaders are
3299          * not included. We can identify these lanes by comparing the initial
3300          * sample mask with the current. This fixes:
3301          * dEQP-VK.spirv_assembly.instruction.terminate_invocation.terminate.subgroup_*
3302          */
3303         if (c->s->info.stage == MESA_SHADER_FRAGMENT && c->emitted_discard) {
3304                 vir_set_pf(c, vir_AND_dest(c, vir_nop_reg(), c->start_msf,
3305                                            vir_NOT(c, vir_XOR(c, c->start_msf,
3306                                                               vir_MSF(c)))),
3307                            V3D_QPU_PF_PUSHZ);
3308                 cond = V3D_QPU_COND_IFNA;
3309         }
3310 
3311         /* If we are in non-uniform control-flow update the condition to
3312          * also limit lanes to those in the current execution mask.
3313          */
3314         if (vir_in_nonuniform_control_flow(c)) {
3315                 if (cond == V3D_QPU_COND_IFNA) {
3316                         vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3317                                    V3D_QPU_UF_NORNZ);
3318                 } else {
3319                         assert(cond == V3D_QPU_COND_NONE);
3320                         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3321                                    V3D_QPU_PF_PUSHZ);
3322                 }
3323                 cond = V3D_QPU_COND_IFA;
3324         }
3325 
3326         return cond;
3327 }
3328 
3329 static void
3330 emit_compute_barrier(struct v3d_compile *c)
3331 {
3332         /* Ensure we flag the use of the control barrier. NIR's
3333          * gather info pass usually takes care of this, but that
3334          * requires that we call that pass after any other pass
3335          * may emit a control barrier, so this is safer.
3336          */
3337         c->s->info.uses_control_barrier = true;
3338 
3339         /* Emit a TSY op to get all invocations in the workgroup
3340          * (actually supergroup) to block until the last
3341          * invocation reaches the TSY op.
3342          */
3343         vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB));
3344 }
3345 
3346 static void
3347 emit_barrier(struct v3d_compile *c)
3348 {
3349         struct qreg eidx = vir_EIDX(c);
3350 
3351         /* The config for the TSY op should be setup like this:
3352          * - Lane 0: Quorum
3353          * - Lane 2: TSO id
3354          * - Lane 3: TSY opcode
3355          */
3356 
3357         /* Lane 0: we want to synchronize across one subgroup. Here we write to
3358          * all lanes unconditionally and will overwrite other lanes below.
3359          */
3360         struct qreg tsy_conf = vir_uniform_ui(c, 1);
3361 
3362          * Lane 2: TSO id. We choose a general purpose TSO (id=0..63) using the
3363          * current QPU index and thread index to ensure we get a unique one for
3364          * this group of invocations in this core.
3365          */
3366         struct qreg tso_id =
3367                 vir_AND(c, vir_TIDX(c), vir_uniform_ui(c, 0x0000003f));
3368         vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 2)),
3369                    V3D_QPU_PF_PUSHZ);
3370         vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tso_id);
3371 
3372         /* Lane 3: TSY opcode (set_quorum_wait_inc_check) */
3373         struct qreg tsy_op = vir_uniform_ui(c, 16);
3374         vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 3)),
3375                    V3D_QPU_PF_PUSHZ);
3376         vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tsy_op);
3377 
3378         /* Emit TSY sync */
3379         vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB), tsy_conf);
3380 }
3381 
3382 static void
3383 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
3384 {
3385         switch (instr->intrinsic) {
3386         case nir_intrinsic_decl_reg:
3387         case nir_intrinsic_load_reg:
3388         case nir_intrinsic_store_reg:
3389                 break; /* Ignore these */
3390 
3391         case nir_intrinsic_load_uniform:
3392                 ntq_emit_load_uniform(c, instr);
3393                 break;
3394 
3395         case nir_intrinsic_load_global_2x32:
3396                 ntq_emit_tmu_general(c, instr, false, true);
3397                 c->has_general_tmu_load = true;
3398                 break;
3399 
3400         case nir_intrinsic_load_ubo:
3401            if (ntq_emit_inline_ubo_load(c, instr))
3402                    break;
3403            FALLTHROUGH;
3404         case nir_intrinsic_load_ssbo:
3405                 if (!ntq_emit_load_unifa(c, instr)) {
3406                         ntq_emit_tmu_general(c, instr, false, false);
3407                         c->has_general_tmu_load = true;
3408                 }
3409                 break;
3410 
3411         case nir_intrinsic_store_ssbo:
3412         case nir_intrinsic_ssbo_atomic:
3413         case nir_intrinsic_ssbo_atomic_swap:
3414                 ntq_emit_tmu_general(c, instr, false, false);
3415                 break;
3416 
3417         case nir_intrinsic_store_global_2x32:
3418         case nir_intrinsic_global_atomic_2x32:
3419         case nir_intrinsic_global_atomic_swap_2x32:
3420                 ntq_emit_tmu_general(c, instr, false, true);
3421                 break;
3422 
3423         case nir_intrinsic_shared_atomic:
3424         case nir_intrinsic_shared_atomic_swap:
3425         case nir_intrinsic_store_shared:
3426         case nir_intrinsic_store_scratch:
3427                 ntq_emit_tmu_general(c, instr, true, false);
3428                 break;
3429 
3430         case nir_intrinsic_load_scratch:
3431         case nir_intrinsic_load_shared:
3432                 ntq_emit_tmu_general(c, instr, true, false);
3433                 c->has_general_tmu_load = true;
3434                 break;
3435 
3436         case nir_intrinsic_image_store:
3437         case nir_intrinsic_image_atomic:
3438         case nir_intrinsic_image_atomic_swap:
3439                 v3d_vir_emit_image_load_store(c, instr);
3440                 break;
3441 
3442         case nir_intrinsic_image_load:
3443                 v3d_vir_emit_image_load_store(c, instr);
3444                 /* Not really a general TMU load, but we only use this flag
3445                  * for NIR scheduling and we do schedule these under the same
3446                  * policy as general TMU.
3447                  */
3448                 c->has_general_tmu_load = true;
3449                 break;
3450 
3451         case nir_intrinsic_get_ssbo_size:
3452                 ntq_store_def(c, &instr->def, 0,
3453                               vir_uniform(c, QUNIFORM_GET_SSBO_SIZE,
3454                                           nir_src_comp_as_uint(instr->src[0], 0)));
3455                 break;
3456 
3457         case nir_intrinsic_get_ubo_size:
3458                 ntq_store_def(c, &instr->def, 0,
3459                               vir_uniform(c, QUNIFORM_GET_UBO_SIZE,
3460                                           nir_src_comp_as_uint(instr->src[0], 0)));
3461                 break;
3462 
3463         case nir_intrinsic_load_user_clip_plane:
3464                 for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
3465                         ntq_store_def(c, &instr->def, i,
3466                                       vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
3467                                                   nir_intrinsic_ucp_id(instr) *
3468                                                   4 + i));
3469                 }
3470                 break;
3471 
3472         case nir_intrinsic_load_viewport_x_scale:
3473                 ntq_store_def(c, &instr->def, 0,
3474                               vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0));
3475                 break;
3476 
3477         case nir_intrinsic_load_viewport_y_scale:
3478                 ntq_store_def(c, &instr->def, 0,
3479                               vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0));
3480                 break;
3481 
3482         case nir_intrinsic_load_viewport_z_scale:
3483                 ntq_store_def(c, &instr->def, 0,
3484                               vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0));
3485                 break;
3486 
3487         case nir_intrinsic_load_viewport_z_offset:
3488                 ntq_store_def(c, &instr->def, 0,
3489                               vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0));
3490                 break;
3491 
3492         case nir_intrinsic_load_line_coord:
3493                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->line_x));
3494                 break;
3495 
3496         case nir_intrinsic_load_line_width:
3497                 ntq_store_def(c, &instr->def, 0,
3498                               vir_uniform(c, QUNIFORM_LINE_WIDTH, 0));
3499                 break;
3500 
3501         case nir_intrinsic_load_aa_line_width:
3502                 ntq_store_def(c, &instr->def, 0,
3503                               vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0));
3504                 break;
3505 
3506         case nir_intrinsic_load_sample_mask_in:
3507                 ntq_store_def(c, &instr->def, 0, vir_MSF(c));
3508                 break;
3509 
3510         case nir_intrinsic_load_helper_invocation:
3511                 vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
3512                 struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
3513                 ntq_store_def(c, &instr->def, 0, qdest);
3514                 break;
3515 
3516         case nir_intrinsic_load_front_face:
3517                 /* The register contains 0 (front) or 1 (back), and we need to
3518                  * turn it into a NIR bool where true means front.
3519                  */
3520                 ntq_store_def(c, &instr->def, 0,
3521                               vir_ADD(c,
3522                                       vir_uniform_ui(c, -1),
3523                                       vir_REVF(c)));
3524                 break;
3525 
3526         case nir_intrinsic_load_base_instance:
3527                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->biid));
3528                 break;
3529 
3530         case nir_intrinsic_load_instance_id:
3531                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->iid));
3532                 break;
3533 
3534         case nir_intrinsic_load_vertex_id:
3535                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->vid));
3536                 break;
3537 
3538         case nir_intrinsic_load_draw_id:
3539                 ntq_store_def(c, &instr->def, 0, vir_uniform(c, QUNIFORM_DRAW_ID, 0));
3540                 break;
3541 
3542         case nir_intrinsic_load_tlb_color_v3d:
3543                 vir_emit_tlb_color_read(c, instr);
3544                 break;
3545 
3546         case nir_intrinsic_load_input:
3547                 ntq_emit_load_input(c, instr);
3548                 break;
3549 
3550         case nir_intrinsic_store_tlb_sample_color_v3d:
3551                 ntq_emit_per_sample_color_write(c, instr);
3552                 break;
3553 
3554         case nir_intrinsic_store_output:
3555                 ntq_emit_store_output(c, instr);
3556                 break;
3557 
3558         case nir_intrinsic_image_size:
3559                 ntq_emit_image_size(c, instr);
3560                 break;
3561 
3562         /* FIXME: the Vulkan and SPIR-V specs specify that OpTerminate (which
3563          * is intended to match the semantics of GLSL's discard) should
3564          * terminate the invocation immediately. Our implementation doesn't
3565          * do that. What we do is actually a demote by removing the invocations
3566          * from the sample mask. Maybe we could be more strict and force an
3567          * early termination by emitting a (maybe conditional) jump to the
3568          * end section of the fragment shader for affected invocations.
3569          */
3570         case nir_intrinsic_discard:
3571         case nir_intrinsic_terminate:
3572                 c->emitted_discard = true;
3573                 FALLTHROUGH;
3574         case nir_intrinsic_demote:
3575                 ntq_flush_tmu(c);
3576 
3577                 if (vir_in_nonuniform_control_flow(c)) {
3578                         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3579                                    V3D_QPU_PF_PUSHZ);
3580                         vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
3581                                                      vir_uniform_ui(c, 0)),
3582                                 V3D_QPU_COND_IFA);
3583                 } else {
3584                         vir_SETMSF_dest(c, vir_nop_reg(),
3585                                         vir_uniform_ui(c, 0));
3586                 }
3587                 break;
3588 
3589         case nir_intrinsic_discard_if:
3590         case nir_intrinsic_terminate_if:
3591                 c->emitted_discard = true;
3592                 FALLTHROUGH;
3593         case nir_intrinsic_demote_if: {
3594                 ntq_flush_tmu(c);
3595 
3596                 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]);
3597 
3598                 if (vir_in_nonuniform_control_flow(c)) {
3599                         struct qinst *exec_flag = vir_MOV_dest(c, vir_nop_reg(),
3600                                                                c->execute);
3601                         if (cond == V3D_QPU_COND_IFA) {
3602                                 vir_set_uf(c, exec_flag, V3D_QPU_UF_ANDZ);
3603                         } else {
3604                                 vir_set_uf(c, exec_flag, V3D_QPU_UF_NORNZ);
3605                                 cond = V3D_QPU_COND_IFA;
3606                         }
3607                 }
3608 
3609                 vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
3610                                              vir_uniform_ui(c, 0)), cond);
3611                 break;
3612         }
3613 
3614         case nir_intrinsic_barrier:
3615                 /* Ensure that the TMU operations before the barrier are flushed
3616                  * before the ones after the barrier.
3617                  */
3618                 ntq_flush_tmu(c);
3619 
3620                 if (nir_intrinsic_execution_scope(instr) != SCOPE_NONE) {
3621                         if (c->s->info.stage == MESA_SHADER_COMPUTE)
3622                                 emit_compute_barrier(c);
3623                         else
3624                                 emit_barrier(c);
3625 
3626                         /* The blocking of a TSY op only happens at the next
3627                          * thread switch. No texturing may be outstanding at the
3628                          * time of a TSY blocking operation.
3629                          */
3630                         vir_emit_thrsw(c);
3631                 }
3632                 break;
3633 
3634         case nir_intrinsic_load_num_workgroups:
3635                 for (int i = 0; i < 3; i++) {
3636                         ntq_store_def(c, &instr->def, i,
3637                                       vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS,
3638                                                   i));
3639                 }
3640                 break;
3641 
3642         case nir_intrinsic_load_workgroup_id_zero_base:
3643         case nir_intrinsic_load_workgroup_id: {
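                /* The workgroup ID comes packed in the compute payload:
                 * X and Y in the low and high halves of cs_payload[0], and
                 * Z in the low half of cs_payload[1].
                 */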
3644                 struct qreg x = vir_AND(c, c->cs_payload[0],
3645                                          vir_uniform_ui(c, 0xffff));
3646                 ntq_store_def(c, &instr->def, 0, x);
3647 
3648                 struct qreg y = vir_SHR(c, c->cs_payload[0],
3649                                          vir_uniform_ui(c, 16));
3650                 ntq_store_def(c, &instr->def, 1, y);
3651 
3652                 struct qreg z = vir_AND(c, c->cs_payload[1],
3653                                          vir_uniform_ui(c, 0xffff));
3654                 ntq_store_def(c, &instr->def, 2, z);
3655                 break;
3656         }
3657 
3658         case nir_intrinsic_load_base_workgroup_id: {
3659                 struct qreg x = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0);
3660                 ntq_store_def(c, &instr->def, 0, x);
3661 
3662                 struct qreg y = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1);
3663                 ntq_store_def(c, &instr->def, 1, y);
3664 
3665                 struct qreg z = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2);
3666                 ntq_store_def(c, &instr->def, 2, z);
3667                 break;
3668         }
3669 
3670         case nir_intrinsic_load_local_invocation_index:
3671                 ntq_store_def(c, &instr->def, 0,
3672                               emit_load_local_invocation_index(c));
3673                 break;
3674 
3675         case nir_intrinsic_load_subgroup_id: {
3676                 /* This is basically the batch index, which is the Local
3677                  * Invocation Index divided by the SIMD width.
3678                  */
3679                 STATIC_ASSERT(IS_POT(V3D_CHANNELS) && V3D_CHANNELS > 0);
3680                 const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1;
3681                 struct qreg lii = emit_load_local_invocation_index(c);
3682                 ntq_store_def(c, &instr->def, 0,
3683                               vir_SHR(c, lii,
3684                                       vir_uniform_ui(c, divide_shift)));
3685                 break;
3686         }
3687 
3688         case nir_intrinsic_load_per_vertex_input: {
3689                 /* The vertex shader writes all its used outputs into
3690                  * consecutive VPM offsets, so if any output component is
3691                  * unused, its VPM offset is used by the next used
3692                  * component. This means that we can't assume that each
3693                  * location will use 4 consecutive scalar offsets in the VPM
3694                  * and we need to compute the VPM offset for each input by
3695                  * going through the inputs and finding the one that matches
3696                  * our location and component.
3697                  *
3698                  * col = vertex index, row = varying index
3699                  */
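                /* Hypothetical example: if the VS only wrote POS.xyzw and
                 * VAR0.y, input_slots[] holds those five slots in order and a
                 * load of VAR0.y resolves to row_idx == 4 rather than to a
                 * fixed 4-scalars-per-location offset.
                 */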
3700                 assert(nir_src_is_const(instr->src[1]));
3701                 uint32_t location =
3702                         nir_intrinsic_io_semantics(instr).location +
3703                         nir_src_as_uint(instr->src[1]);
3704                 uint32_t component = nir_intrinsic_component(instr);
3705 
3706                 int32_t row_idx = -1;
3707                 for (int i = 0; i < c->num_inputs; i++) {
3708                         struct v3d_varying_slot slot = c->input_slots[i];
3709                         if (v3d_slot_get_slot(slot) == location &&
3710                             v3d_slot_get_component(slot) == component) {
3711                                 row_idx = i;
3712                                 break;
3713                         }
3714                 }
3715 
3716                 assert(row_idx != -1);
3717 
3718                 struct qreg col = ntq_get_src(c, instr->src[0], 0);
3719                 for (int i = 0; i < instr->num_components; i++) {
3720                         struct qreg row = vir_uniform_ui(c, row_idx++);
3721                         ntq_store_def(c, &instr->def, i,
3722                                       vir_LDVPMG_IN(c, row, col));
3723                 }
3724                 break;
3725         }
3726 
3727         case nir_intrinsic_emit_vertex:
3728         case nir_intrinsic_end_primitive:
3729                 unreachable("Should have been lowered in v3d_nir_lower_io");
3730                 break;
3731 
3732         case nir_intrinsic_load_primitive_id: {
3733                 /* gl_PrimitiveIdIn is written by the GBG in the first word of
3734                  * the VPM output header. According to the docs, we should read
3735                  * this using ldvpm(v,d)_in (see Table 71).
3736                  */
3737                 assert(c->s->info.stage == MESA_SHADER_GEOMETRY);
3738                 ntq_store_def(c, &instr->def, 0,
3739                               vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));
3740                 break;
3741         }
3742 
3743         case nir_intrinsic_load_invocation_id:
3744                 ntq_store_def(c, &instr->def, 0, vir_IID(c));
3745                 break;
3746 
3747         case nir_intrinsic_load_fb_layers_v3d:
3748                 ntq_store_def(c, &instr->def, 0,
3749                               vir_uniform(c, QUNIFORM_FB_LAYERS, 0));
3750                 break;
3751 
3752         case nir_intrinsic_load_sample_id:
3753                 ntq_store_def(c, &instr->def, 0, vir_SAMPID(c));
3754                 break;
3755 
3756         case nir_intrinsic_load_sample_pos:
3757                 ntq_store_def(c, &instr->def, 0,
3758                               vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))));
3759                 ntq_store_def(c, &instr->def, 1,
3760                               vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))));
3761                 break;
3762 
3763         case nir_intrinsic_load_barycentric_at_offset:
3764                 ntq_store_def(c, &instr->def, 0,
3765                               vir_MOV(c, ntq_get_src(c, instr->src[0], 0)));
3766                 ntq_store_def(c, &instr->def, 1,
3767                               vir_MOV(c, ntq_get_src(c, instr->src[0], 1)));
3768                 break;
3769 
3770         case nir_intrinsic_load_barycentric_pixel:
3771                 ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f));
3772                 ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f));
3773                 break;
3774 
3775         case nir_intrinsic_load_barycentric_at_sample: {
3776                 if (!c->fs_key->msaa) {
3777                         ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f));
3778                         ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f));
3779                         return;
3780                 }
3781 
3782                 struct qreg offset_x, offset_y;
3783                 struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0);
3784                 ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
3785 
3786                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x));
3787                 ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y));
3788                 break;
3789         }
3790 
3791         case nir_intrinsic_load_barycentric_sample: {
3792                 struct qreg offset_x =
3793                         vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));
3794                 struct qreg offset_y =
3795                         vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
3796 
3797                 ntq_store_def(c, &instr->def, 0,
3798                              vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f)));
3799                 ntq_store_def(c, &instr->def, 1,
3800                               vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f)));
3801                 break;
3802         }
3803 
3804         case nir_intrinsic_load_barycentric_centroid: {
3805                 struct qreg offset_x, offset_y;
3806                 ntq_get_barycentric_centroid(c, &offset_x, &offset_y);
3807                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x));
3808                 ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y));
3809                 break;
3810         }
3811 
3812         case nir_intrinsic_load_interpolated_input: {
3813                 assert(nir_src_is_const(instr->src[1]));
3814                 const uint32_t offset = nir_src_as_uint(instr->src[1]);
3815 
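                /* Each input location is assumed to take 4 scalar slots here,
                 * so e.g. component 2 of driver location (base + offset) lands
                 * at scalar slot (base + offset) * 4 + 2.
                 */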
3816                 for (int i = 0; i < instr->num_components; i++) {
3817                         const uint32_t input_idx =
3818                                 (nir_intrinsic_base(instr) + offset) * 4 +
3819                                 nir_intrinsic_component(instr) + i;
3820 
3821                         /* If we are not in MSAA or if we are not interpolating
3822                          * a user varying, just return the pre-computed
3823                          * interpolated input.
3824                          */
3825                         if (!c->fs_key->msaa ||
3826                             c->interp[input_idx].vp.file == QFILE_NULL) {
3827                                 ntq_store_def(c, &instr->def, i,
3828                                               vir_MOV(c, c->inputs[input_idx]));
3829                                 continue;
3830                         }
3831 
3832                         /* Otherwise compute interpolation at the specified
3833                          * offset.
3834                          */
3835                         struct qreg p = c->interp[input_idx].vp;
3836                         struct qreg C = c->interp[input_idx].C;
3837                         unsigned interp_mode = c->interp[input_idx].mode;
3838 
3839                         struct qreg offset_x = ntq_get_src(c, instr->src[0], 0);
3840                         struct qreg offset_y = ntq_get_src(c, instr->src[0], 1);
3841 
3842                         struct qreg result =
3843                               ntq_emit_load_interpolated_input(c, p, C,
3844                                                                offset_x, offset_y,
3845                                                                interp_mode);
3846                         ntq_store_def(c, &instr->def, i, result);
3847                 }
3848                 break;
3849         }
3850 
3851         case nir_intrinsic_load_subgroup_size:
3852                 ntq_store_def(c, &instr->def, 0,
3853                               vir_uniform_ui(c, V3D_CHANNELS));
3854                 break;
3855 
3856         case nir_intrinsic_load_subgroup_invocation:
3857                 ntq_store_def(c, &instr->def, 0, vir_EIDX(c));
3858                 break;
3859 
3860         case nir_intrinsic_elect: {
3861                 struct qreg first;
3862                 if (vir_in_nonuniform_control_flow(c)) {
3863                         /* Sets A=1 for lanes enabled in the execution mask */
3864                         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3865                                    V3D_QPU_PF_PUSHZ);
3866                         /* Updates A by ANDing with lanes enabled in MSF */
3867                         vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()),
3868                                    V3D_QPU_UF_ANDNZ);
3869                         first = vir_FLAFIRST(c);
3870                 } else {
3871                         /* Sets A=1 for inactive lanes */
3872                         vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()),
3873                                    V3D_QPU_PF_PUSHZ);
3874                         first = vir_FLNAFIRST(c);
3875                 }
3876 
3877                 /* Produce a boolean result */
3878                 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
3879                                            first, vir_uniform_ui(c, 1)),
3880                                            V3D_QPU_PF_PUSHZ);
3881                 struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
3882                 ntq_store_def(c, &instr->def, 0, result);
3883                 break;
3884         }
3885 
3886         case nir_intrinsic_ballot: {
3887                 assert(c->devinfo->ver >= 71);
3888                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3889                 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
3890                 struct qreg res = vir_get_temp(c);
3891                 vir_set_cond(vir_BALLOT_dest(c, res, value), cond);
3892                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
3893                 break;
3894         }
3895 
3896         case nir_intrinsic_read_invocation: {
3897                 assert(c->devinfo->ver >= 71);
3898                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3899                 struct qreg index = ntq_get_src(c, instr->src[1], 0);
3900                 struct qreg res = vir_SHUFFLE(c, value, index);
3901                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
3902                 break;
3903         }
3904 
3905         case nir_intrinsic_read_first_invocation: {
3906                 assert(c->devinfo->ver >= 71);
3907                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3908                 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
3909                 struct qreg res = vir_get_temp(c);
3910                 vir_set_cond(vir_BCASTF_dest(c, res, value), cond);
3911                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
3912                 break;
3913         }
3914 
3915         case nir_intrinsic_shuffle: {
3916                 assert(c->devinfo->ver >= 71);
3917                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3918                 struct qreg indices = ntq_get_src(c, instr->src[1], 0);
3919                 struct qreg res = vir_SHUFFLE(c, value, indices);
3920                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
3921                 break;
3922         }
3923 
3924         case nir_intrinsic_vote_feq:
3925         case nir_intrinsic_vote_ieq: {
3926                 assert(c->devinfo->ver >= 71);
3927                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3928                 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
3929                 struct qreg res = vir_get_temp(c);
3930                 vir_set_cond(instr->intrinsic == nir_intrinsic_vote_ieq ?
3931                              vir_ALLEQ_dest(c, res, value) :
3932                              vir_ALLFEQ_dest(c, res, value),
3933                              cond);
3934 
3935                 /* Produce boolean result */
3936                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
3937                            V3D_QPU_PF_PUSHZ);
3938                 struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
3939                 ntq_store_def(c, &instr->def, 0, result);
3940                 break;
3941         }
3942 
3943         case nir_intrinsic_vote_all: {
3944                 assert(c->devinfo->ver >= 71);
3945                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3946                 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
3947                 struct qreg res = vir_get_temp(c);
3948                 vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
3949 
3950                 /* We want to check if 'all lanes are equal (alleq != 0) and
3951                  * their value is True (value != 0)'.
3952                  *
3953                  * The first MOV.pushz generates the predicate for 'alleq == 0'.
3954                  * The second MOV.NORZ generates the predicate for:
3955                  * '!(alleq == 0) & !(value == 0)'.
3956                  */
3957                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
3958                            V3D_QPU_PF_PUSHZ);
3959                 vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
3960                            V3D_QPU_UF_NORZ);
3961                 struct qreg result =
3962                         ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
3963                 ntq_store_def(c, &instr->def, 0, result);
3964                 break;
3965         }
3966 
3967         case nir_intrinsic_vote_any: {
3968                 assert(c->devinfo->ver >= 71);
3969                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3970                 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
3971                 struct qreg res = vir_get_temp(c);
3972                 vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
3973 
3974                 /* We want to check 'not (all lanes are equal (alleq != 0)
3975                  * and their value is False (value == 0))'.
3976                  *
3977                  * The first MOV.pushz generates the predicate for 'alleq == 0'.
3978                  * The second MOV.NORNZ generates the predicate for:
3979                  * '!(alleq == 0) & (value == 0)'.
3980                  * The IFNA condition negates the predicate when evaluated:
3981                  * '!(!(alleq == 0) & (value == 0))'.
3982                  */
3983                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
3984                            V3D_QPU_PF_PUSHZ);
3985                 vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
3986                            V3D_QPU_UF_NORNZ);
3987                 struct qreg result =
3988                         ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
3989                 ntq_store_def(c, &instr->def, 0, result);
3990                 break;
3991         }
3992 
3993         case nir_intrinsic_load_num_subgroups:
3994                 unreachable("Should have been lowered");
3995                 break;
3996 
3997         case nir_intrinsic_load_view_index:
3998                 ntq_store_def(c, &instr->def, 0,
3999                               vir_uniform(c, QUNIFORM_VIEW_INDEX, 0));
4000                 break;
4001 
4002         default:
4003                 fprintf(stderr, "Unknown intrinsic: ");
4004                 nir_print_instr(&instr->instr, stderr);
4005                 fprintf(stderr, "\n");
4006                 break;
4007         }
4008 }
4009 
4010 /* Clears (activates) the execute flags for any channels whose jump target
4011  * matches this block.
4012  *
4013  * XXX perf: Could we be using flpush/flpop somehow for our execution channel
4014  * enabling?
4015  *
4016  */
4017 static void
4018 ntq_activate_execute_for_block(struct v3d_compile *c)
4019 {
4020         vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
4021                                 c->execute, vir_uniform_ui(c, c->cur_block->index)),
4022                    V3D_QPU_PF_PUSHZ);
4023 
4024         vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
4025 }
4026 
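/* Heuristic used when emitting non-uniform ifs: a block counts as "cheap" if
 * it holds at most a couple of ALU/undef/load_const instructions and nothing
 * else (register access intrinsics are ignored), so predicating it costs no
 * more than branching around it.
 */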
4027 static bool
4028 is_cheap_block(nir_block *block)
4029 {
4030         int32_t cost = 3;
4031         nir_foreach_instr(instr, block) {
4032                 switch (instr->type) {
4033                 case nir_instr_type_alu:
4034                 case nir_instr_type_undef:
4035                 case nir_instr_type_load_const:
4036                         if (--cost <= 0)
4037                                 return false;
4038                         break;
4039                 case nir_instr_type_intrinsic: {
4040                         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
4041                         switch (intr->intrinsic) {
4042                         case nir_intrinsic_decl_reg:
4043                         case nir_intrinsic_load_reg:
4044                         case nir_intrinsic_store_reg:
4045                                 continue;
4046                         default:
4047                                 return false;
4048                         }
4049                 }
4050                 default:
4051                         return false;
4052                 }
4053         }
4054         return true;
4055 }
4056 
4057 static void
4058 ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
4059 {
4060         nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
4061         bool empty_else_block =
4062                 (nir_else_block == nir_if_last_else_block(if_stmt) &&
4063                  exec_list_is_empty(&nir_else_block->instr_list));
4064 
4065         struct qblock *then_block = vir_new_block(c);
4066         struct qblock *after_block = vir_new_block(c);
4067         struct qblock *else_block;
4068         if (empty_else_block)
4069                 else_block = after_block;
4070         else
4071                 else_block = vir_new_block(c);
4072 
4073         /* Check if this if statement is really just a conditional jump with
4074          * the form:
4075          *
4076          * if (cond) {
4077          *    break/continue;
4078          * } else {
4079          * }
4080          *
4081          * If so, we can skip the jump to ELSE we emit before the THEN
4082          * block and instead just emit the break/continue directly.
4083          */
4084         nir_jump_instr *conditional_jump = NULL;
4085         if (empty_else_block) {
4086                 nir_block *nir_then_block = nir_if_first_then_block(if_stmt);
4087                 struct nir_instr *inst = nir_block_first_instr(nir_then_block);
4088                 if (inst && inst->type == nir_instr_type_jump)
4089                         conditional_jump = nir_instr_as_jump(inst);
4090         }
4091 
4092         /* Set up the flags for the IF condition (taking the THEN branch). */
4093         enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
4094 
4095         if (!conditional_jump) {
4096                 /* Jump to ELSE. */
4097                 struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
4098                            V3D_QPU_BRANCH_COND_ANYNA :
4099                            V3D_QPU_BRANCH_COND_ANYA);
4100                 /* Pixels that were not dispatched or have been discarded
4101                  * should not contribute to the ANYA/ANYNA condition.
4102                  */
4103                 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
4104 
4105                 vir_link_blocks(c->cur_block, else_block);
4106                 vir_link_blocks(c->cur_block, then_block);
4107 
4108                 /* Process the THEN block. */
4109                 vir_set_emit_block(c, then_block);
4110                 ntq_emit_cf_list(c, &if_stmt->then_list);
4111 
4112                 if (!empty_else_block) {
4113                         /* At the end of the THEN block, jump to ENDIF, unless
4114                          * the block ended in a break or continue.
4115                          */
4116                         if (!c->cur_block->branch_emitted) {
4117                                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4118                                 vir_link_blocks(c->cur_block, after_block);
4119                         }
4120 
4121                         /* Emit the else block. */
4122                         vir_set_emit_block(c, else_block);
4123                         ntq_emit_cf_list(c, &if_stmt->else_list);
4124                 }
4125         } else {
4126                 /* Emit the conditional jump directly.
4127                  *
4128                  * Use ALL with breaks and ANY with continues to ensure that
4129                  * we always break and never continue when all lanes have been
4130                  * disabled (for example because of discards) to prevent
4131                  * infinite loops.
4132                  */
4133                 assert(conditional_jump &&
4134                        (conditional_jump->type == nir_jump_continue ||
4135                         conditional_jump->type == nir_jump_break));
4136 
4137                 struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
4138                            (conditional_jump->type == nir_jump_break ?
4139                             V3D_QPU_BRANCH_COND_ALLA :
4140                             V3D_QPU_BRANCH_COND_ANYA) :
4141                            (conditional_jump->type == nir_jump_break ?
4142                             V3D_QPU_BRANCH_COND_ALLNA :
4143                             V3D_QPU_BRANCH_COND_ANYNA));
4144                 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
4145 
4146                 vir_link_blocks(c->cur_block,
4147                                 conditional_jump->type == nir_jump_break ?
4148                                         c->loop_break_block :
4149                                         c->loop_cont_block);
4150         }
4151 
4152         vir_link_blocks(c->cur_block, after_block);
4153 
4154         vir_set_emit_block(c, after_block);
4155 }
4156 
4157 static void
4158 ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
4159 {
4160         nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
4161         bool empty_else_block =
4162                 (nir_else_block == nir_if_last_else_block(if_stmt) &&
4163                  exec_list_is_empty(&nir_else_block->instr_list));
4164 
4165         struct qblock *then_block = vir_new_block(c);
4166         struct qblock *after_block = vir_new_block(c);
4167         struct qblock *else_block;
4168         if (empty_else_block)
4169                 else_block = after_block;
4170         else
4171                 else_block = vir_new_block(c);
4172 
4173         bool was_uniform_control_flow = false;
4174         if (!vir_in_nonuniform_control_flow(c)) {
4175                 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
4176                 was_uniform_control_flow = true;
4177         }
4178 
4179         /* Set up the flags for the IF condition (taking the THEN branch). */
4180         enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
4181 
4182         /* Update the flags+cond to mean "Taking the ELSE branch (!cond) and
4183          * was previously active (execute Z)" for updating the exec flags.
4184          */
4185         if (was_uniform_control_flow) {
4186                 cond = v3d_qpu_cond_invert(cond);
4187         } else {
4188                 struct qinst *inst = vir_MOV_dest(c, vir_nop_reg(), c->execute);
4189                 if (cond == V3D_QPU_COND_IFA) {
4190                         vir_set_uf(c, inst, V3D_QPU_UF_NORNZ);
4191                 } else {
4192                         vir_set_uf(c, inst, V3D_QPU_UF_ANDZ);
4193                         cond = V3D_QPU_COND_IFA;
4194                 }
4195         }
4196 
4197         vir_MOV_cond(c, cond,
4198                      c->execute,
4199                      vir_uniform_ui(c, else_block->index));
4200 
4201         /* Set the flags for taking the THEN block */
4202         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4203                    V3D_QPU_PF_PUSHZ);
4204 
4205         /* Jump to ELSE if nothing is active for THEN (unless THEN block is
4206          * so small it won't pay off), otherwise fall through.
4207          */
4208         bool is_cheap = exec_list_is_singular(&if_stmt->then_list) &&
4209                         is_cheap_block(nir_if_first_then_block(if_stmt));
4210         if (!is_cheap) {
4211                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
4212                 vir_link_blocks(c->cur_block, else_block);
4213         }
4214         vir_link_blocks(c->cur_block, then_block);
4215 
4216         /* Process the THEN block.
4217          *
4218          * Notice we don't call ntq_activate_execute_for_block here on purpose:
4219          * c->execute is already set up to be 0 for lanes that must take the
4220          * THEN block.
4221          */
4222         vir_set_emit_block(c, then_block);
4223         ntq_emit_cf_list(c, &if_stmt->then_list);
4224 
4225         if (!empty_else_block) {
4226                 /* Handle the end of the THEN block.  First, all currently
4227                  * active channels update their execute flags to point to
4228                  * ENDIF
4229                  */
4230                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4231                            V3D_QPU_PF_PUSHZ);
4232                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
4233                              vir_uniform_ui(c, after_block->index));
4234 
4235                 /* If everything points at ENDIF, then jump there immediately
4236                  * (unless ELSE block is so small it won't pay off).
4237                  */
4238                 bool is_cheap = exec_list_is_singular(&if_stmt->else_list) &&
4239                                 is_cheap_block(nir_else_block);
4240                 if (!is_cheap) {
4241                         vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
4242                                                    c->execute,
4243                                                    vir_uniform_ui(c, after_block->index)),
4244                                    V3D_QPU_PF_PUSHZ);
4245                         vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
4246                         vir_link_blocks(c->cur_block, after_block);
4247                 }
4248                 vir_link_blocks(c->cur_block, else_block);
4249 
4250                 vir_set_emit_block(c, else_block);
4251                 ntq_activate_execute_for_block(c);
4252                 ntq_emit_cf_list(c, &if_stmt->else_list);
4253         }
4254 
4255         vir_link_blocks(c->cur_block, after_block);
4256 
4257         vir_set_emit_block(c, after_block);
4258         if (was_uniform_control_flow)
4259                 c->execute = c->undef;
4260         else
4261                 ntq_activate_execute_for_block(c);
4262 }
4263 
4264 static void
4265 ntq_emit_if(struct v3d_compile *c, nir_if *nif)
4266 {
4267         bool was_in_control_flow = c->in_control_flow;
4268         c->in_control_flow = true;
4269         if (!vir_in_nonuniform_control_flow(c) &&
4270             !nir_src_is_divergent(nif->condition)) {
4271                 ntq_emit_uniform_if(c, nif);
4272         } else {
4273                 ntq_emit_nonuniform_if(c, nif);
4274         }
4275         c->in_control_flow = was_in_control_flow;
4276 }
4277 
4278 static void
4279 ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump)
4280 {
4281         switch (jump->type) {
4282         case nir_jump_break:
4283                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4284                            V3D_QPU_PF_PUSHZ);
4285                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
4286                              vir_uniform_ui(c, c->loop_break_block->index));
4287                 break;
4288 
4289         case nir_jump_continue:
4290                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4291                            V3D_QPU_PF_PUSHZ);
4292                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
4293                              vir_uniform_ui(c, c->loop_cont_block->index));
4294                 break;
4295 
4296         case nir_jump_return:
4297                 unreachable("All returns should be lowered\n");
4298                 break;
4299 
4300         case nir_jump_halt:
4301         case nir_jump_goto:
4302         case nir_jump_goto_if:
4303                 unreachable("not supported\n");
4304                 break;
4305         }
4306 }
4307 
4308 static void
4309 ntq_emit_uniform_jump(struct v3d_compile *c, nir_jump_instr *jump)
4310 {
4311         switch (jump->type) {
4312         case nir_jump_break:
4313                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4314                 vir_link_blocks(c->cur_block, c->loop_break_block);
4315                 c->cur_block->branch_emitted = true;
4316                 break;
4317         case nir_jump_continue:
4318                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4319                 vir_link_blocks(c->cur_block, c->loop_cont_block);
4320                 c->cur_block->branch_emitted = true;
4321                 break;
4322 
4323         case nir_jump_return:
4324                 unreachable("All returns should be lowered\n");
4325                 break;
4326 
4327         case nir_jump_halt:
4328         case nir_jump_goto:
4329         case nir_jump_goto_if:
4330                 unreachable("not supported\n");
4331                 break;
4332         }
4333 }
4334 
4335 static void
4336 ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)
4337 {
4338         switch (instr->type) {
4339         case nir_instr_type_alu:
4340                 ntq_emit_alu(c, nir_instr_as_alu(instr));
4341                 break;
4342 
4343         case nir_instr_type_intrinsic:
4344                 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
4345                 break;
4346 
4347         case nir_instr_type_load_const:
4348                 ntq_emit_load_const(c, nir_instr_as_load_const(instr));
4349                 break;
4350 
4351         case nir_instr_type_undef:
4352                 unreachable("Should've been lowered by nir_lower_undef_to_zero");
4353                 break;
4354 
4355         case nir_instr_type_tex:
4356                 ntq_emit_tex(c, nir_instr_as_tex(instr));
4357                 break;
4358 
4359         case nir_instr_type_jump:
4360                 /* Always flush TMU before jumping to another block, for the
4361                  * same reasons as in ntq_emit_block.
4362                  */
4363                 ntq_flush_tmu(c);
4364                 if (vir_in_nonuniform_control_flow(c))
4365                         ntq_emit_jump(c, nir_instr_as_jump(instr));
4366                 else
4367                         ntq_emit_uniform_jump(c, nir_instr_as_jump(instr));
4368                 break;
4369 
4370         default:
4371                 fprintf(stderr, "Unknown NIR instr type: ");
4372                 nir_print_instr(instr, stderr);
4373                 fprintf(stderr, "\n");
4374                 abort();
4375         }
4376 }
4377 
4378 static void
4379 ntq_emit_block(struct v3d_compile *c, nir_block *block)
4380 {
4381         nir_foreach_instr(instr, block) {
4382                 ntq_emit_instr(c, instr);
4383         }
4384 
4385         /* Always process pending TMU operations in the same block they were
4386          * emitted: we can't emit TMU operations in a block and then emit a
4387          * thread switch and LDTMU/TMUWT for them in another block, possibly
4388          * under control flow.
4389          */
4390         ntq_flush_tmu(c);
4391 }
4392 
4393 static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
4394 
4395 static void
4396 ntq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop)
4397 {
4398         bool was_uniform_control_flow = false;
4399         if (!vir_in_nonuniform_control_flow(c)) {
4400                 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
4401                 was_uniform_control_flow = true;
4402         }
4403 
4404         c->loop_cont_block = vir_new_block(c);
4405         c->loop_break_block = vir_new_block(c);
4406 
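        /* In non-uniform loops the execute mask drives the control flow:
         * lanes that break or continue simply retarget c->execute at the
         * break/continue block index (see ntq_emit_jump), and after the body
         * the ANYA branch below loops back while any lane is still active
         * (execute == 0) once the continued lanes have been re-enabled.
         */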
4407         vir_link_blocks(c->cur_block, c->loop_cont_block);
4408         vir_set_emit_block(c, c->loop_cont_block);
4409         ntq_activate_execute_for_block(c);
4410 
4411         ntq_emit_cf_list(c, &loop->body);
4412 
4413         /* Re-enable any previous continues now, so our ANYA check below
4414          * works.
4415          *
4416          * XXX: Use the .ORZ flags update, instead.
4417          */
4418         vir_set_pf(c, vir_XOR_dest(c,
4419                                 vir_nop_reg(),
4420                                 c->execute,
4421                                 vir_uniform_ui(c, c->loop_cont_block->index)),
4422                    V3D_QPU_PF_PUSHZ);
4423         vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
4424 
4425         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
4426 
4427         struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA);
4428         /* Pixels that were not dispatched or have been discarded should not
4429          * contribute to looping again.
4430          */
4431         branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
4432         vir_link_blocks(c->cur_block, c->loop_cont_block);
4433         vir_link_blocks(c->cur_block, c->loop_break_block);
4434 
4435         vir_set_emit_block(c, c->loop_break_block);
4436         if (was_uniform_control_flow)
4437                 c->execute = c->undef;
4438         else
4439                 ntq_activate_execute_for_block(c);
4440 }
4441 
4442 static void
4443 ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop)
4444 {
4445         c->loop_cont_block = vir_new_block(c);
4446         c->loop_break_block = vir_new_block(c);
4447 
4448         vir_link_blocks(c->cur_block, c->loop_cont_block);
4449         vir_set_emit_block(c, c->loop_cont_block);
4450 
4451         ntq_emit_cf_list(c, &loop->body);
4452 
4453         if (!c->cur_block->branch_emitted) {
4454                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4455                 vir_link_blocks(c->cur_block, c->loop_cont_block);
4456         }
4457 
4458         vir_set_emit_block(c, c->loop_break_block);
4459 }
4460 
4461 static void
4462 ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
4463 {
4464         assert(!nir_loop_has_continue_construct(loop));
4465 
4466         /* Disable flags optimization for loop conditions. The problem here is
4467          * that we can have code like this:
4468          *
4469          *  // block_0
4470          *  vec1 32 con ssa_9 = ine32 ssa_8, ssa_2
4471          *  loop {
4472          *     // block_1
4473          *     if ssa_9 {
4474          *
4475          * In this example we emit flags to compute ssa_9 and the optimization
4476          * will skip regenerating them again for the loop condition in the
4477          * loop continue block (block_1). However, this is not safe after the
4478          * first iteration because the loop body can stomp the flags if it has
4479          * any conditionals.
4480          */
4481         c->flags_temp = -1;
4482 
4483         bool was_in_control_flow = c->in_control_flow;
4484         c->in_control_flow = true;
4485 
4486         struct qblock *save_loop_cont_block = c->loop_cont_block;
4487         struct qblock *save_loop_break_block = c->loop_break_block;
4488 
4489         if (vir_in_nonuniform_control_flow(c) || loop->divergent) {
4490                 ntq_emit_nonuniform_loop(c, loop);
4491         } else {
4492                 ntq_emit_uniform_loop(c, loop);
4493         }
4494 
4495         c->loop_break_block = save_loop_break_block;
4496         c->loop_cont_block = save_loop_cont_block;
4497 
4498         c->loops++;
4499 
4500         c->in_control_flow = was_in_control_flow;
4501 }
4502 
4503 static void
4504 ntq_emit_function(struct v3d_compile *c, nir_function_impl *func)
4505 {
4506         fprintf(stderr, "FUNCTIONS not handled.\n");
4507         abort();
4508 }
4509 
4510 static void
4511 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list)
4512 {
4513         foreach_list_typed(nir_cf_node, node, node, list) {
4514                 switch (node->type) {
4515                 case nir_cf_node_block:
4516                         ntq_emit_block(c, nir_cf_node_as_block(node));
4517                         break;
4518 
4519                 case nir_cf_node_if:
4520                         ntq_emit_if(c, nir_cf_node_as_if(node));
4521                         break;
4522 
4523                 case nir_cf_node_loop:
4524                         ntq_emit_loop(c, nir_cf_node_as_loop(node));
4525                         break;
4526 
4527                 case nir_cf_node_function:
4528                         ntq_emit_function(c, nir_cf_node_as_function(node));
4529                         break;
4530 
4531                 default:
4532                         fprintf(stderr, "Unknown NIR node type\n");
4533                         abort();
4534                 }
4535         }
4536 }
4537 
4538 static void
4539 ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
4540 {
4541         ntq_setup_registers(c, impl);
4542         ntq_emit_cf_list(c, &impl->body);
4543 }
4544 
4545 static void
4546 nir_to_vir(struct v3d_compile *c)
4547 {
4548         switch (c->s->info.stage) {
4549         case MESA_SHADER_FRAGMENT:
4550                 c->start_msf = vir_MSF(c);
4551                 if (c->devinfo->ver < 71)
4552                         c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
4553                 else
4554                         c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3));
4555 
4556                 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
4557                 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
4558 
4559                 /* V3D 4.x can disable implicit varyings if they are not used */
4560                 c->fs_uses_primitive_id =
4561                         nir_find_variable_with_location(c->s, nir_var_shader_in,
4562                                                         VARYING_SLOT_PRIMITIVE_ID);
4563                 if (c->fs_uses_primitive_id && !c->fs_key->has_gs) {
4564                        c->primitive_id =
4565                                emit_fragment_varying(c, NULL, -1, 0, 0);
4566                 }
4567 
4568                 if (c->fs_key->is_points && program_reads_point_coord(c)) {
4569                         c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0);
4570                         c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0);
4571                         c->uses_implicit_point_line_varyings = true;
4572                 } else if (c->fs_key->is_lines &&
4573                            (BITSET_TEST(c->s->info.system_values_read,
4574                                         SYSTEM_VALUE_LINE_COORD))) {
4575                         c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0);
4576                         c->uses_implicit_point_line_varyings = true;
4577                 }
4578                 break;
4579         case MESA_SHADER_COMPUTE:
4580                 /* Set up the TSO for barriers, assuming we do some. */
4581                 if (c->devinfo->ver < 42) {
4582                         vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
4583                                                       V3D_QPU_WADDR_SYNC));
4584                 }
4585 
4586                 if (c->devinfo->ver == 42) {
4587                         c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
4588                         c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
4589                 } else if (c->devinfo->ver >= 71) {
4590                         c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3));
4591                         c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
4592                 }
4593 
4594                 /* Set up the division between gl_LocalInvocationIndex and
4595                  * wg_in_mem in the payload reg.
4596                  */
4597                 int wg_size = (c->s->info.workgroup_size[0] *
4598                                c->s->info.workgroup_size[1] *
4599                                c->s->info.workgroup_size[2]);
4600                 c->local_invocation_index_bits =
4601                         ffs(util_next_power_of_two(MAX2(wg_size, 64))) - 1;
4602                 assert(c->local_invocation_index_bits <= 8);
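                /* For example, a 10x10x1 workgroup (100 invocations) rounds
                 * up to 128, giving local_invocation_index_bits == 7; the
                 * MAX2(wg_size, 64) clamp keeps this at a minimum of 6.
                 */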
4603 
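                /* Shared memory for all workgroups is assumed to live in one
                 * buffer: the workgroup-in-memory index decoded from the
                 * payload below is multiplied by the per-workgroup shared
                 * size and added to the QUNIFORM_SHARED_OFFSET base.
                 */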
4604                 if (c->s->info.shared_size) {
4605                         struct qreg wg_in_mem = vir_SHR(c, c->cs_payload[1],
4606                                                         vir_uniform_ui(c, 16));
4607                         if (c->s->info.workgroup_size[0] != 1 ||
4608                             c->s->info.workgroup_size[1] != 1 ||
4609                             c->s->info.workgroup_size[2] != 1) {
4610                                 int wg_bits = (16 -
4611                                                c->local_invocation_index_bits);
4612                                 int wg_mask = (1 << wg_bits) - 1;
4613                                 wg_in_mem = vir_AND(c, wg_in_mem,
4614                                                     vir_uniform_ui(c, wg_mask));
4615                         }
4616                         struct qreg shared_per_wg =
4617                                 vir_uniform_ui(c, c->s->info.shared_size);
4618 
4619                         c->cs_shared_offset =
4620                                 vir_ADD(c,
4621                                         vir_uniform(c, QUNIFORM_SHARED_OFFSET,0),
4622                                         vir_UMUL(c, wg_in_mem, shared_per_wg));
4623                 }
4624                 break;
4625         default:
4626                 break;
4627         }
4628 
4629         if (c->s->scratch_size) {
4630                 v3d_setup_spill_base(c);
4631                 c->spill_size += V3D_CHANNELS * c->s->scratch_size;
4632         }
4633 
4634         switch (c->s->info.stage) {
4635         case MESA_SHADER_VERTEX:
4636                 ntq_setup_vs_inputs(c);
4637                 break;
4638         case MESA_SHADER_GEOMETRY:
4639                 ntq_setup_gs_inputs(c);
4640                 break;
4641         case MESA_SHADER_FRAGMENT:
4642                 ntq_setup_fs_inputs(c);
4643                 break;
4644         case MESA_SHADER_COMPUTE:
4645                 break;
4646         default:
4647                 unreachable("unsupported shader stage");
4648         }
4649 
4650         ntq_setup_outputs(c);
4651 
4652         /* Find the main function and emit the body. */
4653         nir_foreach_function(function, c->s) {
4654                 assert(function->is_entrypoint);
4655                 assert(function->impl);
4656                 ntq_emit_impl(c, function->impl);
4657         }
4658 }
4659 
4660 /**
4661  * When demoting a shader down to single-threaded, removes the THRSW
4662  * instructions (one will still be inserted at v3d_vir_to_qpu() for the
4663  * program end).
4664  */
4665 static void
4666 vir_remove_thrsw(struct v3d_compile *c)
4667 {
4668         vir_for_each_block(block, c) {
4669                 vir_for_each_inst_safe(inst, block) {
4670                         if (inst->qpu.sig.thrsw)
4671                                 vir_remove_instruction(c, inst);
4672                 }
4673         }
4674 
4675         c->last_thrsw = NULL;
4676 }
4677 
4678 /**
4679  * This makes sure we have a top-level last thread switch signalling the
4680  * start of the last thread section, adding a new thrsw instruction if
4681  * needed. We don't allow spilling in the last thread section, so
4682  * if we need to do any spills that inject additional thread switches later on,
4683  * we ensure this thread switch will still be the last thread switch in the
4684  * program, which makes last thread switch signalling a lot easier when we have
4685  * spilling. If in the end we don't need to spill to compile the program and we
4686  * injected a new thread switch instruction here only for that, we will
4687  * eventually restore the previous last thread switch and remove the one we
4688  * added here.
4689  */
4690 static void
4691 vir_emit_last_thrsw(struct v3d_compile *c,
4692                     struct qinst **restore_last_thrsw,
4693                     bool *restore_scoreboard_lock)
4694 {
4695         *restore_last_thrsw = c->last_thrsw;
4696 
4697         /* If we're threaded and the last THRSW was in conditional code, then
4698          * we need to emit another one so that we can flag it as the last
4699          * thrsw.
4700          */
4701         if (c->last_thrsw && !c->last_thrsw_at_top_level)
4702                 vir_emit_thrsw(c);
4703 
4704         /* If we're threaded, then we need to mark the last THRSW instruction
4705          * so we can emit a pair of them at QPU emit time.
4706          *
4707          * For V3D 4.x, we can spawn the non-fragment shaders already in the
4708          * post-last-THRSW state, so we can skip this.
4709          */
4710         if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT)
4711                 vir_emit_thrsw(c);
4712 
4713         /* If we have not inserted a last thread switch yet, do it now to ensure
4714          * any potential spilling we do happens before this. If we don't spill
4715          * in the end, we will restore the previous one.
4716          */
4717         if (*restore_last_thrsw == c->last_thrsw) {
4718                 if (*restore_last_thrsw)
4719                         (*restore_last_thrsw)->is_last_thrsw = false;
4720                 *restore_scoreboard_lock = c->lock_scoreboard_on_first_thrsw;
4721                 vir_emit_thrsw(c);
4722         } else {
4723                 *restore_last_thrsw = c->last_thrsw;
4724         }
4725 
4726         assert(c->last_thrsw);
4727         c->last_thrsw->is_last_thrsw = true;
4728 }
4729 
4730 static void
4731 vir_restore_last_thrsw(struct v3d_compile *c,
4732                        struct qinst *thrsw,
4733                        bool scoreboard_lock)
4734 {
4735         assert(c->last_thrsw);
4736         vir_remove_instruction(c, c->last_thrsw);
4737         c->last_thrsw = thrsw;
4738         if (c->last_thrsw)
4739                 c->last_thrsw->is_last_thrsw = true;
4740         c->lock_scoreboard_on_first_thrsw = scoreboard_lock;
4741 }
4742 
4743 /* There's a flag in the shader for "center W is needed for reasons other than
4744  * non-centroid varyings", so we just walk the program after VIR optimization
4745  * to see if it's used.  It should be harmless to set even if we only use
4746  * center W for varyings.
4747  */
4748 static void
4749 vir_check_payload_w(struct v3d_compile *c)
4750 {
4751         if (c->s->info.stage != MESA_SHADER_FRAGMENT)
4752                 return;
4753 
4754         vir_for_each_inst_inorder(inst, c) {
4755                 for (int i = 0; i < vir_get_nsrc(inst); i++) {
4756                         if (inst->src[i].file == c->payload_w.file &&
4757                             inst->src[i].index == c->payload_w.index) {
4758                                 c->uses_center_w = true;
4759                                 return;
4760                         }
4761                 }
4762         }
4763 }
4764 
4765 void
4766 v3d_nir_to_vir(struct v3d_compile *c)
4767 {
4768         if (V3D_DBG(NIR) ||
4769             v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
4770                 fprintf(stderr, "%s prog %d/%d NIR:\n",
4771                         vir_get_stage_name(c),
4772                         c->program_id, c->variant_id);
4773                 nir_print_shader(c->s, stderr);
4774         }
4775 
4776         nir_to_vir(c);
4777 
4778         bool restore_scoreboard_lock = false;
4779         struct qinst *restore_last_thrsw;
4780 
4781         /* Emit the last THRSW before STVPM and TLB writes. */
4782         vir_emit_last_thrsw(c,
4783                             &restore_last_thrsw,
4784                             &restore_scoreboard_lock);
4785 
4786 
4787         switch (c->s->info.stage) {
4788         case MESA_SHADER_FRAGMENT:
4789                 emit_frag_end(c);
4790                 break;
4791         case MESA_SHADER_GEOMETRY:
4792                 emit_geom_end(c);
4793                 break;
4794         case MESA_SHADER_VERTEX:
4795                 emit_vert_end(c);
4796                 break;
4797         case MESA_SHADER_COMPUTE:
4798                 break;
4799         default:
4800                 unreachable("bad stage");
4801         }
4802 
4803         if (V3D_DBG(VIR) ||
4804             v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
4805                 fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
4806                         vir_get_stage_name(c),
4807                         c->program_id, c->variant_id);
4808                 vir_dump(c);
4809                 fprintf(stderr, "\n");
4810         }
4811 
4812         vir_optimize(c);
4813 
4814         vir_check_payload_w(c);
4815 
4816         /* XXX perf: On VC4, we do a VIR-level instruction scheduling here.
4817          * We used that on that platform to pipeline TMU writes and reduce the
4818          * number of thread switches, as well as try (mostly successfully) to
4819          * reduce maximum register pressure to allow more threads.  We should
4820          * do something of that sort for V3D -- either instruction scheduling
4821          * here, or delay the THRSW and LDTMUs from our texture
4822          * instructions until the results are needed.
4823          */
4824 
4825         if (V3D_DBG(VIR) ||
4826             v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
4827                 fprintf(stderr, "%s prog %d/%d VIR:\n",
4828                         vir_get_stage_name(c),
4829                         c->program_id, c->variant_id);
4830                 vir_dump(c);
4831                 fprintf(stderr, "\n");
4832         }
4833 
4834         /* Attempt to allocate registers for the temporaries.  If we fail,
4835          * reduce thread count and try again.
4836          */
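        /* The retry loop below halves the thread count after each failed
         * attempt and gives up once it hits the minimum thread count allowed
         * for register allocation.
         */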
4837         int min_threads = 2;
4838         struct qpu_reg *temp_registers;
4839         while (true) {
4840                 temp_registers = v3d_register_allocate(c);
4841                 if (temp_registers) {
4842                         assert(c->spills + c->fills <= c->max_tmu_spills);
4843                         break;
4844                 }
4845 
4846                 if (c->threads == min_threads &&
4847                     V3D_DBG(RA)) {
4848                         fprintf(stderr,
4849                                 "Failed to register allocate using %s\n",
4850                                 c->fallback_scheduler ? "the fallback scheduler" :
4851                                 "the normal scheduler");
4852 
4853                         vir_dump(c);
4854 
4855                         char *shaderdb;
4856                         int ret = v3d_shaderdb_dump(c, &shaderdb);
4857                         if (ret > 0) {
4858                                 fprintf(stderr, "%s\n", shaderdb);
4859                                 free(shaderdb);
4860                         }
4861                 }
4862 
4863                 if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) {
4864                         if (V3D_DBG(PERF)) {
4865                                 fprintf(stderr,
4866                                         "Failed to register allocate %s "
4867                                         "prog %d/%d at %d threads.\n",
4868                                         vir_get_stage_name(c),
4869                                         c->program_id, c->variant_id, c->threads);
4870                         }
4871                         c->compilation_result =
4872                                 V3D_COMPILATION_FAILED_REGISTER_ALLOCATION;
4873                         return;
4874                 }
4875 
4876                 c->spills = 0;
4877                 c->fills = 0;
4878                 c->threads /= 2;
4879 
4880                 if (c->threads == 1)
4881                         vir_remove_thrsw(c);
4882         }
4883 
4884         /* If we didn't spill, then remove the last thread switch we injected
4885          * artificially (if any) and restore the previous one.
4886          */
4887         if (!c->spills && c->last_thrsw != restore_last_thrsw)
4888                 vir_restore_last_thrsw(c, restore_last_thrsw, restore_scoreboard_lock);
4889 
4890         if (c->spills &&
4891             (V3D_DBG(VIR) ||
4892              v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
4893                 fprintf(stderr, "%s prog %d/%d spilled VIR:\n",
4894                         vir_get_stage_name(c),
4895                         c->program_id, c->variant_id);
4896                 vir_dump(c);
4897                 fprintf(stderr, "\n");
4898         }
4899 
4900         v3d_vir_to_qpu(c, temp_registers);
4901 }
4902