1 /*
2 * Copyright © 2016 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <inttypes.h>
25 #include "util/format/u_format.h"
26 #include "util/u_helpers.h"
27 #include "util/u_math.h"
28 #include "util/u_memory.h"
29 #include "util/ralloc.h"
30 #include "util/hash_table.h"
31 #include "compiler/nir/nir.h"
32 #include "compiler/nir/nir_builder.h"
33 #include "common/v3d_device_info.h"
34 #include "v3d_compiler.h"
35
36 /* We don't do any address packing. */
37 #define __gen_user_data void
38 #define __gen_address_type uint32_t
39 #define __gen_address_offset(reloc) (*reloc)
40 #define __gen_emit_reloc(cl, reloc)
41 #include "cle/v3d_packet_v41_pack.h"
42
43 #define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7)
44 #define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7)
45 #define GENERAL_TMU_LOOKUP_TYPE_8BIT_I (0 << 0)
46 #define GENERAL_TMU_LOOKUP_TYPE_16BIT_I (1 << 0)
47 #define GENERAL_TMU_LOOKUP_TYPE_VEC2 (2 << 0)
48 #define GENERAL_TMU_LOOKUP_TYPE_VEC3 (3 << 0)
49 #define GENERAL_TMU_LOOKUP_TYPE_VEC4 (4 << 0)
50 #define GENERAL_TMU_LOOKUP_TYPE_8BIT_UI (5 << 0)
51 #define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI (6 << 0)
52 #define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI (7 << 0)
53
54 #define V3D_TSY_SET_QUORUM 0
55 #define V3D_TSY_INC_WAITERS 1
56 #define V3D_TSY_DEC_WAITERS 2
57 #define V3D_TSY_INC_QUORUM 3
58 #define V3D_TSY_DEC_QUORUM 4
59 #define V3D_TSY_FREE_ALL 5
60 #define V3D_TSY_RELEASE 6
61 #define V3D_TSY_ACQUIRE 7
62 #define V3D_TSY_WAIT 8
63 #define V3D_TSY_WAIT_INC 9
64 #define V3D_TSY_WAIT_CHECK 10
65 #define V3D_TSY_WAIT_INC_CHECK 11
66 #define V3D_TSY_WAIT_CV 12
67 #define V3D_TSY_INC_SEMAPHORE 13
68 #define V3D_TSY_DEC_SEMAPHORE 14
69 #define V3D_TSY_SET_QUORUM_FREE_ALL 15
70
71 static void
72 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
73
74 static void
75 resize_qreg_array(struct v3d_compile *c,
76 struct qreg **regs,
77 uint32_t *size,
78 uint32_t decl_size)
79 {
80 if (*size >= decl_size)
81 return;
82
83 uint32_t old_size = *size;
84 *size = MAX2(*size * 2, decl_size);
85 *regs = reralloc(c, *regs, struct qreg, *size);
86 if (!*regs) {
87 fprintf(stderr, "Malloc failure\n");
88 abort();
89 }
90
91 for (uint32_t i = old_size; i < *size; i++)
92 (*regs)[i] = c->undef;
93 }
94
95 static void
96 resize_interp_array(struct v3d_compile *c,
97 struct v3d_interp_input **regs,
98 uint32_t *size,
99 uint32_t decl_size)
100 {
101 if (*size >= decl_size)
102 return;
103
104 uint32_t old_size = *size;
105 *size = MAX2(*size * 2, decl_size);
106 *regs = reralloc(c, *regs, struct v3d_interp_input, *size);
107 if (!*regs) {
108 fprintf(stderr, "Malloc failure\n");
109 abort();
110 }
111
112 for (uint32_t i = old_size; i < *size; i++) {
113 (*regs)[i].vp = c->undef;
114 (*regs)[i].C = c->undef;
115 }
116 }
117
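/*
 * Emits a thread switch (a NOP carrying the thrsw signal) so another thread
 * can run while we wait on TMU results, and records whether it happened at
 * the top level or inside control flow.  Does nothing for single-threaded
 * programs.
 */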
118 void
119 vir_emit_thrsw(struct v3d_compile *c)
120 {
121 if (c->threads == 1)
122 return;
123
124 /* Always thread switch after each texture operation for now.
125 *
126 * We could do better by batching a bunch of texture fetches up and
127 * then doing one thread switch and collecting all their results
128 * afterward.
129 */
130 c->last_thrsw = vir_NOP(c);
131 c->last_thrsw->qpu.sig.thrsw = true;
132 c->last_thrsw_at_top_level = !c->in_control_flow;
133
134 /* We need to lock the scoreboard before any tlb access happens. If this
135 * thread switch comes after we have emitted a tlb load, then it means
136 * that we can't lock on the last thread switch any more.
137 */
138 if (c->emitted_tlb_load)
139 c->lock_scoreboard_on_first_thrsw = true;
140 }
141
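/*
 * Picks the TMU op for an atomic add: constant +1/-1 sources can use the
 * dedicated increment/decrement ops, everything else falls back to the
 * generic add op.
 */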
142 uint32_t
143 v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
144 {
145 if (nir_src_is_const(instr->src[src])) {
146 int64_t add_val = nir_src_as_int(instr->src[src]);
147 if (add_val == 1)
148 return V3D_TMU_OP_WRITE_AND_READ_INC;
149 else if (add_val == -1)
150 return V3D_TMU_OP_WRITE_OR_READ_DEC;
151 }
152
153 return V3D_TMU_OP_WRITE_ADD_READ_PREFETCH;
154 }
155
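/* Maps a NIR load/store/atomic intrinsic to the TMU op used to implement it. */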
156 static uint32_t
157 v3d_general_tmu_op(nir_intrinsic_instr *instr)
158 {
159 switch (instr->intrinsic) {
160 case nir_intrinsic_load_ssbo:
161 case nir_intrinsic_load_ubo:
162 case nir_intrinsic_load_uniform:
163 case nir_intrinsic_load_shared:
164 case nir_intrinsic_load_scratch:
165 case nir_intrinsic_store_ssbo:
166 case nir_intrinsic_store_shared:
167 case nir_intrinsic_store_scratch:
168 return V3D_TMU_OP_REGULAR;
169 case nir_intrinsic_ssbo_atomic_add:
170 return v3d_get_op_for_atomic_add(instr, 2);
171 case nir_intrinsic_shared_atomic_add:
172 return v3d_get_op_for_atomic_add(instr, 1);
173 case nir_intrinsic_ssbo_atomic_imin:
174 case nir_intrinsic_shared_atomic_imin:
175 return V3D_TMU_OP_WRITE_SMIN;
176 case nir_intrinsic_ssbo_atomic_umin:
177 case nir_intrinsic_shared_atomic_umin:
178 return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
179 case nir_intrinsic_ssbo_atomic_imax:
180 case nir_intrinsic_shared_atomic_imax:
181 return V3D_TMU_OP_WRITE_SMAX;
182 case nir_intrinsic_ssbo_atomic_umax:
183 case nir_intrinsic_shared_atomic_umax:
184 return V3D_TMU_OP_WRITE_UMAX;
185 case nir_intrinsic_ssbo_atomic_and:
186 case nir_intrinsic_shared_atomic_and:
187 return V3D_TMU_OP_WRITE_AND_READ_INC;
188 case nir_intrinsic_ssbo_atomic_or:
189 case nir_intrinsic_shared_atomic_or:
190 return V3D_TMU_OP_WRITE_OR_READ_DEC;
191 case nir_intrinsic_ssbo_atomic_xor:
192 case nir_intrinsic_shared_atomic_xor:
193 return V3D_TMU_OP_WRITE_XOR_READ_NOT;
194 case nir_intrinsic_ssbo_atomic_exchange:
195 case nir_intrinsic_shared_atomic_exchange:
196 return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
197 case nir_intrinsic_ssbo_atomic_comp_swap:
198 case nir_intrinsic_shared_atomic_comp_swap:
199 return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
200 default:
201 unreachable("unknown intrinsic op");
202 }
203 }
204
205 /**
206 * Implements indirect uniform loads and SSBO accesses through the TMU general
207 * memory access interface.
208 */
209 static void
210 ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
211 bool is_shared_or_scratch)
212 {
213 uint32_t tmu_op = v3d_general_tmu_op(instr);
214
215 /* If we were able to replace atomic_add with an inc/dec, then we
216 * need to do things slightly differently, like not loading the
217 * amount to add/sub, as that is implicit.
218 */
219 bool atomic_add_replaced =
220 ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add ||
221 instr->intrinsic == nir_intrinsic_shared_atomic_add) &&
222 (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
223 tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC));
224
225 bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
226 instr->intrinsic == nir_intrinsic_store_scratch ||
227 instr->intrinsic == nir_intrinsic_store_shared);
228
229 bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
230 instr->intrinsic == nir_intrinsic_load_ubo ||
231 instr->intrinsic == nir_intrinsic_load_ssbo ||
232 instr->intrinsic == nir_intrinsic_load_scratch ||
233 instr->intrinsic == nir_intrinsic_load_shared);
234
235 if (!is_load)
236 c->tmu_dirty_rcl = true;
237
238 bool has_index = !is_shared_or_scratch;
239
240 int offset_src;
241 if (instr->intrinsic == nir_intrinsic_load_uniform) {
242 offset_src = 0;
243 } else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
244 instr->intrinsic == nir_intrinsic_load_ubo ||
245 instr->intrinsic == nir_intrinsic_load_scratch ||
246 instr->intrinsic == nir_intrinsic_load_shared ||
247 atomic_add_replaced) {
248 offset_src = 0 + has_index;
249 } else if (is_store) {
250 offset_src = 1 + has_index;
251 } else {
252 offset_src = 0 + has_index;
253 }
254
255 bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);
256 uint32_t const_offset = 0;
257 if (!dynamic_src)
258 const_offset = nir_src_as_uint(instr->src[offset_src]);
259
260 struct qreg base_offset;
261 if (instr->intrinsic == nir_intrinsic_load_uniform) {
262 const_offset += nir_intrinsic_base(instr);
263 base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
264 v3d_unit_data_create(0, const_offset));
265 const_offset = 0;
266 } else if (instr->intrinsic == nir_intrinsic_load_ubo) {
267 uint32_t index = nir_src_as_uint(instr->src[0]);
268 /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index
269 * shifted up by 1 (0 is gallium's constant buffer 0).
270 */
271 if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
272 index++;
273
274 base_offset =
275 vir_uniform(c, QUNIFORM_UBO_ADDR,
276 v3d_unit_data_create(index, const_offset));
277 const_offset = 0;
278 } else if (is_shared_or_scratch) {
279 /* Shared and scratch variables have no buffer index, and all
280 * start from a common base that we set up at the start of
281 * dispatch.
282 */
283 if (instr->intrinsic == nir_intrinsic_load_scratch ||
284 instr->intrinsic == nir_intrinsic_store_scratch) {
285 base_offset = c->spill_base;
286 } else {
287 base_offset = c->cs_shared_offset;
288 const_offset += nir_intrinsic_base(instr);
289 }
290 } else {
291 base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
292 nir_src_as_uint(instr->src[is_store ?
293 1 : 0]));
294 }
295
296 struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
297 unsigned writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
298 uint32_t base_const_offset = const_offset;
299 int first_component = -1;
300 int last_component = -1;
301 do {
302 int tmu_writes = 1; /* address */
303
304 if (is_store) {
305 /* Find the first set of consecutive components that
306 * are enabled in the writemask and emit the TMUD
307 * instructions for them.
308 */
309 first_component = ffs(writemask) - 1;
310 last_component = first_component;
311 while (writemask & BITFIELD_BIT(last_component + 1))
312 last_component++;
313
314 assert(first_component >= 0 &&
315 first_component <= last_component &&
316 last_component < instr->num_components);
317
318 struct qreg tmud = vir_reg(QFILE_MAGIC,
319 V3D_QPU_WADDR_TMUD);
320 for (int i = first_component; i <= last_component; i++) {
321 struct qreg data =
322 ntq_get_src(c, instr->src[0], i);
323 vir_MOV_dest(c, tmud, data);
324 tmu_writes++;
325 }
326
327 /* Update the offset for the TMU write based on
328 * the first component we are writing.
329 */
330 const_offset = base_const_offset + first_component * 4;
331
332 /* Clear these components from the writemask */
333 uint32_t written_mask =
334 BITFIELD_RANGE(first_component, tmu_writes - 1);
335 writemask &= ~written_mask;
336 } else if (!is_load && !atomic_add_replaced) {
337 struct qreg data =
338 ntq_get_src(c, instr->src[1 + has_index], 0);
339 vir_MOV_dest(c, tmud, data);
340 tmu_writes++;
341 if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
342 data = ntq_get_src(c, instr->src[2 + has_index],
343 0);
344 vir_MOV_dest(c, tmud, data);
345 tmu_writes++;
346 }
347 }
348
349 /* Make sure we won't exceed the 16-entry TMU fifo if each
350 * thread is storing at the same time.
351 */
352 while (tmu_writes > 16 / c->threads)
353 c->threads /= 2;
354
355 /* The spec says that for atomics, the TYPE field is ignored,
356 * but that doesn't seem to be the case for CMPXCHG. Just use
357 * the number of tmud writes we did to decide the type (or
358 * choose "32bit" for atomic reads, which has been fine).
359 */
360 uint32_t num_components;
361 if (is_load || atomic_add_replaced) {
362 num_components = instr->num_components;
363 } else {
364 assert(tmu_writes > 1);
365 num_components = tmu_writes - 1;
366 }
367
368 uint32_t perquad = is_load
369 ? GENERAL_TMU_LOOKUP_PER_QUAD
370 : GENERAL_TMU_LOOKUP_PER_PIXEL;
371 uint32_t config = (0xffffff00 |
372 tmu_op << 3 |
373 perquad);
374 if (num_components == 1) {
375 config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
376 } else {
377 config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
378 num_components - 2;
379 }
380
381 if (vir_in_nonuniform_control_flow(c)) {
382 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
383 V3D_QPU_PF_PUSHZ);
384 }
385
386 struct qreg tmua;
387 if (config == ~0)
388 tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
389 else
390 tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
391
392 struct qinst *tmu;
393 if (dynamic_src) {
394 struct qreg offset = base_offset;
395 if (const_offset != 0) {
396 offset = vir_ADD(c, offset,
397 vir_uniform_ui(c, const_offset));
398 }
399 struct qreg data =
400 ntq_get_src(c, instr->src[offset_src], 0);
401 tmu = vir_ADD_dest(c, tmua, offset, data);
402 } else {
403 if (const_offset != 0) {
404 tmu = vir_ADD_dest(c, tmua, base_offset,
405 vir_uniform_ui(c, const_offset));
406 } else {
407 tmu = vir_MOV_dest(c, tmua, base_offset);
408 }
409 }
410
411 if (config != ~0) {
412 tmu->uniform =
413 vir_get_uniform_index(c, QUNIFORM_CONSTANT,
414 config);
415 }
416
417 if (vir_in_nonuniform_control_flow(c))
418 vir_set_cond(tmu, V3D_QPU_COND_IFA);
419
420 vir_emit_thrsw(c);
421
422 /* Read the result, or wait for the TMU op to complete. */
423 for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
424 ntq_store_dest(c, &instr->dest, i,
425 vir_MOV(c, vir_LDTMU(c)));
426 }
427
428 if (nir_intrinsic_dest_components(instr) == 0)
429 vir_TMUWT(c);
430 } while (is_store && writemask != 0);
431 }
432
433 static struct qreg *
434 ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)
435 {
436 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
437 def->num_components);
438 _mesa_hash_table_insert(c->def_ht, def, qregs);
439 return qregs;
440 }
441
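/* Returns true if the instruction carries one of the ld* signals, which load
 * a result as a side effect rather than through a regular ALU destination.
 */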
442 static bool
443 is_ld_signal(const struct v3d_qpu_sig *sig)
444 {
445 return (sig->ldunif ||
446 sig->ldunifa ||
447 sig->ldunifrf ||
448 sig->ldunifarf ||
449 sig->ldtmu ||
450 sig->ldvary ||
451 sig->ldvpm ||
452 sig->ldtlb ||
453 sig->ldtlbu);
454 }
455
456 /**
457 * This function is responsible for getting VIR results into the associated
458 * storage for a NIR instruction.
459 *
460 * If it's a NIR SSA def, then we just set the associated hash table entry to
461 * the new result.
462 *
463 * If it's a NIR reg, then we need to update the existing qreg assigned to the
464 * NIR destination with the incoming value. To do that without introducing
465 * new MOVs, we require that the incoming qreg either be a uniform, or be
466 * SSA-defined by the previous VIR instruction in the block and rewritable by
467 * this function. That lets us sneak ahead and insert the SF flag beforehand
468 * (knowing that the previous instruction doesn't depend on flags) and rewrite
469 * its destination to be the NIR reg's destination.
470 */
471 void
472 ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
473 struct qreg result)
474 {
475 struct qinst *last_inst = NULL;
476 if (!list_is_empty(&c->cur_block->instructions))
477 last_inst = (struct qinst *)c->cur_block->instructions.prev;
478
479 assert((result.file == QFILE_TEMP &&
480 last_inst && last_inst == c->defs[result.index]));
481
482 if (dest->is_ssa) {
483 assert(chan < dest->ssa.num_components);
484
485 struct qreg *qregs;
486 struct hash_entry *entry =
487 _mesa_hash_table_search(c->def_ht, &dest->ssa);
488
489 if (entry)
490 qregs = entry->data;
491 else
492 qregs = ntq_init_ssa_def(c, &dest->ssa);
493
494 qregs[chan] = result;
495 } else {
496 nir_register *reg = dest->reg.reg;
497 assert(dest->reg.base_offset == 0);
498 assert(reg->num_array_elems == 0);
499 struct hash_entry *entry =
500 _mesa_hash_table_search(c->def_ht, reg);
501 struct qreg *qregs = entry->data;
502
503 /* If the previous instruction can't be predicated for
504 * the store into the nir_register, then emit a MOV
505 * that can be.
506 */
507 if (vir_in_nonuniform_control_flow(c) &&
508 is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig)) {
509 result = vir_MOV(c, result);
510 last_inst = c->defs[result.index];
511 }
512
513 /* We know they're both temps, so just rewrite index. */
514 c->defs[last_inst->dst.index] = NULL;
515 last_inst->dst.index = qregs[chan].index;
516
517 /* If we're in control flow, then make this update of the reg
518 * conditional on the execution mask.
519 */
520 if (vir_in_nonuniform_control_flow(c)) {
521 last_inst->dst.index = qregs[chan].index;
522
523 /* Set the flags to the current exec mask.
524 */
525 c->cursor = vir_before_inst(last_inst);
526 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
527 V3D_QPU_PF_PUSHZ);
528 c->cursor = vir_after_inst(last_inst);
529
530 vir_set_cond(last_inst, V3D_QPU_COND_IFA);
531 }
532 }
533 }
534
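/* Returns the qreg holding component i of a NIR source, looked up in the
 * SSA-def / register hash table.
 */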
535 struct qreg
536 ntq_get_src(struct v3d_compile *c, nir_src src, int i)
537 {
538 struct hash_entry *entry;
539 if (src.is_ssa) {
540 entry = _mesa_hash_table_search(c->def_ht, src.ssa);
541 assert(i < src.ssa->num_components);
542 } else {
543 nir_register *reg = src.reg.reg;
544 entry = _mesa_hash_table_search(c->def_ht, reg);
545 assert(reg->num_array_elems == 0);
546 assert(src.reg.base_offset == 0);
547 assert(i < reg->num_components);
548 }
549
550 struct qreg *qregs = entry->data;
551 return qregs[i];
552 }
553
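/* Returns the qreg for an ALU source, following the swizzle of the single
 * channel selected by the (scalarized) writemask.
 */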
554 static struct qreg
555 ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
556 unsigned src)
557 {
558 assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
559 unsigned chan = ffs(instr->dest.write_mask) - 1;
560 struct qreg r = ntq_get_src(c, instr->src[src].src,
561 instr->src[src].swizzle[chan]);
562
563 assert(!instr->src[src].abs);
564 assert(!instr->src[src].negate);
565
566 return r;
567 };
568
569 static struct qreg
570 ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)
571 {
572 return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1));
573 }
574
575 static void
576 ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
577 {
578 unsigned unit = instr->texture_index;
579 int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod);
580 int dest_size = nir_tex_instr_dest_size(instr);
581
582 struct qreg lod = c->undef;
583 if (lod_index != -1)
584 lod = ntq_get_src(c, instr->src[lod_index].src, 0);
585
586 for (int i = 0; i < dest_size; i++) {
587 assert(i < 3);
588 enum quniform_contents contents;
589
590 if (instr->is_array && i == dest_size - 1)
591 contents = QUNIFORM_TEXTURE_ARRAY_SIZE;
592 else
593 contents = QUNIFORM_TEXTURE_WIDTH + i;
594
595 struct qreg size = vir_uniform(c, contents, unit);
596
597 switch (instr->sampler_dim) {
598 case GLSL_SAMPLER_DIM_1D:
599 case GLSL_SAMPLER_DIM_2D:
600 case GLSL_SAMPLER_DIM_MS:
601 case GLSL_SAMPLER_DIM_3D:
602 case GLSL_SAMPLER_DIM_CUBE:
603 case GLSL_SAMPLER_DIM_BUF:
604 /* Don't minify the array size. */
605 if (!(instr->is_array && i == dest_size - 1)) {
606 size = ntq_minify(c, size, lod);
607 }
608 break;
609
610 case GLSL_SAMPLER_DIM_RECT:
611 /* There's no LOD field for rects */
612 break;
613
614 default:
615 unreachable("Bad sampler type");
616 }
617
618 ntq_store_dest(c, &instr->dest, i, size);
619 }
620 }
621
622 static void
623 ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
624 {
625 unsigned unit = instr->texture_index;
626
627 /* Since each texture sampling op requires uploading uniforms to
628 * reference the texture, there's no HW support for texture size
629 * queries; you just upload uniforms containing the size.
630 */
631 switch (instr->op) {
632 case nir_texop_query_levels:
633 ntq_store_dest(c, &instr->dest, 0,
634 vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
635 return;
636 case nir_texop_texture_samples:
637 ntq_store_dest(c, &instr->dest, 0,
638 vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit));
639 return;
640 case nir_texop_txs:
641 ntq_emit_txs(c, instr);
642 return;
643 default:
644 break;
645 }
646
647 if (c->devinfo->ver >= 40)
648 v3d40_vir_emit_tex(c, instr);
649 else
650 v3d33_vir_emit_tex(c, instr);
651 }
652
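/* Implements sin/cos on top of the hardware SIN operation: the argument is
 * scaled by 1/pi and reduced to [-0.5, 0.5], and the result's sign is
 * flipped for odd periods.
 */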
653 static struct qreg
654 ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)
655 {
656 struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI));
657 if (is_cos)
658 input = vir_FADD(c, input, vir_uniform_f(c, 0.5));
659
660 struct qreg periods = vir_FROUND(c, input);
661 struct qreg sin_output = vir_SIN(c, vir_FSUB(c, input, periods));
662 return vir_XOR(c, sin_output, vir_SHL(c,
663 vir_FTOIN(c, periods),
664 vir_uniform_ui(c, -1)));
665 }
666
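/* Emits sign(x) with conditional moves: start from 0.0, write 1.0 if x is
 * non-zero, then overwrite with -1.0 if x is negative.
 */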
667 static struct qreg
668 ntq_fsign(struct v3d_compile *c, struct qreg src)
669 {
670 struct qreg t = vir_get_temp(c);
671
672 vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
673 vir_set_pf(vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHZ);
674 vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
675 vir_set_pf(vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHN);
676 vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
677 return vir_MOV(c, t);
678 }
679
680 static void
681 emit_fragcoord_input(struct v3d_compile *c, int attr)
682 {
683 c->inputs[attr * 4 + 0] = vir_FXCD(c);
684 c->inputs[attr * 4 + 1] = vir_FYCD(c);
685 c->inputs[attr * 4 + 2] = c->payload_z;
686 c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w);
687 }
688
689 static struct qreg
690 emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
691 int8_t input_idx, uint8_t swizzle, int array_index)
692 {
693 struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
694 struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
695
696 struct qreg vary;
697 if (c->devinfo->ver >= 41) {
698 struct qinst *ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
699 c->undef, c->undef);
700 ldvary->qpu.sig.ldvary = true;
701 vary = vir_emit_def(c, ldvary);
702 } else {
703 vir_NOP(c)->qpu.sig.ldvary = true;
704 vary = r3;
705 }
706
707 /* Store the input value before interpolation so we can implement
708 * GLSL's interpolateAt functions if the shader uses them.
709 */
710 if (input_idx >= 0) {
711 assert(var);
712 c->interp[input_idx].vp = vary;
713 c->interp[input_idx].C = vir_MOV(c, r5);
714 c->interp[input_idx].mode = var->data.interpolation;
715 }
716
717 /* For gl_PointCoord input or distance along a line, we'll be called
718 * with no nir_variable, and it doesn't count toward VPM size so we
719 * don't track an input slot.
720 */
721 if (!var) {
722 assert(input_idx < 0);
723 return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
724 }
725
726 int i = c->num_inputs++;
727 c->input_slots[i] =
728 v3d_slot_from_slot_and_component(var->data.location +
729 array_index, swizzle);
730
731 struct qreg result;
732 switch (var->data.interpolation) {
733 case INTERP_MODE_NONE:
734 /* If a gl_FrontColor or gl_BackColor input has no interp
735 * qualifier, then if we're using glShadeModel(GL_FLAT) it
736 * needs to be flat shaded.
737 */
738 switch (var->data.location + array_index) {
739 case VARYING_SLOT_COL0:
740 case VARYING_SLOT_COL1:
741 case VARYING_SLOT_BFC0:
742 case VARYING_SLOT_BFC1:
743 if (c->fs_key->shade_model_flat) {
744 BITSET_SET(c->flat_shade_flags, i);
745 vir_MOV_dest(c, c->undef, vary);
746 result = vir_MOV(c, r5);
747 } else {
748 result = vir_FADD(c, vir_FMUL(c, vary,
749 c->payload_w), r5);
750 }
751 goto done;
752 default:
753 break;
754 }
755 /* FALLTHROUGH */
756 case INTERP_MODE_SMOOTH:
757 if (var->data.centroid) {
758 BITSET_SET(c->centroid_flags, i);
759 result = vir_FADD(c, vir_FMUL(c, vary,
760 c->payload_w_centroid), r5);
761 } else {
762 result = vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
763 }
764 break;
765
766 case INTERP_MODE_NOPERSPECTIVE:
767 BITSET_SET(c->noperspective_flags, i);
768 result = vir_FADD(c, vir_MOV(c, vary), r5);
769 break;
770
771 case INTERP_MODE_FLAT:
772 BITSET_SET(c->flat_shade_flags, i);
773 vir_MOV_dest(c, c->undef, vary);
774 result = vir_MOV(c, r5);
775 break;
776
777 default:
778 unreachable("Bad interp mode");
779 }
780
781 done:
782 if (input_idx >= 0)
783 c->inputs[input_idx] = result;
784 return result;
785 }
786
787 static void
788 emit_fragment_input(struct v3d_compile *c, int base_attr, nir_variable *var,
789 int array_index, unsigned nelem)
790 {
791 for (int i = 0; i < nelem ; i++) {
792 int chan = var->data.location_frac + i;
793 int input_idx = (base_attr + array_index) * 4 + chan;
794 emit_fragment_varying(c, var, input_idx, chan, array_index);
795 }
796 }
797
798 static void
799 emit_compact_fragment_input(struct v3d_compile *c, int attr, nir_variable *var,
800 int array_index)
801 {
802 /* Compact variables are scalar arrays where each set of 4 elements
803 * consumes a single location.
804 */
805 int loc_offset = array_index / 4;
806 int chan = var->data.location_frac + array_index % 4;
807 int input_idx = (attr + loc_offset) * 4 + chan;
808 emit_fragment_varying(c, var, input_idx, chan, loc_offset);
809 }
810
811 static void
812 add_output(struct v3d_compile *c,
813 uint32_t decl_offset,
814 uint8_t slot,
815 uint8_t swizzle)
816 {
817 uint32_t old_array_size = c->outputs_array_size;
818 resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
819 decl_offset + 1);
820
821 if (old_array_size != c->outputs_array_size) {
822 c->output_slots = reralloc(c,
823 c->output_slots,
824 struct v3d_varying_slot,
825 c->outputs_array_size);
826 }
827
828 c->output_slots[decl_offset] =
829 v3d_slot_from_slot_and_component(slot, swizzle);
830 }
831
832 /**
833 * If compare_instr is a valid comparison instruction, emits the
834 * comparison, stores the condition code to predicate on in *out_cond, and
835 * returns true.  Returns false for comparisons it doesn't handle.
836 */
837 static bool
838 ntq_emit_comparison(struct v3d_compile *c,
839 nir_alu_instr *compare_instr,
840 enum v3d_qpu_cond *out_cond)
841 {
842 struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
843 struct qreg src1;
844 if (nir_op_infos[compare_instr->op].num_inputs > 1)
845 src1 = ntq_get_alu_src(c, compare_instr, 1);
846 bool cond_invert = false;
847 struct qreg nop = vir_nop_reg();
848
849 switch (compare_instr->op) {
850 case nir_op_feq32:
851 case nir_op_seq:
852 vir_set_pf(vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
853 break;
854 case nir_op_ieq32:
855 vir_set_pf(vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
856 break;
857
858 case nir_op_fneu32:
859 case nir_op_sne:
860 vir_set_pf(vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
861 cond_invert = true;
862 break;
863 case nir_op_ine32:
864 vir_set_pf(vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
865 cond_invert = true;
866 break;
867
868 case nir_op_fge32:
869 case nir_op_sge:
870 vir_set_pf(vir_FCMP_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
871 break;
872 case nir_op_ige32:
873 vir_set_pf(vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
874 cond_invert = true;
875 break;
876 case nir_op_uge32:
877 vir_set_pf(vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
878 cond_invert = true;
879 break;
880
881 case nir_op_slt:
882 case nir_op_flt32:
883 vir_set_pf(vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHN);
884 break;
885 case nir_op_ilt32:
886 vir_set_pf(vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
887 break;
888 case nir_op_ult32:
889 vir_set_pf(vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
890 break;
891
892 case nir_op_i2b32:
893 vir_set_pf(vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
894 cond_invert = true;
895 break;
896
897 case nir_op_f2b32:
898 vir_set_pf(vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
899 cond_invert = true;
900 break;
901
902 default:
903 return false;
904 }
905
906 *out_cond = cond_invert ? V3D_QPU_COND_IFNA : V3D_QPU_COND_IFA;
907
908 return true;
909 }
910
911 /* Finds the ALU instruction that generates our src value, so that it could
912 * (potentially) be greedily emitted in the consuming instruction.
913 */
914 static struct nir_alu_instr *
915 ntq_get_alu_parent(nir_src src)
916 {
917 if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu)
918 return NULL;
919 nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr);
920 if (!instr)
921 return NULL;
922
923 /* If the ALU instr's srcs are non-SSA, then we would have to avoid
924 * moving emission of the ALU instr down past another write of the
925 * src.
926 */
927 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
928 if (!instr->src[i].src.is_ssa)
929 return NULL;
930 }
931
932 return instr;
933 }
934
935 /* Turns a NIR bool into a condition code to predicate on. */
936 static enum v3d_qpu_cond
937 ntq_emit_bool_to_cond(struct v3d_compile *c, nir_src src)
938 {
939 nir_alu_instr *compare = ntq_get_alu_parent(src);
940 if (!compare)
941 goto out;
942
943 enum v3d_qpu_cond cond;
944 if (ntq_emit_comparison(c, compare, &cond))
945 return cond;
946
947 out:
948 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), ntq_get_src(c, src, 0)),
949 V3D_QPU_PF_PUSHZ);
950 return V3D_QPU_COND_IFNA;
951 }
952
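/* Translates one (scalarized) NIR ALU instruction into VIR. */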
953 static void
954 ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
955 {
956 /* This should always be lowered to ALU operations for V3D. */
957 assert(!instr->dest.saturate);
958
959 /* Vectors are special in that they have non-scalarized writemasks,
960 * and just take the first swizzle channel of each argument, in
961 * order, into each writemask channel.
962 */
963 if (instr->op == nir_op_vec2 ||
964 instr->op == nir_op_vec3 ||
965 instr->op == nir_op_vec4) {
966 struct qreg srcs[4];
967 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
968 srcs[i] = ntq_get_src(c, instr->src[i].src,
969 instr->src[i].swizzle[0]);
970 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
971 ntq_store_dest(c, &instr->dest.dest, i,
972 vir_MOV(c, srcs[i]));
973 return;
974 }
975
976 /* General case: We can just grab the one used channel per src. */
977 struct qreg src[nir_op_infos[instr->op].num_inputs];
978 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
979 src[i] = ntq_get_alu_src(c, instr, i);
980 }
981
982 struct qreg result;
983
984 switch (instr->op) {
985 case nir_op_mov:
986 result = vir_MOV(c, src[0]);
987 break;
988
989 case nir_op_fneg:
990 result = vir_XOR(c, src[0], vir_uniform_ui(c, 1 << 31));
991 break;
992 case nir_op_ineg:
993 result = vir_NEG(c, src[0]);
994 break;
995
996 case nir_op_fmul:
997 result = vir_FMUL(c, src[0], src[1]);
998 break;
999 case nir_op_fadd:
1000 result = vir_FADD(c, src[0], src[1]);
1001 break;
1002 case nir_op_fsub:
1003 result = vir_FSUB(c, src[0], src[1]);
1004 break;
1005 case nir_op_fmin:
1006 result = vir_FMIN(c, src[0], src[1]);
1007 break;
1008 case nir_op_fmax:
1009 result = vir_FMAX(c, src[0], src[1]);
1010 break;
1011
1012 case nir_op_f2i32: {
1013 nir_alu_instr *src0_alu = ntq_get_alu_parent(instr->src[0].src);
1014 if (src0_alu && src0_alu->op == nir_op_fround_even) {
1015 result = vir_FTOIN(c, ntq_get_alu_src(c, src0_alu, 0));
1016 } else {
1017 result = vir_FTOIZ(c, src[0]);
1018 }
1019 break;
1020 }
1021
1022 case nir_op_f2u32:
1023 result = vir_FTOUZ(c, src[0]);
1024 break;
1025 case nir_op_i2f32:
1026 result = vir_ITOF(c, src[0]);
1027 break;
1028 case nir_op_u2f32:
1029 result = vir_UTOF(c, src[0]);
1030 break;
1031 case nir_op_b2f32:
1032 result = vir_AND(c, src[0], vir_uniform_f(c, 1.0));
1033 break;
1034 case nir_op_b2i32:
1035 result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
1036 break;
1037
1038 case nir_op_iadd:
1039 result = vir_ADD(c, src[0], src[1]);
1040 break;
1041 case nir_op_ushr:
1042 result = vir_SHR(c, src[0], src[1]);
1043 break;
1044 case nir_op_isub:
1045 result = vir_SUB(c, src[0], src[1]);
1046 break;
1047 case nir_op_ishr:
1048 result = vir_ASR(c, src[0], src[1]);
1049 break;
1050 case nir_op_ishl:
1051 result = vir_SHL(c, src[0], src[1]);
1052 break;
1053 case nir_op_imin:
1054 result = vir_MIN(c, src[0], src[1]);
1055 break;
1056 case nir_op_umin:
1057 result = vir_UMIN(c, src[0], src[1]);
1058 break;
1059 case nir_op_imax:
1060 result = vir_MAX(c, src[0], src[1]);
1061 break;
1062 case nir_op_umax:
1063 result = vir_UMAX(c, src[0], src[1]);
1064 break;
1065 case nir_op_iand:
1066 result = vir_AND(c, src[0], src[1]);
1067 break;
1068 case nir_op_ior:
1069 result = vir_OR(c, src[0], src[1]);
1070 break;
1071 case nir_op_ixor:
1072 result = vir_XOR(c, src[0], src[1]);
1073 break;
1074 case nir_op_inot:
1075 result = vir_NOT(c, src[0]);
1076 break;
1077
1078 case nir_op_ufind_msb:
1079 result = vir_SUB(c, vir_uniform_ui(c, 31), vir_CLZ(c, src[0]));
1080 break;
1081
1082 case nir_op_imul:
1083 result = vir_UMUL(c, src[0], src[1]);
1084 break;
1085
1086 case nir_op_seq:
1087 case nir_op_sne:
1088 case nir_op_sge:
1089 case nir_op_slt: {
1090 enum v3d_qpu_cond cond;
1091 ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
1092 assert(ok);
1093 result = vir_MOV(c, vir_SEL(c, cond,
1094 vir_uniform_f(c, 1.0),
1095 vir_uniform_f(c, 0.0)));
1096 break;
1097 }
1098
1099 case nir_op_i2b32:
1100 case nir_op_f2b32:
1101 case nir_op_feq32:
1102 case nir_op_fneu32:
1103 case nir_op_fge32:
1104 case nir_op_flt32:
1105 case nir_op_ieq32:
1106 case nir_op_ine32:
1107 case nir_op_ige32:
1108 case nir_op_uge32:
1109 case nir_op_ilt32:
1110 case nir_op_ult32: {
1111 enum v3d_qpu_cond cond;
1112 ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
1113 assert(ok);
1114 result = vir_MOV(c, vir_SEL(c, cond,
1115 vir_uniform_ui(c, ~0),
1116 vir_uniform_ui(c, 0)));
1117 break;
1118 }
1119
1120 case nir_op_b32csel:
1121 result = vir_MOV(c,
1122 vir_SEL(c,
1123 ntq_emit_bool_to_cond(c, instr->src[0].src),
1124 src[1], src[2]));
1125 break;
1126
1127 case nir_op_fcsel:
1128 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), src[0]),
1129 V3D_QPU_PF_PUSHZ);
1130 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
1131 src[1], src[2]));
1132 break;
1133
1134 case nir_op_frcp:
1135 result = vir_RECIP(c, src[0]);
1136 break;
1137 case nir_op_frsq:
1138 result = vir_RSQRT(c, src[0]);
1139 break;
1140 case nir_op_fexp2:
1141 result = vir_EXP(c, src[0]);
1142 break;
1143 case nir_op_flog2:
1144 result = vir_LOG(c, src[0]);
1145 break;
1146
1147 case nir_op_fceil:
1148 result = vir_FCEIL(c, src[0]);
1149 break;
1150 case nir_op_ffloor:
1151 result = vir_FFLOOR(c, src[0]);
1152 break;
1153 case nir_op_fround_even:
1154 result = vir_FROUND(c, src[0]);
1155 break;
1156 case nir_op_ftrunc:
1157 result = vir_FTRUNC(c, src[0]);
1158 break;
1159
1160 case nir_op_fsin:
1161 result = ntq_fsincos(c, src[0], false);
1162 break;
1163 case nir_op_fcos:
1164 result = ntq_fsincos(c, src[0], true);
1165 break;
1166
1167 case nir_op_fsign:
1168 result = ntq_fsign(c, src[0]);
1169 break;
1170
1171 case nir_op_fabs: {
1172 result = vir_FMOV(c, src[0]);
1173 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS);
1174 break;
1175 }
1176
1177 case nir_op_iabs:
1178 result = vir_MAX(c, src[0], vir_NEG(c, src[0]));
1179 break;
1180
1181 case nir_op_fddx:
1182 case nir_op_fddx_coarse:
1183 case nir_op_fddx_fine:
1184 result = vir_FDX(c, src[0]);
1185 break;
1186
1187 case nir_op_fddy:
1188 case nir_op_fddy_coarse:
1189 case nir_op_fddy_fine:
1190 result = vir_FDY(c, src[0]);
1191 break;
1192
1193 case nir_op_uadd_carry:
1194 vir_set_pf(vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]),
1195 V3D_QPU_PF_PUSHC);
1196 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
1197 vir_uniform_ui(c, ~0),
1198 vir_uniform_ui(c, 0)));
1199 break;
1200
1201 case nir_op_pack_half_2x16_split:
1202 result = vir_VFPACK(c, src[0], src[1]);
1203 break;
1204
1205 case nir_op_unpack_half_2x16_split_x:
1206 result = vir_FMOV(c, src[0]);
1207 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1208 break;
1209
1210 case nir_op_unpack_half_2x16_split_y:
1211 result = vir_FMOV(c, src[0]);
1212 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H);
1213 break;
1214
1215 case nir_op_fquantize2f16: {
1216 /* F32 -> F16 -> F32 conversion */
1217 struct qreg tmp = vir_FMOV(c, src[0]);
1218 vir_set_pack(c->defs[tmp.index], V3D_QPU_PACK_L);
1219 tmp = vir_FMOV(c, tmp);
1220 vir_set_unpack(c->defs[tmp.index], 0, V3D_QPU_UNPACK_L);
1221
1222 /* Check for denorm */
1223 struct qreg abs_src = vir_FMOV(c, src[0]);
1224 vir_set_unpack(c->defs[abs_src.index], 0, V3D_QPU_UNPACK_ABS);
1225 struct qreg threshold = vir_uniform_f(c, ldexpf(1.0, -14));
1226 vir_set_pf(vir_FCMP_dest(c, vir_nop_reg(), abs_src, threshold),
1227 V3D_QPU_PF_PUSHC);
1228
1229 /* Return +/-0 for denorms */
1230 struct qreg zero =
1231 vir_AND(c, src[0], vir_uniform_ui(c, 0x80000000));
1232 result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero));
1233 break;
1234 }
1235
1236 default:
1237 fprintf(stderr, "unknown NIR ALU inst: ");
1238 nir_print_instr(&instr->instr, stderr);
1239 fprintf(stderr, "\n");
1240 abort();
1241 }
1242
1243 /* We have a scalar result, so the instruction should only have a
1244 * single channel written to.
1245 */
1246 assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
1247 ntq_store_dest(c, &instr->dest.dest,
1248 ffs(instr->dest.write_mask) - 1, result);
1249 }
1250
1251 /* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
1252 * specifier. They come from a register that's preloaded with 0xffffffff
1253 * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low
1254 * 8 bits are shifted off the bottom and 0xff shifted in from the top.
1255 */
1256 #define TLB_TYPE_F16_COLOR (3 << 6)
1257 #define TLB_TYPE_I32_COLOR (1 << 6)
1258 #define TLB_TYPE_F32_COLOR (0 << 6)
1259 #define TLB_RENDER_TARGET_SHIFT 3 /* Reversed! 7 = RT 0, 0 = RT 7. */
1260 #define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2)
1261 #define TLB_SAMPLE_MODE_PER_PIXEL (1 << 2)
1262 #define TLB_F16_SWAP_HI_LO (1 << 1)
1263 #define TLB_VEC_SIZE_4_F16 (1 << 0)
1264 #define TLB_VEC_SIZE_2_F16 (0 << 0)
1265 #define TLB_VEC_SIZE_MINUS_1_SHIFT 0
1266
1267 /* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z"
1268 * flag is set.
1269 */
1270 #define TLB_TYPE_DEPTH ((2 << 6) | (0 << 4))
1271 #define TLB_DEPTH_TYPE_INVARIANT (0 << 2) /* Unmodified sideband input used */
1272 #define TLB_DEPTH_TYPE_PER_PIXEL (1 << 2) /* QPU result used */
1273 #define TLB_V42_DEPTH_TYPE_INVARIANT (0 << 3) /* Unmodified sideband input used */
1274 #define TLB_V42_DEPTH_TYPE_PER_PIXEL (1 << 3) /* QPU result used */
1275
1276 /* Stencil is a single 32-bit write. */
1277 #define TLB_TYPE_STENCIL_ALPHA ((2 << 6) | (1 << 4))
1278
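/* Emits the TLB color write(s) for one render target, setting up the TLB
 * configuration uniform on the first write and packing f16 outputs in pairs.
 */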
1279 static void
1280 vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)
1281 {
1282 if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt])
1283 return;
1284
1285 struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB);
1286 struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
1287
1288 nir_variable *var = c->output_color_var[rt];
1289 int num_components = glsl_get_vector_elements(var->type);
1290 uint32_t conf = 0xffffff00;
1291 struct qinst *inst;
1292
1293 conf |= c->msaa_per_sample_output ? TLB_SAMPLE_MODE_PER_SAMPLE :
1294 TLB_SAMPLE_MODE_PER_PIXEL;
1295 conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
1296
1297 if (c->fs_key->swap_color_rb & (1 << rt))
1298 num_components = MAX2(num_components, 3);
1299 assert(num_components != 0);
1300
1301 enum glsl_base_type type = glsl_get_base_type(var->type);
1302 bool is_int_format = type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT;
1303 bool is_32b_tlb_format = is_int_format ||
1304 (c->fs_key->f32_color_rb & (1 << rt));
1305
1306 if (is_int_format) {
1307 /* The F32 vs I32 distinction was dropped in 4.2. */
1308 if (c->devinfo->ver < 42)
1309 conf |= TLB_TYPE_I32_COLOR;
1310 else
1311 conf |= TLB_TYPE_F32_COLOR;
1312 conf |= ((num_components - 1) << TLB_VEC_SIZE_MINUS_1_SHIFT);
1313 } else {
1314 if (c->fs_key->f32_color_rb & (1 << rt)) {
1315 conf |= TLB_TYPE_F32_COLOR;
1316 conf |= ((num_components - 1) <<
1317 TLB_VEC_SIZE_MINUS_1_SHIFT);
1318 } else {
1319 conf |= TLB_TYPE_F16_COLOR;
1320 conf |= TLB_F16_SWAP_HI_LO;
1321 if (num_components >= 3)
1322 conf |= TLB_VEC_SIZE_4_F16;
1323 else
1324 conf |= TLB_VEC_SIZE_2_F16;
1325 }
1326 }
1327
1328 int num_samples = c->msaa_per_sample_output ? V3D_MAX_SAMPLES : 1;
1329 for (int i = 0; i < num_samples; i++) {
1330 struct qreg *color = c->msaa_per_sample_output ?
1331 &c->sample_colors[(rt * V3D_MAX_SAMPLES + i) * 4] :
1332 &c->outputs[var->data.driver_location * 4];
1333
1334 struct qreg r = color[0];
1335 struct qreg g = color[1];
1336 struct qreg b = color[2];
1337 struct qreg a = color[3];
1338
1339 if (c->fs_key->swap_color_rb & (1 << rt)) {
1340 r = color[2];
1341 b = color[0];
1342 }
1343
1344 if (c->fs_key->sample_alpha_to_one)
1345 a = vir_uniform_f(c, 1.0);
1346
1347 if (is_32b_tlb_format) {
1348 if (i == 0) {
1349 inst = vir_MOV_dest(c, tlbu_reg, r);
1350 inst->uniform =
1351 vir_get_uniform_index(c,
1352 QUNIFORM_CONSTANT,
1353 conf);
1354 } else {
1355 inst = vir_MOV_dest(c, tlb_reg, r);
1356 }
1357
1358 if (num_components >= 2)
1359 vir_MOV_dest(c, tlb_reg, g);
1360 if (num_components >= 3)
1361 vir_MOV_dest(c, tlb_reg, b);
1362 if (num_components >= 4)
1363 vir_MOV_dest(c, tlb_reg, a);
1364 } else {
1365 inst = vir_VFPACK_dest(c, tlb_reg, r, g);
1366 if (conf != ~0 && i == 0) {
1367 inst->dst = tlbu_reg;
1368 inst->uniform =
1369 vir_get_uniform_index(c,
1370 QUNIFORM_CONSTANT,
1371 conf);
1372 }
1373
1374 if (num_components >= 3)
1375 inst = vir_VFPACK_dest(c, tlb_reg, b, a);
1376 }
1377 }
1378 }
1379
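/* Emits the fragment shader epilogue: sample mask and alpha-to-coverage
 * updates, the Z TLB write (explicit or passthrough), and the color TLB
 * writes.
 */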
1380 static void
1381 emit_frag_end(struct v3d_compile *c)
1382 {
1383 if (c->output_sample_mask_index != -1) {
1384 vir_SETMSF_dest(c, vir_nop_reg(),
1385 vir_AND(c,
1386 vir_MSF(c),
1387 c->outputs[c->output_sample_mask_index]));
1388 }
1389
1390 bool has_any_tlb_color_write = false;
1391 for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) {
1392 if (c->fs_key->cbufs & (1 << rt) && c->output_color_var[rt])
1393 has_any_tlb_color_write = true;
1394 }
1395
1396 if (c->fs_key->sample_alpha_to_coverage && c->output_color_var[0]) {
1397 struct nir_variable *var = c->output_color_var[0];
1398 struct qreg *color = &c->outputs[var->data.driver_location * 4];
1399
1400 vir_SETMSF_dest(c, vir_nop_reg(),
1401 vir_AND(c,
1402 vir_MSF(c),
1403 vir_FTOC(c, color[3])));
1404 }
1405
1406 struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
1407 if (c->output_position_index != -1) {
1408 struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
1409 c->outputs[c->output_position_index]);
1410 uint8_t tlb_specifier = TLB_TYPE_DEPTH;
1411
1412 if (c->devinfo->ver >= 42) {
1413 tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL |
1414 TLB_SAMPLE_MODE_PER_PIXEL);
1415 } else
1416 tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL;
1417
1418 inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
1419 tlb_specifier |
1420 0xffffff00);
1421 c->writes_z = true;
1422 } else if (c->s->info.fs.uses_discard ||
1423 !c->s->info.fs.early_fragment_tests ||
1424 c->fs_key->sample_alpha_to_coverage ||
1425 !has_any_tlb_color_write) {
1426 /* Emit passthrough Z if it needed to be delayed until shader
1427 * end due to potential discards.
1428 *
1429 * Since (single-threaded) fragment shaders always need a TLB
1430 * write, emit a passthrough Z write if we didn't have any color
1431 * buffers, and flag ourselves as potentially discarding, so that we
1432 * can use Z as the TLB write.
1433 */
1434 c->s->info.fs.uses_discard = true;
1435
1436 struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
1437 vir_nop_reg());
1438 uint8_t tlb_specifier = TLB_TYPE_DEPTH;
1439
1440 if (c->devinfo->ver >= 42) {
1441 /* The spec says the PER_PIXEL flag is ignored for
1442 * invariant writes, but the simulator demands it.
1443 */
1444 tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT |
1445 TLB_SAMPLE_MODE_PER_PIXEL);
1446 } else {
1447 tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT;
1448 }
1449
1450 inst->uniform = vir_get_uniform_index(c,
1451 QUNIFORM_CONSTANT,
1452 tlb_specifier |
1453 0xffffff00);
1454 c->writes_z = true;
1455 }
1456
1457 /* XXX: Performance improvement: Merge Z write and color writes TLB
1458 * uniform setup
1459 */
1460 for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++)
1461 vir_emit_tlb_color_write(c, rt);
1462 }
1463
1464 static inline void
1465 vir_VPM_WRITE_indirect(struct v3d_compile *c,
1466 struct qreg val,
1467 struct qreg vpm_index)
1468 {
1469 assert(c->devinfo->ver >= 40);
1470 vir_STVPMV(c, vpm_index, val);
1471 }
1472
1473 static void
1474 vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
1475 {
1476 if (c->devinfo->ver >= 40) {
1477 vir_VPM_WRITE_indirect(c, val, vir_uniform_ui(c, vpm_index));
1478 } else {
1479 /* XXX: v3d33_vir_vpm_write_setup(c); */
1480 vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
1481 }
1482 }
1483
1484 static void
1485 emit_vert_end(struct v3d_compile *c)
1486 {
1487 /* GFXH-1684: VPM writes need to be complete by the end of the shader.
1488 */
1489 if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
1490 vir_VPMWT(c);
1491 }
1492
1493 static void
1494 emit_geom_end(struct v3d_compile *c)
1495 {
1496 /* GFXH-1684: VPM writes need to be complete by the end of the shader.
1497 */
1498 if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
1499 vir_VPMWT(c);
1500 }
1501
1502 void
1503 v3d_optimize_nir(struct nir_shader *s)
1504 {
1505 bool progress;
1506 unsigned lower_flrp =
1507 (s->options->lower_flrp16 ? 16 : 0) |
1508 (s->options->lower_flrp32 ? 32 : 0) |
1509 (s->options->lower_flrp64 ? 64 : 0);
1510
1511 do {
1512 progress = false;
1513
1514 NIR_PASS_V(s, nir_lower_vars_to_ssa);
1515 NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
1516 NIR_PASS(progress, s, nir_lower_phis_to_scalar);
1517 NIR_PASS(progress, s, nir_copy_prop);
1518 NIR_PASS(progress, s, nir_opt_remove_phis);
1519 NIR_PASS(progress, s, nir_opt_dce);
1520 NIR_PASS(progress, s, nir_opt_dead_cf);
1521 NIR_PASS(progress, s, nir_opt_cse);
1522 NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
1523 NIR_PASS(progress, s, nir_opt_algebraic);
1524 NIR_PASS(progress, s, nir_opt_constant_folding);
1525
1526 if (lower_flrp != 0) {
1527 bool lower_flrp_progress = false;
1528
1529 NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,
1530 lower_flrp,
1531 false /* always_precise */);
1532 if (lower_flrp_progress) {
1533 NIR_PASS(progress, s, nir_opt_constant_folding);
1534 progress = true;
1535 }
1536
1537 /* Nothing should rematerialize any flrps, so we only
1538 * need to do this lowering once.
1539 */
1540 lower_flrp = 0;
1541 }
1542
1543 NIR_PASS(progress, s, nir_opt_undef);
1544 } while (progress);
1545
1546 NIR_PASS(progress, s, nir_opt_move, nir_move_load_ubo);
1547 }
1548
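/* qsort comparator that orders input variables by driver_location and then
 * by component.
 */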
1549 static int
1550 driver_location_compare(const void *in_a, const void *in_b)
1551 {
1552 const nir_variable *const *a = in_a;
1553 const nir_variable *const *b = in_b;
1554
1555 if ((*a)->data.driver_location == (*b)->data.driver_location)
1556 return (*a)->data.location_frac - (*b)->data.location_frac;
1557
1558 return (*a)->data.driver_location - (*b)->data.driver_location;
1559 }
1560
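/* Reads the next input component from the VPM.  On V3D 3.3 this batches the
 * read setup for up to 32 components at a time; on V3D 4.x it emits an
 * LDVPMV with an incrementing index.
 */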
1561 static struct qreg
1562 ntq_emit_vpm_read(struct v3d_compile *c,
1563 uint32_t *num_components_queued,
1564 uint32_t *remaining,
1565 uint32_t vpm_index)
1566 {
1567 struct qreg vpm = vir_reg(QFILE_VPM, vpm_index);
1568
1569 if (c->devinfo->ver >= 40) {
1570 return vir_LDVPMV_IN(c,
1571 vir_uniform_ui(c,
1572 (*num_components_queued)++));
1573 }
1574
1575 if (*num_components_queued != 0) {
1576 (*num_components_queued)--;
1577 return vir_MOV(c, vpm);
1578 }
1579
1580 uint32_t num_components = MIN2(*remaining, 32);
1581
1582 v3d33_vir_vpm_read_setup(c, num_components);
1583
1584 *num_components_queued = num_components - 1;
1585 *remaining -= num_components;
1586
1587 return vir_MOV(c, vpm);
1588 }
1589
1590 static void
1591 ntq_setup_vs_inputs(struct v3d_compile *c)
1592 {
1593 /* Figure out how many components of each vertex attribute the shader
1594 * uses. Each variable should have been split to individual
1595 * components and unused ones DCEed. The vertex fetcher will load
1596 * from the start of the attribute to the number of components we
1597 * declare we need in c->vattr_sizes[].
1598 *
1599 * BGRA vertex attributes are a bit special: since we implement these
1600 * as RGBA swapping R/B components we always need at least 3 components
1601 * if component 0 is read.
1602 */
1603 nir_foreach_shader_in_variable(var, c->s) {
1604 /* No VS attribute array support. */
1605 assert(MAX2(glsl_get_length(var->type), 1) == 1);
1606
1607 unsigned loc = var->data.driver_location;
1608 int start_component = var->data.location_frac;
1609 int num_components = glsl_get_components(var->type);
1610
1611 c->vattr_sizes[loc] = MAX2(c->vattr_sizes[loc],
1612 start_component + num_components);
1613
1614 /* Handle BGRA inputs */
1615 if (start_component == 0 &&
1616 c->vs_key->va_swap_rb_mask & (1 << var->data.location)) {
1617 c->vattr_sizes[loc] = MAX2(3, c->vattr_sizes[loc]);
1618 }
1619 }
1620
1621 unsigned num_components = 0;
1622 uint32_t vpm_components_queued = 0;
1623 bool uses_iid = BITSET_TEST(c->s->info.system_values_read,
1624 SYSTEM_VALUE_INSTANCE_ID) ||
1625 BITSET_TEST(c->s->info.system_values_read,
1626 SYSTEM_VALUE_INSTANCE_INDEX);
1627 bool uses_biid = BITSET_TEST(c->s->info.system_values_read,
1628 SYSTEM_VALUE_BASE_INSTANCE);
1629 bool uses_vid = BITSET_TEST(c->s->info.system_values_read,
1630 SYSTEM_VALUE_VERTEX_ID) ||
1631 BITSET_TEST(c->s->info.system_values_read,
1632 SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
1633
1634 num_components += uses_iid;
1635 num_components += uses_biid;
1636 num_components += uses_vid;
1637
1638 for (int i = 0; i < ARRAY_SIZE(c->vattr_sizes); i++)
1639 num_components += c->vattr_sizes[i];
1640
1641 if (uses_iid) {
1642 c->iid = ntq_emit_vpm_read(c, &vpm_components_queued,
1643 &num_components, ~0);
1644 }
1645
1646 if (uses_biid) {
1647 c->biid = ntq_emit_vpm_read(c, &vpm_components_queued,
1648 &num_components, ~0);
1649 }
1650
1651 if (uses_vid) {
1652 c->vid = ntq_emit_vpm_read(c, &vpm_components_queued,
1653 &num_components, ~0);
1654 }
1655
1656 /* The actual loads will happen directly in nir_intrinsic_load_input
1657 * on newer versions.
1658 */
1659 if (c->devinfo->ver >= 40)
1660 return;
1661
1662 for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) {
1663 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
1664 (loc + 1) * 4);
1665
1666 for (int i = 0; i < c->vattr_sizes[loc]; i++) {
1667 c->inputs[loc * 4 + i] =
1668 ntq_emit_vpm_read(c,
1669 &vpm_components_queued,
1670 &num_components,
1671 loc * 4 + i);
1672
1673 }
1674 }
1675
1676 if (c->devinfo->ver >= 40) {
1677 assert(vpm_components_queued == num_components);
1678 } else {
1679 assert(vpm_components_queued == 0);
1680 assert(num_components == 0);
1681 }
1682 }
1683
1684 static bool
1685 program_reads_point_coord(struct v3d_compile *c)
1686 {
1687 nir_foreach_shader_in_variable(var, c->s) {
1688 if (util_varying_is_point_coord(var->data.location,
1689 c->fs_key->point_sprite_mask)) {
1690 return true;
1691 }
1692 }
1693
1694 return false;
1695 }
1696
1697 static void
1698 get_sorted_input_variables(struct v3d_compile *c,
1699 unsigned *num_entries,
1700 nir_variable ***vars)
1701 {
1702 *num_entries = 0;
1703 nir_foreach_shader_in_variable(var, c->s)
1704 (*num_entries)++;
1705
1706 *vars = ralloc_array(c, nir_variable *, *num_entries);
1707
1708 unsigned i = 0;
1709 nir_foreach_shader_in_variable(var, c->s)
1710 (*vars)[i++] = var;
1711
1712 /* Sort the variables so that we emit the input setup in
1713 * driver_location order. This is required for VPM reads, whose data
1714 * is fetched into the VPM in driver_location (TGSI register index)
1715 * order.
1716 */
1717 qsort(*vars, *num_entries, sizeof(**vars), driver_location_compare);
1718 }
1719
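/* Records the varying slots consumed by the geometry shader's per-vertex
 * inputs; no loads are emitted here.
 */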
1720 static void
1721 ntq_setup_gs_inputs(struct v3d_compile *c)
1722 {
1723 nir_variable **vars;
1724 unsigned num_entries;
1725 get_sorted_input_variables(c, &num_entries, &vars);
1726
1727 for (unsigned i = 0; i < num_entries; i++) {
1728 nir_variable *var = vars[i];
1729
1730 /* All GS inputs are arrays with as many entries as vertices
1731 * in the input primitive, but here we only care about the
1732 * per-vertex input type.
1733 */
1734 const struct glsl_type *type = glsl_without_array(var->type);
1735 unsigned array_len = MAX2(glsl_get_length(type), 1);
1736 unsigned loc = var->data.driver_location;
1737
1738 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
1739 (loc + array_len) * 4);
1740
1741 for (unsigned j = 0; j < array_len; j++) {
1742 unsigned num_elements = glsl_get_vector_elements(type);
1743 for (unsigned k = 0; k < num_elements; k++) {
1744 unsigned chan = var->data.location_frac + k;
1745 unsigned input_idx = c->num_inputs++;
1746 struct v3d_varying_slot slot =
1747 v3d_slot_from_slot_and_component(var->data.location + j, chan);
1748 c->input_slots[input_idx] = slot;
1749 }
1750 }
1751 }
1752 }
1753
1754
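/* Sets up fragment shader inputs in driver_location order, emitting the
 * varying (ldvary) sequences per component, with special handling for
 * gl_FragCoord, point coordinates and compact arrays.
 */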
1755 static void
1756 ntq_setup_fs_inputs(struct v3d_compile *c)
1757 {
1758 nir_variable **vars;
1759 unsigned num_entries;
1760 get_sorted_input_variables(c, &num_entries, &vars);
1761
1762 for (unsigned i = 0; i < num_entries; i++) {
1763 nir_variable *var = vars[i];
1764 unsigned var_len = glsl_count_vec4_slots(var->type, false, false);
1765 unsigned loc = var->data.driver_location;
1766
1767 uint32_t inputs_array_size = c->inputs_array_size;
1768 uint32_t inputs_array_required_size = (loc + var_len) * 4;
1769 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
1770 inputs_array_required_size);
1771 resize_interp_array(c, &c->interp, &inputs_array_size,
1772 inputs_array_required_size);
1773
1774 if (var->data.location == VARYING_SLOT_POS) {
1775 emit_fragcoord_input(c, loc);
1776 } else if (util_varying_is_point_coord(var->data.location,
1777 c->fs_key->point_sprite_mask)) {
1778 c->inputs[loc * 4 + 0] = c->point_x;
1779 c->inputs[loc * 4 + 1] = c->point_y;
1780 } else if (var->data.compact) {
1781 for (int j = 0; j < var_len; j++)
1782 emit_compact_fragment_input(c, loc, var, j);
1783 } else if (glsl_type_is_struct(var->type)) {
1784 for (int j = 0; j < var_len; j++) {
1785 emit_fragment_input(c, loc, var, j, 4);
1786 }
1787 } else {
1788 for (int j = 0; j < var_len; j++) {
1789 emit_fragment_input(c, loc, var, j, glsl_get_vector_elements(var->type));
1790 }
1791 }
1792 }
1793 }
1794
1795 static void
1796 ntq_setup_outputs(struct v3d_compile *c)
1797 {
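/* Only fragment shaders need the output mapping below: their color, depth
 * and sample mask outputs end up as TLB writes at the end of the program,
 * while VS/GS outputs are written straight to the VPM in
 * ntq_emit_store_output().
 */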
1798 if (c->s->info.stage != MESA_SHADER_FRAGMENT)
1799 return;
1800
1801 nir_foreach_shader_out_variable(var, c->s) {
1802 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
1803 unsigned loc = var->data.driver_location * 4;
1804
1805 assert(array_len == 1);
1806 (void)array_len;
1807
1808 for (int i = 0; i < 4 - var->data.location_frac; i++) {
1809 add_output(c, loc + var->data.location_frac + i,
1810 var->data.location,
1811 var->data.location_frac + i);
1812 }
1813
1814 switch (var->data.location) {
1815 case FRAG_RESULT_COLOR:
1816 c->output_color_var[0] = var;
1817 c->output_color_var[1] = var;
1818 c->output_color_var[2] = var;
1819 c->output_color_var[3] = var;
1820 break;
1821 case FRAG_RESULT_DATA0:
1822 case FRAG_RESULT_DATA1:
1823 case FRAG_RESULT_DATA2:
1824 case FRAG_RESULT_DATA3:
1825 c->output_color_var[var->data.location -
1826 FRAG_RESULT_DATA0] = var;
1827 break;
1828 case FRAG_RESULT_DEPTH:
1829 c->output_position_index = loc;
1830 break;
1831 case FRAG_RESULT_SAMPLE_MASK:
1832 c->output_sample_mask_index = loc;
1833 break;
1834 }
1835 }
1836 }
1837
1838 /**
1839 * Sets up the mapping from nir_register to struct qreg *.
1840 *
1841 * Each nir_register gets a struct qreg per 32-bit component being stored.
1842 */
1843 static void
1844 ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)
1845 {
1846 foreach_list_typed(nir_register, nir_reg, node, list) {
1847 unsigned array_len = MAX2(nir_reg->num_array_elems, 1);
1848 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
1849 array_len *
1850 nir_reg->num_components);
1851
1852 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
1853
1854 for (int i = 0; i < array_len * nir_reg->num_components; i++)
1855 qregs[i] = vir_get_temp(c);
1856 }
1857 }
1858
1859 static void
1860 ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
1861 {
1862 /* XXX perf: Experiment with using immediate loads to avoid having
1863 * these end up in the uniform stream. Watch out for breaking the
1864 * small immediates optimization in the process!
1865 */
1866 struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1867 for (int i = 0; i < instr->def.num_components; i++)
1868 qregs[i] = vir_uniform_ui(c, instr->value[i].u32);
1869
1870 _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
1871 }
1872
1873 static void
1874 ntq_emit_ssa_undef(struct v3d_compile *c, nir_ssa_undef_instr *instr)
1875 {
1876 struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1877
1878 /* VIR needs there to be *some* value, so pick 0 (same as for
1879 * ntq_setup_registers()).
1880 */
1881 for (int i = 0; i < instr->def.num_components; i++)
1882 qregs[i] = vir_uniform_ui(c, 0);
1883 }
1884
1885 static void
1886 ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)
1887 {
1888 unsigned image_index = nir_src_as_uint(instr->src[0]);
1889 bool is_array = nir_intrinsic_image_array(instr);
1890
1891 assert(nir_src_as_uint(instr->src[1]) == 0);
1892
1893 ntq_store_dest(c, &instr->dest, 0,
1894 vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index));
1895 if (instr->num_components > 1) {
1896 ntq_store_dest(c, &instr->dest, 1,
1897 vir_uniform(c,
1898 instr->num_components == 2 && is_array ?
1899 QUNIFORM_IMAGE_ARRAY_SIZE :
1900 QUNIFORM_IMAGE_HEIGHT,
1901 image_index));
1902 }
1903 if (instr->num_components > 2) {
1904 ntq_store_dest(c, &instr->dest, 2,
1905 vir_uniform(c,
1906 is_array ?
1907 QUNIFORM_IMAGE_ARRAY_SIZE :
1908 QUNIFORM_IMAGE_DEPTH,
1909 image_index));
1910 }
1911 }
1912
1913 static void
1914 vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
1915 {
1916 assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
1917
1918 int rt = nir_src_as_uint(instr->src[0]);
1919 assert(rt < V3D_MAX_DRAW_BUFFERS);
1920
1921 int sample_index = nir_intrinsic_base(instr);
1922 assert(sample_index < V3D_MAX_SAMPLES);
1923
1924 int component = nir_intrinsic_component(instr);
1925 assert(component < 4);
1926
1927 /* We need to emit our TLB reads after we have acquired the scoreboard
1928 * lock, or the GPU will hang. Usually, we do our scoreboard locking on
1929 * the last thread switch to improve parallelism, however, that is only
1930 * guaranteed to happen before the tlb color writes.
1931 *
1932 * To fix that, we make sure we always emit a thread switch before the
1933 * first tlb color read. If that happens to be the last thread switch
1934 * we emit, then everything is fine, but otherwise, if any code after
1935 * this point needs to emit additional thread switches, then we will
1936 * switch the strategy to locking the scoreboard on the first thread
1937 * switch instead -- see vir_emit_thrsw().
1938 */
1939 if (!c->emitted_tlb_load) {
1940 if (!c->last_thrsw_at_top_level) {
1941 assert(c->devinfo->ver >= 41);
1942 vir_emit_thrsw(c);
1943 }
1944
1945 c->emitted_tlb_load = true;
1946 }
1947
1948 struct qreg *color_reads_for_sample =
1949 &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4];
1950
1951 if (color_reads_for_sample[component].file == QFILE_NULL) {
1952 enum pipe_format rt_format = c->fs_key->color_fmt[rt].format;
1953 int num_components =
1954 util_format_get_nr_components(rt_format);
1955
1956 const bool swap_rb = c->fs_key->swap_color_rb & (1 << rt);
1957 if (swap_rb)
1958 num_components = MAX2(num_components, 3);
1959
1960 nir_variable *var = c->output_color_var[rt];
1961 enum glsl_base_type type = glsl_get_base_type(var->type);
1962
1963 bool is_int_format = type == GLSL_TYPE_INT ||
1964 type == GLSL_TYPE_UINT;
1965
1966 bool is_32b_tlb_format = is_int_format ||
1967 (c->fs_key->f32_color_rb & (1 << rt));
1968
1969 int num_samples = c->fs_key->msaa ? V3D_MAX_SAMPLES : 1;
1970
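/* Config for the TLBU read: the upper bytes are left as 0xff (unused
 * config slots) and only the low byte is filled in below; it is passed
 * as the uniform of the first sample's TLBU read.
 */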
1971 uint32_t conf = 0xffffff00;
1972 conf |= c->fs_key->msaa ? TLB_SAMPLE_MODE_PER_SAMPLE :
1973 TLB_SAMPLE_MODE_PER_PIXEL;
1974 conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
1975
1976 if (is_32b_tlb_format) {
1977 /* The F32 vs I32 distinction was dropped in 4.2. */
1978 conf |= (c->devinfo->ver < 42 && is_int_format) ?
1979 TLB_TYPE_I32_COLOR : TLB_TYPE_F32_COLOR;
1980
1981 conf |= ((num_components - 1) <<
1982 TLB_VEC_SIZE_MINUS_1_SHIFT);
1983 } else {
1984 conf |= TLB_TYPE_F16_COLOR;
1985 conf |= TLB_F16_SWAP_HI_LO;
1986
1987 if (num_components >= 3)
1988 conf |= TLB_VEC_SIZE_4_F16;
1989 else
1990 conf |= TLB_VEC_SIZE_2_F16;
1991 }
1992
1993
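/* Note: the 32-bit TLB formats take one TLB read per component, while
 * the f16 path packs two half-float components per read and splits them
 * with the L/H unpack modes.
 */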
1994 for (int i = 0; i < num_samples; i++) {
1995 struct qreg r, g, b, a;
1996 if (is_32b_tlb_format) {
1997 r = conf != 0xffffffff && i == 0 ?
1998 vir_TLBU_COLOR_READ(c, conf) :
1999 vir_TLB_COLOR_READ(c);
2000 if (num_components >= 2)
2001 g = vir_TLB_COLOR_READ(c);
2002 if (num_components >= 3)
2003 b = vir_TLB_COLOR_READ(c);
2004 if (num_components >= 4)
2005 a = vir_TLB_COLOR_READ(c);
2006 } else {
2007 struct qreg rg = conf != 0xffffffff && i == 0 ?
2008 vir_TLBU_COLOR_READ(c, conf) :
2009 vir_TLB_COLOR_READ(c);
2010 r = vir_FMOV(c, rg);
2011 vir_set_unpack(c->defs[r.index], 0,
2012 V3D_QPU_UNPACK_L);
2013 g = vir_FMOV(c, rg);
2014 vir_set_unpack(c->defs[g.index], 0,
2015 V3D_QPU_UNPACK_H);
2016
2017 if (num_components > 2) {
2018 struct qreg ba = vir_TLB_COLOR_READ(c);
2019 b = vir_FMOV(c, ba);
2020 vir_set_unpack(c->defs[b.index], 0,
2021 V3D_QPU_UNPACK_L);
2022 a = vir_FMOV(c, ba);
2023 vir_set_unpack(c->defs[a.index], 0,
2024 V3D_QPU_UNPACK_H);
2025 }
2026 }
2027
2028 struct qreg *color_reads =
2029 &c->color_reads[(rt * V3D_MAX_SAMPLES + i) * 4];
2030
2031 color_reads[0] = swap_rb ? b : r;
2032 if (num_components >= 2)
2033 color_reads[1] = g;
2034 if (num_components >= 3)
2035 color_reads[2] = swap_rb ? r : b;
2036 if (num_components >= 4)
2037 color_reads[3] = a;
2038 }
2039 }
2040
2041 assert(color_reads_for_sample[component].file != QFILE_NULL);
2042 ntq_store_dest(c, &instr->dest, 0,
2043 vir_MOV(c, color_reads_for_sample[component]));
2044 }
2045
2046 static void
2047 ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
2048 {
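/* Constant offsets into the default uniform block can be fetched
 * straight from the uniform stream; indirect offsets fall back to a
 * general TMU lookup.
 */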
2049 if (nir_src_is_const(instr->src[0])) {
2050 int offset = (nir_intrinsic_base(instr) +
2051 nir_src_as_uint(instr->src[0]));
2052 assert(offset % 4 == 0);
2053 /* We need dwords */
2054 offset = offset / 4;
2055 for (int i = 0; i < instr->num_components; i++) {
2056 ntq_store_dest(c, &instr->dest, i,
2057 vir_uniform(c, QUNIFORM_UNIFORM,
2058 offset + i));
2059 }
2060 } else {
2061 ntq_emit_tmu_general(c, instr, false);
2062 }
2063 }
2064
2065 static void
2066 ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
2067 {
2068 /* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset)
2069 * and enable PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR.
2070 */
2071 unsigned offset =
2072 nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]);
2073
2074 if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) {
2075 /* Emit the LDVPM directly now, rather than at the top
2076 * of the shader like we did for V3D 3.x (which needs
2077 * vpmsetup when not just taking the next offset).
2078 *
2079 * Note that delaying like this may introduce stalls,
2080 * as LDVPMV takes a minimum of 1 instruction but may
2081 * be slower if the VPM unit is busy with another QPU.
2082 */
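/* The per-vertex VPM segment starts with any InstanceID/BaseInstance/
 * VertexID words the shader reads, followed by the attribute data, so
 * account for those when computing the VPM word index.
 */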
2083 int index = 0;
2084 if (BITSET_TEST(c->s->info.system_values_read,
2085 SYSTEM_VALUE_INSTANCE_ID)) {
2086 index++;
2087 }
2088 if (BITSET_TEST(c->s->info.system_values_read,
2089 SYSTEM_VALUE_BASE_INSTANCE)) {
2090 index++;
2091 }
2092 if (BITSET_TEST(c->s->info.system_values_read,
2093 SYSTEM_VALUE_VERTEX_ID)) {
2094 index++;
2095 }
2096 for (int i = 0; i < offset; i++)
2097 index += c->vattr_sizes[i];
2098 index += nir_intrinsic_component(instr);
2099 for (int i = 0; i < instr->num_components; i++) {
2100 struct qreg vpm_offset = vir_uniform_ui(c, index++);
2101 ntq_store_dest(c, &instr->dest, i,
2102 vir_LDVPMV_IN(c, vpm_offset));
2103 }
2104 } else {
2105 for (int i = 0; i < instr->num_components; i++) {
2106 int comp = nir_intrinsic_component(instr) + i;
2107 ntq_store_dest(c, &instr->dest, i,
2108 vir_MOV(c, c->inputs[offset * 4 + comp]));
2109 }
2110 }
2111 }
2112
2113 static void
2114 ntq_emit_per_sample_color_write(struct v3d_compile *c,
2115 nir_intrinsic_instr *instr)
2116 {
2117 assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d);
2118
2119 unsigned rt = nir_src_as_uint(instr->src[1]);
2120 assert(rt < V3D_MAX_DRAW_BUFFERS);
2121
2122 unsigned sample_idx = nir_intrinsic_base(instr);
2123 assert(sample_idx < V3D_MAX_SAMPLES);
2124
2125 unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4;
2126 for (int i = 0; i < instr->num_components; i++) {
2127 c->sample_colors[offset + i] =
2128 vir_MOV(c, ntq_get_src(c, instr->src[0], i));
2129 }
2130 }
2131
2132 static void
2133 ntq_emit_color_write(struct v3d_compile *c,
2134 nir_intrinsic_instr *instr)
2135 {
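/* c->outputs is laid out as 4 channels per output slot, so scale the
 * slot index by 4 and add the starting component.
 */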
2136 unsigned offset = (nir_intrinsic_base(instr) +
2137 nir_src_as_uint(instr->src[1])) * 4 +
2138 nir_intrinsic_component(instr);
2139 for (int i = 0; i < instr->num_components; i++) {
2140 c->outputs[offset + i] =
2141 vir_MOV(c, ntq_get_src(c, instr->src[0], i));
2142 }
2143 }
2144
2145 static void
2146 emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr)
2147 {
2148 assert(instr->num_components == 1);
2149
2150 struct qreg offset = ntq_get_src(c, instr->src[1], 0);
2151
2152 uint32_t base_offset = nir_intrinsic_base(instr);
2153
2154 if (base_offset)
2155 offset = vir_ADD(c, vir_uniform_ui(c, base_offset), offset);
2156
2157 /* Usually, for VS or FS, we only emit outputs once at program end so
2158 * our VPM writes are never in non-uniform control flow, but this
2159 * is not true for GS, where we are emitting multiple vertices.
2160 */
2161 if (vir_in_nonuniform_control_flow(c)) {
2162 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
2163 V3D_QPU_PF_PUSHZ);
2164 }
2165
2166 struct qreg val = ntq_get_src(c, instr->src[0], 0);
2167
2168 /* The offset isn’t necessarily dynamically uniform for a geometry
2169 * shader. This can happen if the shader sometimes doesn’t emit one of
2170 * the vertices. In that case subsequent vertices will be written to
2171 * different offsets in the VPM and we need to use the scatter write
2172 * instruction to have a different offset for each lane.
2173 */
2174 if (nir_src_is_dynamically_uniform(instr->src[1]))
2175 vir_VPM_WRITE_indirect(c, val, offset);
2176 else
2177 vir_STVPMD(c, offset, val);
2178
2179 if (vir_in_nonuniform_control_flow(c)) {
2180 struct qinst *last_inst =
2181 (struct qinst *)c->cur_block->instructions.prev;
2182 vir_set_cond(last_inst, V3D_QPU_COND_IFA);
2183 }
2184 }
2185
2186 static void
2187 ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr)
2188 {
2189 /* XXX perf: Use stvpmv with uniform non-constant offsets and
2190 * stvpmd with non-uniform offsets and enable
2191 * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR.
2192 */
2193 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
2194 ntq_emit_color_write(c, instr);
2195 } else if (c->s->info.stage == MESA_SHADER_GEOMETRY) {
2196 emit_store_output_gs(c, instr);
2197 } else {
2198 assert(c->s->info.stage == MESA_SHADER_VERTEX);
2199 assert(instr->num_components == 1);
2200
2201 uint32_t base = nir_intrinsic_base(instr);
2202 if (nir_src_is_const(instr->src[1])) {
2203 vir_VPM_WRITE(c,
2204 ntq_get_src(c, instr->src[0], 0),
2205 base + nir_src_as_uint(instr->src[1]));
2206 } else {
2207 vir_VPM_WRITE_indirect(c,
2208 ntq_get_src(c, instr->src[0], 0),
2209 vir_ADD(c,
2210 ntq_get_src(c, instr->src[1], 1),
2211 vir_uniform_ui(c, base)));
2212 }
2213 }
2214 }
2215
2216 /**
2217 * This implementation is based on v3d_sample_{x,y}_offset() from
2218 * v3d_sample_offset.h.
2219 */
2220 static void
2221 ntq_get_sample_offset(struct v3d_compile *c, struct qreg sample_idx,
2222 struct qreg *sx, struct qreg *sy)
2223 {
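/* Rough sketch of the arithmetic below:
 *
 *    offset_x = -0.125 + 0.5  * i    (minus 1.25 for samples 2 and 3)
 *    offset_y = -0.375 + 0.25 * i
 *
 * which works out to (-0.125, -0.375), (0.375, -0.125),
 * (-0.375, 0.125) and (0.125, 0.375) for samples 0-3.
 */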
2224 sample_idx = vir_ITOF(c, sample_idx);
2225
2226 struct qreg offset_x =
2227 vir_FADD(c, vir_uniform_f(c, -0.125f),
2228 vir_FMUL(c, sample_idx,
2229 vir_uniform_f(c, 0.5f)));
2230 vir_set_pf(vir_FCMP_dest(c, vir_nop_reg(),
2231 vir_uniform_f(c, 2.0f), sample_idx),
2232 V3D_QPU_PF_PUSHC);
2233 offset_x = vir_SEL(c, V3D_QPU_COND_IFA,
2234 vir_FSUB(c, offset_x, vir_uniform_f(c, 1.25f)),
2235 offset_x);
2236
2237 struct qreg offset_y =
2238 vir_FADD(c, vir_uniform_f(c, -0.375f),
2239 vir_FMUL(c, sample_idx,
2240 vir_uniform_f(c, 0.25f)));
2241 *sx = offset_x;
2242 *sy = offset_y;
2243 }
2244
2245 /**
2246 * This implementation is based on get_centroid_offset() from fep.c.
2247 */
2248 static void
2249 ntq_get_barycentric_centroid(struct v3d_compile *c,
2250 struct qreg *out_x,
2251 struct qreg *out_y)
2252 {
2253 struct qreg sample_mask;
2254 if (c->output_sample_mask_index != -1)
2255 sample_mask = c->outputs[c->output_sample_mask_index];
2256 else
2257 sample_mask = vir_MSF(c);
2258
2259 struct qreg i0 = vir_uniform_ui(c, 0);
2260 struct qreg i1 = vir_uniform_ui(c, 1);
2261 struct qreg i2 = vir_uniform_ui(c, 2);
2262 struct qreg i3 = vir_uniform_ui(c, 3);
2263 struct qreg i4 = vir_uniform_ui(c, 4);
2264 struct qreg i8 = vir_uniform_ui(c, 8);
2265
2266 /* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */
2267 struct qreg F = vir_uniform_ui(c, 0);
2268 struct qreg T = vir_uniform_ui(c, ~0);
2269 struct qreg s0 = vir_XOR(c, vir_AND(c, sample_mask, i1), i1);
2270 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
2271 s0 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2272 struct qreg s1 = vir_XOR(c, vir_AND(c, sample_mask, i2), i2);
2273 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
2274 s1 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2275 struct qreg s2 = vir_XOR(c, vir_AND(c, sample_mask, i4), i4);
2276 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
2277 s2 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2278 struct qreg s3 = vir_XOR(c, vir_AND(c, sample_mask, i8), i8);
2279 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ);
2280 s3 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2281
2282 /* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */
2283 struct qreg sample_idx = i3;
2284 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
2285 sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i1, sample_idx);
2286 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
2287 sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i2, sample_idx);
2288 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
2289 sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i0, sample_idx);
2290
2291 /* Get offset at selected sample index */
2292 struct qreg offset_x, offset_y;
2293 ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
2294
2295 /* Select pixel center [offset=(0,0)] if two opposing samples (or none)
2296 * are selected.
2297 */
2298 struct qreg s0_and_s3 = vir_AND(c, s0, s3);
2299 struct qreg s1_and_s2 = vir_AND(c, s1, s2);
2300
2301 struct qreg use_center = vir_XOR(c, sample_mask, vir_uniform_ui(c, 0));
2302 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);
2303 use_center = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2304 use_center = vir_OR(c, use_center, s0_and_s3);
2305 use_center = vir_OR(c, use_center, s1_and_s2);
2306
2307 struct qreg zero = vir_uniform_f(c, 0.0f);
2308 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);
2309 offset_x = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_x);
2310 offset_y = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_y);
2311
2312 *out_x = offset_x;
2313 *out_y = offset_y;
2314 }
2315
2316 static struct qreg
2317 ntq_emit_load_interpolated_input(struct v3d_compile *c,
2318 struct qreg p,
2319 struct qreg C,
2320 struct qreg offset_x,
2321 struct qreg offset_y,
2322 unsigned mode)
2323 {
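/* Sketch of the math below: reconstruct the varying at the requested
 * offset from its screen-space derivatives,
 *
 *    p' = p + dFdx(p) * sx + dFdy(p) * sy
 *
 * and, for perspective-correct modes, do the same for the W payload and
 * multiply the two before adding back the constant term C.
 */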
2324 if (mode == INTERP_MODE_FLAT)
2325 return C;
2326
2327 struct qreg sample_offset_x =
2328 vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));
2329 struct qreg sample_offset_y =
2330 vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
2331
2332 struct qreg scaleX =
2333 vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_x),
2334 offset_x);
2335 struct qreg scaleY =
2336 vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_y),
2337 offset_y);
2338
2339 struct qreg pInterp =
2340 vir_FADD(c, p, vir_FADD(c, vir_FMUL(c, vir_FDX(c, p), scaleX),
2341 vir_FMUL(c, vir_FDY(c, p), scaleY)));
2342
2343 if (mode == INTERP_MODE_NOPERSPECTIVE)
2344 return vir_FADD(c, pInterp, C);
2345
2346 struct qreg w = c->payload_w;
2347 struct qreg wInterp =
2348 vir_FADD(c, w, vir_FADD(c, vir_FMUL(c, vir_FDX(c, w), scaleX),
2349 vir_FMUL(c, vir_FDY(c, w), scaleY)));
2350
2351 return vir_FADD(c, vir_FMUL(c, pInterp, wInterp), C);
2352 }
2353
2354 static void
2355 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
2356 {
2357 switch (instr->intrinsic) {
2358 case nir_intrinsic_load_uniform:
2359 ntq_emit_load_uniform(c, instr);
2360 break;
2361
2362 case nir_intrinsic_load_ubo:
2363 ntq_emit_tmu_general(c, instr, false);
2364 break;
2365
2366 case nir_intrinsic_ssbo_atomic_add:
2367 case nir_intrinsic_ssbo_atomic_imin:
2368 case nir_intrinsic_ssbo_atomic_umin:
2369 case nir_intrinsic_ssbo_atomic_imax:
2370 case nir_intrinsic_ssbo_atomic_umax:
2371 case nir_intrinsic_ssbo_atomic_and:
2372 case nir_intrinsic_ssbo_atomic_or:
2373 case nir_intrinsic_ssbo_atomic_xor:
2374 case nir_intrinsic_ssbo_atomic_exchange:
2375 case nir_intrinsic_ssbo_atomic_comp_swap:
2376 case nir_intrinsic_load_ssbo:
2377 case nir_intrinsic_store_ssbo:
2378 ntq_emit_tmu_general(c, instr, false);
2379 break;
2380
2381 case nir_intrinsic_shared_atomic_add:
2382 case nir_intrinsic_shared_atomic_imin:
2383 case nir_intrinsic_shared_atomic_umin:
2384 case nir_intrinsic_shared_atomic_imax:
2385 case nir_intrinsic_shared_atomic_umax:
2386 case nir_intrinsic_shared_atomic_and:
2387 case nir_intrinsic_shared_atomic_or:
2388 case nir_intrinsic_shared_atomic_xor:
2389 case nir_intrinsic_shared_atomic_exchange:
2390 case nir_intrinsic_shared_atomic_comp_swap:
2391 case nir_intrinsic_load_shared:
2392 case nir_intrinsic_store_shared:
2393 case nir_intrinsic_load_scratch:
2394 case nir_intrinsic_store_scratch:
2395 ntq_emit_tmu_general(c, instr, true);
2396 break;
2397
2398 case nir_intrinsic_image_load:
2399 case nir_intrinsic_image_store:
2400 case nir_intrinsic_image_atomic_add:
2401 case nir_intrinsic_image_atomic_imin:
2402 case nir_intrinsic_image_atomic_umin:
2403 case nir_intrinsic_image_atomic_imax:
2404 case nir_intrinsic_image_atomic_umax:
2405 case nir_intrinsic_image_atomic_and:
2406 case nir_intrinsic_image_atomic_or:
2407 case nir_intrinsic_image_atomic_xor:
2408 case nir_intrinsic_image_atomic_exchange:
2409 case nir_intrinsic_image_atomic_comp_swap:
2410 v3d40_vir_emit_image_load_store(c, instr);
2411 break;
2412
2413 case nir_intrinsic_get_ssbo_size:
2414 ntq_store_dest(c, &instr->dest, 0,
2415 vir_uniform(c, QUNIFORM_GET_SSBO_SIZE,
2416 nir_src_as_uint(instr->src[0])));
2417 break;
2418
2419 case nir_intrinsic_get_ubo_size:
2420 ntq_store_dest(c, &instr->dest, 0,
2421 vir_uniform(c, QUNIFORM_GET_UBO_SIZE,
2422 nir_src_comp_as_uint(instr->src[0], 0)));
2423 break;
2424
2425 case nir_intrinsic_load_user_clip_plane:
2426 for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
2427 ntq_store_dest(c, &instr->dest, i,
2428 vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
2429 nir_intrinsic_ucp_id(instr) *
2430 4 + i));
2431 }
2432 break;
2433
2434 case nir_intrinsic_load_viewport_x_scale:
2435 ntq_store_dest(c, &instr->dest, 0,
2436 vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0));
2437 break;
2438
2439 case nir_intrinsic_load_viewport_y_scale:
2440 ntq_store_dest(c, &instr->dest, 0,
2441 vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0));
2442 break;
2443
2444 case nir_intrinsic_load_viewport_z_scale:
2445 ntq_store_dest(c, &instr->dest, 0,
2446 vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0));
2447 break;
2448
2449 case nir_intrinsic_load_viewport_z_offset:
2450 ntq_store_dest(c, &instr->dest, 0,
2451 vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0));
2452 break;
2453
2454 case nir_intrinsic_load_line_coord:
2455 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->line_x));
2456 break;
2457
2458 case nir_intrinsic_load_line_width:
2459 ntq_store_dest(c, &instr->dest, 0,
2460 vir_uniform(c, QUNIFORM_LINE_WIDTH, 0));
2461 break;
2462
2463 case nir_intrinsic_load_aa_line_width:
2464 ntq_store_dest(c, &instr->dest, 0,
2465 vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0));
2466 break;
2467
2468 case nir_intrinsic_load_sample_mask_in:
2469 ntq_store_dest(c, &instr->dest, 0, vir_MSF(c));
2470 break;
2471
2472 case nir_intrinsic_load_helper_invocation:
2473 vir_set_pf(vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
2474 ntq_store_dest(c, &instr->dest, 0,
2475 vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
2476 vir_uniform_ui(c, ~0),
2477 vir_uniform_ui(c, 0))));
2478 break;
2479
2480 case nir_intrinsic_load_front_face:
2481 /* The register contains 0 (front) or 1 (back), and we need to
2482 * turn it into a NIR bool where true means front.
2483 */
2484 ntq_store_dest(c, &instr->dest, 0,
2485 vir_ADD(c,
2486 vir_uniform_ui(c, -1),
2487 vir_REVF(c)));
2488 break;
2489
2490 case nir_intrinsic_load_base_instance:
2491 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->biid));
2492 break;
2493
2494 case nir_intrinsic_load_instance_id:
2495 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid));
2496 break;
2497
2498 case nir_intrinsic_load_vertex_id:
2499 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid));
2500 break;
2501
2502 case nir_intrinsic_load_tlb_color_v3d:
2503 vir_emit_tlb_color_read(c, instr);
2504 break;
2505
2506 case nir_intrinsic_load_input:
2507 ntq_emit_load_input(c, instr);
2508 break;
2509
2510 case nir_intrinsic_store_tlb_sample_color_v3d:
2511 ntq_emit_per_sample_color_write(c, instr);
2512 break;
2513
2514 case nir_intrinsic_store_output:
2515 ntq_emit_store_output(c, instr);
2516 break;
2517
2518 case nir_intrinsic_image_size:
2519 ntq_emit_image_size(c, instr);
2520 break;
2521
2522 case nir_intrinsic_discard:
2523 if (vir_in_nonuniform_control_flow(c)) {
2524 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
2525 V3D_QPU_PF_PUSHZ);
2526 vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
2527 vir_uniform_ui(c, 0)),
2528 V3D_QPU_COND_IFA);
2529 } else {
2530 vir_SETMSF_dest(c, vir_nop_reg(),
2531 vir_uniform_ui(c, 0));
2532 }
2533 break;
2534
2535 case nir_intrinsic_discard_if: {
2536 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]);
2537
2538 if (vir_in_nonuniform_control_flow(c)) {
2539 struct qinst *exec_flag = vir_MOV_dest(c, vir_nop_reg(),
2540 c->execute);
2541 if (cond == V3D_QPU_COND_IFA) {
2542 vir_set_uf(exec_flag, V3D_QPU_UF_ANDZ);
2543 } else {
2544 vir_set_uf(exec_flag, V3D_QPU_UF_NORNZ);
2545 cond = V3D_QPU_COND_IFA;
2546 }
2547 }
2548
2549 vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
2550 vir_uniform_ui(c, 0)), cond);
2551
2552 break;
2553 }
2554
2555 case nir_intrinsic_memory_barrier:
2556 case nir_intrinsic_memory_barrier_buffer:
2557 case nir_intrinsic_memory_barrier_image:
2558 case nir_intrinsic_memory_barrier_shared:
2559 case nir_intrinsic_memory_barrier_tcs_patch:
2560 case nir_intrinsic_group_memory_barrier:
2561 /* We don't do any instruction scheduling of these NIR
2562 * instructions between each other, so we just need to make
2563 * sure that the TMU operations before the barrier are flushed
2564 * before the ones after the barrier. That is currently
2565 * handled by having a THRSW in each of them and a LDTMU
2566 * series or a TMUWT after.
2567 */
2568 break;
2569
2570 case nir_intrinsic_control_barrier:
2571 /* Emit a TSY op to get all invocations in the workgroup
2572 * (actually supergroup) to block until the last invocation
2573 * reaches the TSY op.
2574 */
2575 if (c->devinfo->ver >= 42) {
2576 vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
2577 V3D_QPU_WADDR_SYNCB));
2578 } else {
2579 struct qinst *sync =
2580 vir_BARRIERID_dest(c,
2581 vir_reg(QFILE_MAGIC,
2582 V3D_QPU_WADDR_SYNCU));
2583 sync->uniform =
2584 vir_get_uniform_index(c, QUNIFORM_CONSTANT,
2585 0xffffff00 |
2586 V3D_TSY_WAIT_INC_CHECK);
2587
2588 }
2589
2590 /* The blocking of a TSY op only happens at the next thread
2591 * switch. No texturing may be outstanding at the time of a
2592 * TSY blocking operation.
2593 */
2594 vir_emit_thrsw(c);
2595 break;
2596
2597 case nir_intrinsic_load_num_work_groups:
2598 for (int i = 0; i < 3; i++) {
2599 ntq_store_dest(c, &instr->dest, i,
2600 vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS,
2601 i));
2602 }
2603 break;
2604
2605 case nir_intrinsic_load_local_invocation_index:
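/* The local invocation index is packed in the top
 * local_invocation_index_bits of cs_payload[1]; see the payload setup
 * in nir_to_vir() for compute shaders.
 */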
2606 ntq_store_dest(c, &instr->dest, 0,
2607 vir_SHR(c, c->cs_payload[1],
2608 vir_uniform_ui(c, 32 - c->local_invocation_index_bits)));
2609 break;
2610
2611 case nir_intrinsic_load_work_group_id:
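/* Workgroup ID X and Y are packed as 16-bit halves of cs_payload[0],
 * and Z lives in the low 16 bits of cs_payload[1].
 */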
2612 ntq_store_dest(c, &instr->dest, 0,
2613 vir_AND(c, c->cs_payload[0],
2614 vir_uniform_ui(c, 0xffff)));
2615 ntq_store_dest(c, &instr->dest, 1,
2616 vir_SHR(c, c->cs_payload[0],
2617 vir_uniform_ui(c, 16)));
2618 ntq_store_dest(c, &instr->dest, 2,
2619 vir_AND(c, c->cs_payload[1],
2620 vir_uniform_ui(c, 0xffff)));
2621 break;
2622
2623 case nir_intrinsic_load_subgroup_id:
2624 ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
2625 break;
2626
2627 case nir_intrinsic_load_per_vertex_input: {
2628 /* col: vertex index, row: varying index */
2629 struct qreg col = ntq_get_src(c, instr->src[0], 0);
2630 uint32_t row_idx = nir_intrinsic_base(instr) * 4 +
2631 nir_intrinsic_component(instr);
2632 for (int i = 0; i < instr->num_components; i++) {
2633 struct qreg row = vir_uniform_ui(c, row_idx++);
2634 ntq_store_dest(c, &instr->dest, i,
2635 vir_LDVPMG_IN(c, row, col));
2636 }
2637 break;
2638 }
2639
2640 case nir_intrinsic_emit_vertex:
2641 case nir_intrinsic_end_primitive:
2642 unreachable("Should have been lowered in v3d_nir_lower_io");
2643 break;
2644
2645 case nir_intrinsic_load_primitive_id: {
2646 /* gl_PrimitiveIdIn is written by the GBG in the first word of
2647 * VPM output header. According to docs, we should read this
2648 * using ldvpm(v,d)_in (See Table 71).
2649 */
2650 ntq_store_dest(c, &instr->dest, 0,
2651 vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));
2652 break;
2653 }
2654
2655 case nir_intrinsic_load_invocation_id:
2656 ntq_store_dest(c, &instr->dest, 0, vir_IID(c));
2657 break;
2658
2659 case nir_intrinsic_load_fb_layers_v3d:
2660 ntq_store_dest(c, &instr->dest, 0,
2661 vir_uniform(c, QUNIFORM_FB_LAYERS, 0));
2662 break;
2663
2664 case nir_intrinsic_load_sample_id:
2665 ntq_store_dest(c, &instr->dest, 0, vir_SAMPID(c));
2666 break;
2667
2668 case nir_intrinsic_load_sample_pos:
2669 ntq_store_dest(c, &instr->dest, 0,
2670 vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))));
2671 ntq_store_dest(c, &instr->dest, 1,
2672 vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))));
2673 break;
2674
2675 case nir_intrinsic_load_barycentric_at_offset:
2676 ntq_store_dest(c, &instr->dest, 0,
2677 vir_MOV(c, ntq_get_src(c, instr->src[0], 0)));
2678 ntq_store_dest(c, &instr->dest, 1,
2679 vir_MOV(c, ntq_get_src(c, instr->src[0], 1)));
2680 break;
2681
2682 case nir_intrinsic_load_barycentric_pixel:
2683 ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f));
2684 ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f));
2685 break;
2686
2687 case nir_intrinsic_load_barycentric_at_sample: {
2688 if (!c->fs_key->msaa) {
2689 ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f));
2690 ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f));
2691 return;
2692 }
2693
2694 struct qreg offset_x, offset_y;
2695 struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0);
2696 ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
2697
2698 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x));
2699 ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y));
2700 break;
2701 }
2702
2703 case nir_intrinsic_load_barycentric_sample: {
2704 struct qreg offset_x =
2705 vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));
2706 struct qreg offset_y =
2707 vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
2708
2709 ntq_store_dest(c, &instr->dest, 0,
2710 vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f)));
2711 ntq_store_dest(c, &instr->dest, 1,
2712 vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f)));
2713 break;
2714 }
2715
2716 case nir_intrinsic_load_barycentric_centroid: {
2717 struct qreg offset_x, offset_y;
2718 ntq_get_barycentric_centroid(c, &offset_x, &offset_y);
2719 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x));
2720 ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y));
2721 break;
2722 }
2723
2724 case nir_intrinsic_load_interpolated_input: {
2725 assert(nir_src_is_const(instr->src[1]));
2726 const uint32_t offset = nir_src_as_uint(instr->src[1]);
2727
2728 for (int i = 0; i < instr->num_components; i++) {
2729 const uint32_t input_idx =
2730 (nir_intrinsic_base(instr) + offset) * 4 +
2731 nir_intrinsic_component(instr) + i;
2732
2733 /* If we are not in MSAA or if we are not interpolating
2734 * a user varying, just return the pre-computed
2735 * interpolated input.
2736 */
2737 if (!c->fs_key->msaa ||
2738 c->interp[input_idx].vp.file == QFILE_NULL) {
2739 ntq_store_dest(c, &instr->dest, i,
2740 vir_MOV(c, c->inputs[input_idx]));
2741 continue;
2742 }
2743
2744 /* Otherwise compute interpolation at the specified
2745 * offset.
2746 */
2747 struct qreg p = c->interp[input_idx].vp;
2748 struct qreg C = c->interp[input_idx].C;
2749 unsigned interp_mode = c->interp[input_idx].mode;
2750
2751 struct qreg offset_x = ntq_get_src(c, instr->src[0], 0);
2752 struct qreg offset_y = ntq_get_src(c, instr->src[0], 1);
2753
2754 struct qreg result =
2755 ntq_emit_load_interpolated_input(c, p, C,
2756 offset_x, offset_y,
2757 interp_mode);
2758 ntq_store_dest(c, &instr->dest, i, result);
2759 }
2760 break;
2761 }
2762
2763 default:
2764 fprintf(stderr, "Unknown intrinsic: ");
2765 nir_print_instr(&instr->instr, stderr);
2766 fprintf(stderr, "\n");
2767 break;
2768 }
2769 }
2770
2771 /* Clears (activates) the execute flags for any channels whose jump target
2772 * matches this block.
2773 *
2774 * XXX perf: Could we be using flpush/flpop somehow for our execution channel
2775 * enabling?
2776 *
2777 * XXX perf: For uniform control flow, we should be able to skip c->execute
2778 * handling entirely.
2779 */
2780 static void
2781 ntq_activate_execute_for_block(struct v3d_compile *c)
2782 {
2783 vir_set_pf(vir_XOR_dest(c, vir_nop_reg(),
2784 c->execute, vir_uniform_ui(c, c->cur_block->index)),
2785 V3D_QPU_PF_PUSHZ);
2786
2787 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
2788 }
2789
2790 static void
2791 ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
2792 {
2793 nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
2794 bool empty_else_block =
2795 (nir_else_block == nir_if_last_else_block(if_stmt) &&
2796 exec_list_is_empty(&nir_else_block->instr_list));
2797
2798 struct qblock *then_block = vir_new_block(c);
2799 struct qblock *after_block = vir_new_block(c);
2800 struct qblock *else_block;
2801 if (empty_else_block)
2802 else_block = after_block;
2803 else
2804 else_block = vir_new_block(c);
2805
2806 /* Set up the flags for the IF condition (taking the THEN branch). */
2807 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
2808
2809 /* Jump to ELSE. */
2810 vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
2811 V3D_QPU_BRANCH_COND_ALLNA :
2812 V3D_QPU_BRANCH_COND_ALLA);
2813 vir_link_blocks(c->cur_block, else_block);
2814 vir_link_blocks(c->cur_block, then_block);
2815
2816 /* Process the THEN block. */
2817 vir_set_emit_block(c, then_block);
2818 ntq_emit_cf_list(c, &if_stmt->then_list);
2819
2820 if (!empty_else_block) {
2821 /* At the end of the THEN block, jump to ENDIF */
2822 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
2823 vir_link_blocks(c->cur_block, after_block);
2824
2825 /* Emit the else block. */
2826 vir_set_emit_block(c, else_block);
2827 ntq_emit_cf_list(c, &if_stmt->else_list);
2828 }
2829
2830 vir_link_blocks(c->cur_block, after_block);
2831
2832 vir_set_emit_block(c, after_block);
2833 }
2834
2835 static void
2836 ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
2837 {
2838 nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
2839 bool empty_else_block =
2840 (nir_else_block == nir_if_last_else_block(if_stmt) &&
2841 exec_list_is_empty(&nir_else_block->instr_list));
2842
2843 struct qblock *then_block = vir_new_block(c);
2844 struct qblock *after_block = vir_new_block(c);
2845 struct qblock *else_block;
2846 if (empty_else_block)
2847 else_block = after_block;
2848 else
2849 else_block = vir_new_block(c);
2850
2851 bool was_uniform_control_flow = false;
2852 if (!vir_in_nonuniform_control_flow(c)) {
2853 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
2854 was_uniform_control_flow = true;
2855 }
2856
2857 /* Set up the flags for the IF condition (taking the THEN branch). */
2858 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
2859
2860 /* Update the flags+cond to mean "Taking the ELSE branch (!cond) and
2861 * was previously active (execute Z)" for updating the exec flags.
2862 */
2863 if (was_uniform_control_flow) {
2864 cond = v3d_qpu_cond_invert(cond);
2865 } else {
2866 struct qinst *inst = vir_MOV_dest(c, vir_nop_reg(), c->execute);
2867 if (cond == V3D_QPU_COND_IFA) {
2868 vir_set_uf(inst, V3D_QPU_UF_NORNZ);
2869 } else {
2870 vir_set_uf(inst, V3D_QPU_UF_ANDZ);
2871 cond = V3D_QPU_COND_IFA;
2872 }
2873 }
2874
2875 vir_MOV_cond(c, cond,
2876 c->execute,
2877 vir_uniform_ui(c, else_block->index));
2878
2879 /* Jump to ELSE if nothing is active for THEN, otherwise fall
2880 * through.
2881 */
2882 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
2883 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
2884 vir_link_blocks(c->cur_block, else_block);
2885 vir_link_blocks(c->cur_block, then_block);
2886
2887 /* Process the THEN block. */
2888 vir_set_emit_block(c, then_block);
2889 ntq_emit_cf_list(c, &if_stmt->then_list);
2890
2891 if (!empty_else_block) {
2892 /* Handle the end of the THEN block. First, all currently
2893 * active channels update their execute flags to point to
2894 * ENDIF
2895 */
2896 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
2897 V3D_QPU_PF_PUSHZ);
2898 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
2899 vir_uniform_ui(c, after_block->index));
2900
2901 /* If everything points at ENDIF, then jump there immediately. */
2902 vir_set_pf(vir_XOR_dest(c, vir_nop_reg(),
2903 c->execute,
2904 vir_uniform_ui(c, after_block->index)),
2905 V3D_QPU_PF_PUSHZ);
2906 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
2907 vir_link_blocks(c->cur_block, after_block);
2908 vir_link_blocks(c->cur_block, else_block);
2909
2910 vir_set_emit_block(c, else_block);
2911 ntq_activate_execute_for_block(c);
2912 ntq_emit_cf_list(c, &if_stmt->else_list);
2913 }
2914
2915 vir_link_blocks(c->cur_block, after_block);
2916
2917 vir_set_emit_block(c, after_block);
2918 if (was_uniform_control_flow)
2919 c->execute = c->undef;
2920 else
2921 ntq_activate_execute_for_block(c);
2922 }
2923
2924 static void
2925 ntq_emit_if(struct v3d_compile *c, nir_if *nif)
2926 {
2927 bool was_in_control_flow = c->in_control_flow;
2928 c->in_control_flow = true;
2929 if (!vir_in_nonuniform_control_flow(c) &&
2930 nir_src_is_dynamically_uniform(nif->condition)) {
2931 ntq_emit_uniform_if(c, nif);
2932 } else {
2933 ntq_emit_nonuniform_if(c, nif);
2934 }
2935 c->in_control_flow = was_in_control_flow;
2936 }
2937
2938 static void
2939 ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump)
2940 {
2941 switch (jump->type) {
2942 case nir_jump_break:
2943 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
2944 V3D_QPU_PF_PUSHZ);
2945 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
2946 vir_uniform_ui(c, c->loop_break_block->index));
2947 break;
2948
2949 case nir_jump_continue:
2950 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
2951 V3D_QPU_PF_PUSHZ);
2952 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
2953 vir_uniform_ui(c, c->loop_cont_block->index));
2954 break;
2955
2956 case nir_jump_return:
2957 unreachable("All returns shouold be lowered\n");
2958 break;
2959
2960 case nir_jump_goto:
2961 case nir_jump_goto_if:
2962 unreachable("not supported\n");
2963 break;
2964 }
2965 }
2966
2967 static void
2968 ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)
2969 {
2970 switch (instr->type) {
2971 case nir_instr_type_alu:
2972 ntq_emit_alu(c, nir_instr_as_alu(instr));
2973 break;
2974
2975 case nir_instr_type_intrinsic:
2976 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
2977 break;
2978
2979 case nir_instr_type_load_const:
2980 ntq_emit_load_const(c, nir_instr_as_load_const(instr));
2981 break;
2982
2983 case nir_instr_type_ssa_undef:
2984 ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
2985 break;
2986
2987 case nir_instr_type_tex:
2988 ntq_emit_tex(c, nir_instr_as_tex(instr));
2989 break;
2990
2991 case nir_instr_type_jump:
2992 ntq_emit_jump(c, nir_instr_as_jump(instr));
2993 break;
2994
2995 default:
2996 fprintf(stderr, "Unknown NIR instr type: ");
2997 nir_print_instr(instr, stderr);
2998 fprintf(stderr, "\n");
2999 abort();
3000 }
3001 }
3002
3003 static void
3004 ntq_emit_block(struct v3d_compile *c, nir_block *block)
3005 {
3006 nir_foreach_instr(instr, block) {
3007 ntq_emit_instr(c, instr);
3008 }
3009 }
3010
3011 static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
3012
3013 static void
3014 ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
3015 {
3016 bool was_in_control_flow = c->in_control_flow;
3017 c->in_control_flow = true;
3018
3019 bool was_uniform_control_flow = false;
3020 if (!vir_in_nonuniform_control_flow(c)) {
3021 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
3022 was_uniform_control_flow = true;
3023 }
3024
3025 struct qblock *save_loop_cont_block = c->loop_cont_block;
3026 struct qblock *save_loop_break_block = c->loop_break_block;
3027
3028 c->loop_cont_block = vir_new_block(c);
3029 c->loop_break_block = vir_new_block(c);
3030
3031 vir_link_blocks(c->cur_block, c->loop_cont_block);
3032 vir_set_emit_block(c, c->loop_cont_block);
3033 ntq_activate_execute_for_block(c);
3034
3035 ntq_emit_cf_list(c, &loop->body);
3036
3037 /* Re-enable any previous continues now, so our ANYA check below
3038 * works.
3039 *
3040 * XXX: Use the .ORZ flags update, instead.
3041 */
3042 vir_set_pf(vir_XOR_dest(c,
3043 vir_nop_reg(),
3044 c->execute,
3045 vir_uniform_ui(c, c->loop_cont_block->index)),
3046 V3D_QPU_PF_PUSHZ);
3047 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
3048
3049 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
3050
3051 struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA);
3052 /* Pixels that were not dispatched or have been discarded should not
3053 * contribute to looping again.
3054 */
3055 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
3056 vir_link_blocks(c->cur_block, c->loop_cont_block);
3057 vir_link_blocks(c->cur_block, c->loop_break_block);
3058
3059 vir_set_emit_block(c, c->loop_break_block);
3060 if (was_uniform_control_flow)
3061 c->execute = c->undef;
3062 else
3063 ntq_activate_execute_for_block(c);
3064
3065 c->loop_break_block = save_loop_break_block;
3066 c->loop_cont_block = save_loop_cont_block;
3067
3068 c->loops++;
3069
3070 c->in_control_flow = was_in_control_flow;
3071 }
3072
3073 static void
3074 ntq_emit_function(struct v3d_compile *c, nir_function_impl *func)
3075 {
3076 fprintf(stderr, "FUNCTIONS not handled.\n");
3077 abort();
3078 }
3079
3080 static void
3081 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list)
3082 {
3083 foreach_list_typed(nir_cf_node, node, node, list) {
3084 switch (node->type) {
3085 case nir_cf_node_block:
3086 ntq_emit_block(c, nir_cf_node_as_block(node));
3087 break;
3088
3089 case nir_cf_node_if:
3090 ntq_emit_if(c, nir_cf_node_as_if(node));
3091 break;
3092
3093 case nir_cf_node_loop:
3094 ntq_emit_loop(c, nir_cf_node_as_loop(node));
3095 break;
3096
3097 case nir_cf_node_function:
3098 ntq_emit_function(c, nir_cf_node_as_function(node));
3099 break;
3100
3101 default:
3102 fprintf(stderr, "Unknown NIR node type\n");
3103 abort();
3104 }
3105 }
3106 }
3107
3108 static void
3109 ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
3110 {
3111 ntq_setup_registers(c, &impl->registers);
3112 ntq_emit_cf_list(c, &impl->body);
3113 }
3114
3115 static void
3116 nir_to_vir(struct v3d_compile *c)
3117 {
3118 switch (c->s->info.stage) {
3119 case MESA_SHADER_FRAGMENT:
3120 c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
3121 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
3122 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
3123
3124 /* V3D 4.x can disable implicit point coordinate varyings if
3125 * they are not used.
3126 */
3127 if (c->fs_key->is_points &&
3128 (c->devinfo->ver < 40 || program_reads_point_coord(c))) {
3129 c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0);
3130 c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0);
3131 c->uses_implicit_point_line_varyings = true;
3132 } else if (c->fs_key->is_lines &&
3133 (c->devinfo->ver < 40 ||
3134 BITSET_TEST(c->s->info.system_values_read,
3135 SYSTEM_VALUE_LINE_COORD))) {
3136 c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0);
3137 c->uses_implicit_point_line_varyings = true;
3138 }
3139
3140 c->force_per_sample_msaa =
3141 c->s->info.fs.uses_sample_qualifier ||
3142 BITSET_TEST(c->s->info.system_values_read,
3143 SYSTEM_VALUE_SAMPLE_ID) ||
3144 BITSET_TEST(c->s->info.system_values_read,
3145 SYSTEM_VALUE_SAMPLE_POS);
3146 break;
3147 case MESA_SHADER_COMPUTE:
3148 /* Set up the TSO for barriers, assuming we do some. */
3149 if (c->devinfo->ver < 42) {
3150 vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
3151 V3D_QPU_WADDR_SYNC));
3152 }
3153
3154 c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
3155 c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
3156
3157 /* Set up the division between gl_LocalInvocationIndex and
3158 * wg_in_mem in the payload reg.
3159 */
3160 int wg_size = (c->s->info.cs.local_size[0] *
3161 c->s->info.cs.local_size[1] *
3162 c->s->info.cs.local_size[2]);
3163 c->local_invocation_index_bits =
3164 ffs(util_next_power_of_two(MAX2(wg_size, 64))) - 1;
3165 assert(c->local_invocation_index_bits <= 8);
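/* For example, an 8x8x1 workgroup (64 invocations) gets 6 index bits
 * here; the MAX2 above keeps a minimum of 6 bits, and the assert caps
 * it at 8 (256 invocations).
 */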
3166
3167 if (c->s->info.cs.shared_size) {
3168 struct qreg wg_in_mem = vir_SHR(c, c->cs_payload[1],
3169 vir_uniform_ui(c, 16));
3170 if (c->s->info.cs.local_size[0] != 1 ||
3171 c->s->info.cs.local_size[1] != 1 ||
3172 c->s->info.cs.local_size[2] != 1) {
3173 int wg_bits = (16 -
3174 c->local_invocation_index_bits);
3175 int wg_mask = (1 << wg_bits) - 1;
3176 wg_in_mem = vir_AND(c, wg_in_mem,
3177 vir_uniform_ui(c, wg_mask));
3178 }
3179 struct qreg shared_per_wg =
3180 vir_uniform_ui(c, c->s->info.cs.shared_size);
3181
3182 c->cs_shared_offset =
3183 vir_ADD(c,
3184 vir_uniform(c, QUNIFORM_SHARED_OFFSET, 0),
3185 vir_UMUL(c, wg_in_mem, shared_per_wg));
3186 }
3187 break;
3188 default:
3189 break;
3190 }
3191
3192 if (c->s->scratch_size) {
3193 v3d_setup_spill_base(c);
3194 c->spill_size += V3D_CHANNELS * c->s->scratch_size;
3195 }
3196
3197 switch (c->s->info.stage) {
3198 case MESA_SHADER_VERTEX:
3199 ntq_setup_vs_inputs(c);
3200 break;
3201 case MESA_SHADER_GEOMETRY:
3202 ntq_setup_gs_inputs(c);
3203 break;
3204 case MESA_SHADER_FRAGMENT:
3205 ntq_setup_fs_inputs(c);
3206 break;
3207 case MESA_SHADER_COMPUTE:
3208 break;
3209 default:
3210 unreachable("unsupported shader stage");
3211 }
3212
3213 ntq_setup_outputs(c);
3214
3215 /* Find the main function and emit the body. */
3216 nir_foreach_function(function, c->s) {
3217 assert(strcmp(function->name, "main") == 0);
3218 assert(function->impl);
3219 ntq_emit_impl(c, function->impl);
3220 }
3221 }
3222
3223 const nir_shader_compiler_options v3d_nir_options = {
3224 .lower_all_io_to_temps = true,
3225 .lower_extract_byte = true,
3226 .lower_extract_word = true,
3227 .lower_bitfield_insert_to_shifts = true,
3228 .lower_bitfield_extract_to_shifts = true,
3229 .lower_bitfield_reverse = true,
3230 .lower_bit_count = true,
3231 .lower_cs_local_id_from_index = true,
3232 .lower_ffract = true,
3233 .lower_fmod = true,
3234 .lower_pack_unorm_2x16 = true,
3235 .lower_pack_snorm_2x16 = true,
3236 .lower_pack_unorm_4x8 = true,
3237 .lower_pack_snorm_4x8 = true,
3238 .lower_unpack_unorm_4x8 = true,
3239 .lower_unpack_snorm_4x8 = true,
3240 .lower_pack_half_2x16 = true,
3241 .lower_unpack_half_2x16 = true,
3242 .lower_fdiv = true,
3243 .lower_find_lsb = true,
3244 .lower_ffma16 = true,
3245 .lower_ffma32 = true,
3246 .lower_ffma64 = true,
3247 .lower_flrp32 = true,
3248 .lower_fpow = true,
3249 .lower_fsat = true,
3250 .lower_fsqrt = true,
3251 .lower_ifind_msb = true,
3252 .lower_isign = true,
3253 .lower_ldexp = true,
3254 .lower_mul_high = true,
3255 .lower_wpos_pntc = true,
3256 .lower_rotate = true,
3257 .lower_to_scalar = true,
3258 };
3259
3260 /**
3261 * When demoting a shader down to single-threaded, removes the THRSW
3262 * instructions (one will still be inserted at v3d_vir_to_qpu() for the
3263 * program end).
3264 */
3265 static void
3266 vir_remove_thrsw(struct v3d_compile *c)
3267 {
3268 vir_for_each_block(block, c) {
3269 vir_for_each_inst_safe(inst, block) {
3270 if (inst->qpu.sig.thrsw)
3271 vir_remove_instruction(c, inst);
3272 }
3273 }
3274
3275 c->last_thrsw = NULL;
3276 }
3277
3278 void
3279 vir_emit_last_thrsw(struct v3d_compile *c)
3280 {
3281 /* On V3D before 4.1, we need a TMU op to be outstanding when thread
3282 * switching, so disable threads if we didn't do any TMU ops (each of
3283 * which would have emitted a THRSW).
3284 */
3285 if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) {
3286 c->threads = 1;
3287 if (c->last_thrsw)
3288 vir_remove_thrsw(c);
3289 return;
3290 }
3291
3292 /* If we're threaded and the last THRSW was in conditional code, then
3293 * we need to emit another one so that we can flag it as the last
3294 * thrsw.
3295 */
3296 if (c->last_thrsw && !c->last_thrsw_at_top_level) {
3297 assert(c->devinfo->ver >= 41);
3298 vir_emit_thrsw(c);
3299 }
3300
3301 /* If we're threaded, then we need to mark the last THRSW instruction
3302 * so we can emit a pair of them at QPU emit time.
3303 *
3304 * For V3D 4.x, we can spawn the non-fragment shaders already in the
3305 * post-last-THRSW state, so we can skip this.
3306 */
3307 if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) {
3308 assert(c->devinfo->ver >= 41);
3309 vir_emit_thrsw(c);
3310 }
3311
3312 if (c->last_thrsw)
3313 c->last_thrsw->is_last_thrsw = true;
3314 }
3315
3316 /* There's a flag in the shader for "center W is needed for reasons other than
3317 * non-centroid varyings", so we just walk the program after VIR optimization
3318 * to see if it's used. It should be harmless to set even if we only use
3319 * center W for varyings.
3320 */
3321 static void
3322 vir_check_payload_w(struct v3d_compile *c)
3323 {
3324 if (c->s->info.stage != MESA_SHADER_FRAGMENT)
3325 return;
3326
3327 vir_for_each_inst_inorder(inst, c) {
3328 for (int i = 0; i < vir_get_nsrc(inst); i++) {
3329 if (inst->src[i].file == QFILE_REG &&
3330 inst->src[i].index == 0) {
3331 c->uses_center_w = true;
3332 return;
3333 }
3334 }
3335 }
3336 }
3337
3338 void
3339 v3d_nir_to_vir(struct v3d_compile *c)
3340 {
3341 if (V3D_DEBUG & (V3D_DEBUG_NIR |
3342 v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
3343 fprintf(stderr, "%s prog %d/%d NIR:\n",
3344 vir_get_stage_name(c),
3345 c->program_id, c->variant_id);
3346 nir_print_shader(c->s, stderr);
3347 }
3348
3349 nir_to_vir(c);
3350
3351 /* Emit the last THRSW before STVPM and TLB writes. */
3352 vir_emit_last_thrsw(c);
3353
3354 switch (c->s->info.stage) {
3355 case MESA_SHADER_FRAGMENT:
3356 emit_frag_end(c);
3357 break;
3358 case MESA_SHADER_GEOMETRY:
3359 emit_geom_end(c);
3360 break;
3361 case MESA_SHADER_VERTEX:
3362 emit_vert_end(c);
3363 break;
3364 case MESA_SHADER_COMPUTE:
3365 break;
3366 default:
3367 unreachable("bad stage");
3368 }
3369
3370 if (V3D_DEBUG & (V3D_DEBUG_VIR |
3371 v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
3372 fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
3373 vir_get_stage_name(c),
3374 c->program_id, c->variant_id);
3375 vir_dump(c);
3376 fprintf(stderr, "\n");
3377 }
3378
3379 vir_optimize(c);
3380
3381 vir_check_payload_w(c);
3382
3383 /* XXX perf: On VC4, we do a VIR-level instruction scheduling here.
3384 * We used that on that platform to pipeline TMU writes and reduce the
3385 * number of thread switches, as well as try (mostly successfully) to
3386 * reduce maximum register pressure to allow more threads. We should
3387 * do something of that sort for V3D -- either instruction scheduling
3388 * here, or delay the THRSW and LDTMUs from our texture
3389 * instructions until the results are needed.
3390 */
3391
3392 if (V3D_DEBUG & (V3D_DEBUG_VIR |
3393 v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
3394 fprintf(stderr, "%s prog %d/%d VIR:\n",
3395 vir_get_stage_name(c),
3396 c->program_id, c->variant_id);
3397 vir_dump(c);
3398 fprintf(stderr, "\n");
3399 }
3400
3401 /* Attempt to allocate registers for the temporaries. If we fail,
3402 * reduce thread count and try again.
3403 */
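/* Note: each time we halve c->threads, every remaining thread gets a
 * larger share of the physical register file, so allocation is retried
 * with more registers per thread at the cost of latency hiding.
 */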
3404 int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
3405 struct qpu_reg *temp_registers;
3406 while (true) {
3407 bool spilled;
3408 temp_registers = v3d_register_allocate(c, &spilled);
3409 if (spilled)
3410 continue;
3411
3412 if (temp_registers)
3413 break;
3414
3415 if (c->threads == min_threads &&
3416 (V3D_DEBUG & V3D_DEBUG_RA)) {
3417 fprintf(stderr,
3418 "Failed to register allocate using %s\n",
3419 c->fallback_scheduler ? "the fallback scheduler" :
3420 "the normal scheduler");
3421
3422 vir_dump(c);
3423
3424 char *shaderdb;
3425 int ret = v3d_shaderdb_dump(c, &shaderdb);
3426 if (ret > 0) {
3427 fprintf(stderr, "%s\n", shaderdb);
3428 free(shaderdb);
3429 }
3430 }
3431
3432 if (c->threads == min_threads) {
3433 if (c->fallback_scheduler) {
3434 fprintf(stderr,
3435 "Failed to register allocate at %d "
3436 "threads with any strategy.\n",
3437 c->threads);
3438 }
3439 c->compilation_result =
3440 V3D_COMPILATION_FAILED_REGISTER_ALLOCATION;
3441 return;
3442 }
3443
3444 c->spill_count = 0;
3445 c->threads /= 2;
3446
3447 if (c->threads == 1)
3448 vir_remove_thrsw(c);
3449 }
3450
3451 if (c->spills &&
3452 (V3D_DEBUG & (V3D_DEBUG_VIR |
3453 v3d_debug_flag_for_shader_stage(c->s->info.stage)))) {
3454 fprintf(stderr, "%s prog %d/%d spilled VIR:\n",
3455 vir_get_stage_name(c),
3456 c->program_id, c->variant_id);
3457 vir_dump(c);
3458 fprintf(stderr, "\n");
3459 }
3460
3461 v3d_vir_to_qpu(c, temp_registers);
3462 }
3463