1 /*
2 * Copyright © 2016 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <inttypes.h>
25 #include "util/format/u_format.h"
26 #include "util/u_helpers.h"
27 #include "util/u_math.h"
28 #include "util/u_memory.h"
29 #include "util/ralloc.h"
30 #include "util/hash_table.h"
31 #include "compiler/nir/nir.h"
32 #include "compiler/nir/nir_builder.h"
33 #include "common/v3d_device_info.h"
34 #include "v3d_compiler.h"
35
36 /* We don't do any address packing. */
37 #define __gen_user_data void
38 #define __gen_address_type uint32_t
39 #define __gen_address_offset(reloc) (*reloc)
40 #define __gen_emit_reloc(cl, reloc)
41 #include "cle/v3d_packet_v42_pack.h"
42
43 #define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7)
44 #define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7)
45 #define GENERAL_TMU_LOOKUP_TYPE_8BIT_I (0 << 0)
46 #define GENERAL_TMU_LOOKUP_TYPE_16BIT_I (1 << 0)
47 #define GENERAL_TMU_LOOKUP_TYPE_VEC2 (2 << 0)
48 #define GENERAL_TMU_LOOKUP_TYPE_VEC3 (3 << 0)
49 #define GENERAL_TMU_LOOKUP_TYPE_VEC4 (4 << 0)
50 #define GENERAL_TMU_LOOKUP_TYPE_8BIT_UI (5 << 0)
51 #define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI (6 << 0)
52 #define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI (7 << 0)
53
54 #define V3D_TSY_SET_QUORUM 0
55 #define V3D_TSY_INC_WAITERS 1
56 #define V3D_TSY_DEC_WAITERS 2
57 #define V3D_TSY_INC_QUORUM 3
58 #define V3D_TSY_DEC_QUORUM 4
59 #define V3D_TSY_FREE_ALL 5
60 #define V3D_TSY_RELEASE 6
61 #define V3D_TSY_ACQUIRE 7
62 #define V3D_TSY_WAIT 8
63 #define V3D_TSY_WAIT_INC 9
64 #define V3D_TSY_WAIT_CHECK 10
65 #define V3D_TSY_WAIT_INC_CHECK 11
66 #define V3D_TSY_WAIT_CV 12
67 #define V3D_TSY_INC_SEMAPHORE 13
68 #define V3D_TSY_DEC_SEMAPHORE 14
69 #define V3D_TSY_SET_QUORUM_FREE_ALL 15
70
71 enum v3d_tmu_op_type
72 {
73 V3D_TMU_OP_TYPE_REGULAR,
74 V3D_TMU_OP_TYPE_ATOMIC,
75 V3D_TMU_OP_TYPE_CACHE
76 };
77
78 static enum v3d_tmu_op_type
v3d_tmu_get_type_from_op(uint32_t tmu_op,bool is_write)79 v3d_tmu_get_type_from_op(uint32_t tmu_op, bool is_write)
80 {
81 switch(tmu_op) {
82 case V3D_TMU_OP_WRITE_ADD_READ_PREFETCH:
83 case V3D_TMU_OP_WRITE_SUB_READ_CLEAR:
84 case V3D_TMU_OP_WRITE_XCHG_READ_FLUSH:
85 case V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH:
86 case V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR:
87 return is_write ? V3D_TMU_OP_TYPE_ATOMIC : V3D_TMU_OP_TYPE_CACHE;
88 case V3D_TMU_OP_WRITE_UMAX:
89 case V3D_TMU_OP_WRITE_SMIN:
90 case V3D_TMU_OP_WRITE_SMAX:
91 assert(is_write);
92 FALLTHROUGH;
93 case V3D_TMU_OP_WRITE_AND_READ_INC:
94 case V3D_TMU_OP_WRITE_OR_READ_DEC:
95 case V3D_TMU_OP_WRITE_XOR_READ_NOT:
96 return V3D_TMU_OP_TYPE_ATOMIC;
97 case V3D_TMU_OP_REGULAR:
98 return V3D_TMU_OP_TYPE_REGULAR;
99
100 default:
101 unreachable("Unknown tmu_op\n");
102 }
103 }
104 static void
105 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
106
107 static void
resize_qreg_array(struct v3d_compile * c,struct qreg ** regs,uint32_t * size,uint32_t decl_size)108 resize_qreg_array(struct v3d_compile *c,
109 struct qreg **regs,
110 uint32_t *size,
111 uint32_t decl_size)
112 {
113 if (*size >= decl_size)
114 return;
115
116 uint32_t old_size = *size;
117 *size = MAX2(*size * 2, decl_size);
118 *regs = reralloc(c, *regs, struct qreg, *size);
119 if (!*regs) {
120 fprintf(stderr, "Malloc failure\n");
121 abort();
122 }
123
124 for (uint32_t i = old_size; i < *size; i++)
125 (*regs)[i] = c->undef;
126 }
127
128 static void
resize_interp_array(struct v3d_compile * c,struct v3d_interp_input ** regs,uint32_t * size,uint32_t decl_size)129 resize_interp_array(struct v3d_compile *c,
130 struct v3d_interp_input **regs,
131 uint32_t *size,
132 uint32_t decl_size)
133 {
134 if (*size >= decl_size)
135 return;
136
137 uint32_t old_size = *size;
138 *size = MAX2(*size * 2, decl_size);
139 *regs = reralloc(c, *regs, struct v3d_interp_input, *size);
140 if (!*regs) {
141 fprintf(stderr, "Malloc failure\n");
142 abort();
143 }
144
145 for (uint32_t i = old_size; i < *size; i++) {
146 (*regs)[i].vp = c->undef;
147 (*regs)[i].C = c->undef;
148 }
149 }
150
151 void
vir_emit_thrsw(struct v3d_compile * c)152 vir_emit_thrsw(struct v3d_compile *c)
153 {
154 if (c->threads == 1)
155 return;
156
157 /* Always thread switch after each texture operation for now.
158 *
159 * We could do better by batching a bunch of texture fetches up and
160 * then doing one thread switch and collecting all their results
161 * afterward.
162 */
163 c->last_thrsw = vir_NOP(c);
164 c->last_thrsw->qpu.sig.thrsw = true;
165 c->last_thrsw_at_top_level = !c->in_control_flow;
166
167 /* We need to lock the scoreboard before any tlb access happens. If this
168 * thread switch comes after we have emitted a tlb load, then it means
169 * that we can't lock on the last thread switch any more.
170 */
171 if (c->emitted_tlb_load)
172 c->lock_scoreboard_on_first_thrsw = true;
173 }
174
175 uint32_t
v3d_get_op_for_atomic_add(nir_intrinsic_instr * instr,unsigned src)176 v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
177 {
178 if (nir_src_is_const(instr->src[src])) {
179 int64_t add_val = nir_src_as_int(instr->src[src]);
180 if (add_val == 1)
181 return V3D_TMU_OP_WRITE_AND_READ_INC;
182 else if (add_val == -1)
183 return V3D_TMU_OP_WRITE_OR_READ_DEC;
184 }
185
186 return V3D_TMU_OP_WRITE_ADD_READ_PREFETCH;
187 }
188
189 static uint32_t
v3d_general_tmu_op_for_atomic(nir_intrinsic_instr * instr)190 v3d_general_tmu_op_for_atomic(nir_intrinsic_instr *instr)
191 {
192 nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr);
193 switch (atomic_op) {
194 case nir_atomic_op_iadd:
195 return instr->intrinsic == nir_intrinsic_ssbo_atomic ?
196 v3d_get_op_for_atomic_add(instr, 2) :
197 v3d_get_op_for_atomic_add(instr, 1);
198 case nir_atomic_op_imin: return V3D_TMU_OP_WRITE_SMIN;
199 case nir_atomic_op_umin: return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
200 case nir_atomic_op_imax: return V3D_TMU_OP_WRITE_SMAX;
201 case nir_atomic_op_umax: return V3D_TMU_OP_WRITE_UMAX;
202 case nir_atomic_op_iand: return V3D_TMU_OP_WRITE_AND_READ_INC;
203 case nir_atomic_op_ior: return V3D_TMU_OP_WRITE_OR_READ_DEC;
204 case nir_atomic_op_ixor: return V3D_TMU_OP_WRITE_XOR_READ_NOT;
205 case nir_atomic_op_xchg: return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
206 case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
207 default: unreachable("unknown atomic op");
208 }
209 }
210
211 static uint32_t
v3d_general_tmu_op(nir_intrinsic_instr * instr)212 v3d_general_tmu_op(nir_intrinsic_instr *instr)
213 {
214 switch (instr->intrinsic) {
215 case nir_intrinsic_load_ssbo:
216 case nir_intrinsic_load_ubo:
217 case nir_intrinsic_load_uniform:
218 case nir_intrinsic_load_shared:
219 case nir_intrinsic_load_scratch:
220 case nir_intrinsic_load_global:
221 case nir_intrinsic_load_global_constant:
222 case nir_intrinsic_store_ssbo:
223 case nir_intrinsic_store_shared:
224 case nir_intrinsic_store_scratch:
225 case nir_intrinsic_store_global:
226 return V3D_TMU_OP_REGULAR;
227
228 case nir_intrinsic_ssbo_atomic:
229 case nir_intrinsic_ssbo_atomic_swap:
230 case nir_intrinsic_shared_atomic:
231 case nir_intrinsic_shared_atomic_swap:
232 case nir_intrinsic_global_atomic:
233 case nir_intrinsic_global_atomic_swap:
234 return v3d_general_tmu_op_for_atomic(instr);
235
236 default:
237 unreachable("unknown intrinsic op");
238 }
239 }
240
241 /**
242 * Checks if pipelining a new TMU operation requiring 'components' LDTMUs
243 * would overflow the Output TMU fifo.
244 *
245 * It is not allowed to overflow the Output fifo, however, we can overflow
246 * Input and Config fifos. Doing that makes the shader stall, but only for as
247 * long as it needs to be able to continue so it is better for pipelining to
248 * let the QPU stall on these if needed than trying to emit TMU flushes in the
249 * driver.
250 */
251 bool
ntq_tmu_fifo_overflow(struct v3d_compile * c,uint32_t components)252 ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components)
253 {
254 if (c->tmu.flush_count >= MAX_TMU_QUEUE_SIZE)
255 return true;
256
257 return components > 0 &&
258 c->tmu.output_fifo_size + components > 16 / c->threads;
259 }
260
261 /**
262 * Emits the thread switch and LDTMU/TMUWT for all outstanding TMU operations,
263 * popping all TMU fifo entries.
264 */
265 void
ntq_flush_tmu(struct v3d_compile * c)266 ntq_flush_tmu(struct v3d_compile *c)
267 {
268 if (c->tmu.flush_count == 0)
269 return;
270
271 vir_emit_thrsw(c);
272
273 bool emitted_tmuwt = false;
274 for (int i = 0; i < c->tmu.flush_count; i++) {
275 if (c->tmu.flush[i].component_mask > 0) {
276 nir_def *def = c->tmu.flush[i].def;
277 assert(def);
278
279 for (int j = 0; j < 4; j++) {
280 if (c->tmu.flush[i].component_mask & (1 << j)) {
281 ntq_store_def(c, def, j,
282 vir_MOV(c, vir_LDTMU(c)));
283 }
284 }
285 } else if (!emitted_tmuwt) {
286 vir_TMUWT(c);
287 emitted_tmuwt = true;
288 }
289 }
290
291 c->tmu.output_fifo_size = 0;
292 c->tmu.flush_count = 0;
293 _mesa_set_clear(c->tmu.outstanding_regs, NULL);
294 }
295
296 /**
297 * Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. The caller
298 * is responsible for ensuring that doing this doesn't overflow the TMU fifos,
299 * and more specifically, the output fifo, since that can't stall.
300 */
301 void
ntq_add_pending_tmu_flush(struct v3d_compile * c,nir_def * def,uint32_t component_mask)302 ntq_add_pending_tmu_flush(struct v3d_compile *c,
303 nir_def *def,
304 uint32_t component_mask)
305 {
306 const uint32_t num_components = util_bitcount(component_mask);
307 assert(!ntq_tmu_fifo_overflow(c, num_components));
308
309 if (num_components > 0) {
310 c->tmu.output_fifo_size += num_components;
311
312 nir_intrinsic_instr *store = nir_store_reg_for_def(def);
313 if (store != NULL) {
314 nir_def *reg = store->src[1].ssa;
315 _mesa_set_add(c->tmu.outstanding_regs, reg);
316 }
317 }
318
319 c->tmu.flush[c->tmu.flush_count].def = def;
320 c->tmu.flush[c->tmu.flush_count].component_mask = component_mask;
321 c->tmu.flush_count++;
322 c->tmu.total_count++;
323
324 if (c->disable_tmu_pipelining)
325 ntq_flush_tmu(c);
326 else if (c->tmu.flush_count > 1)
327 c->pipelined_any_tmu = true;
328 }
329
330 enum emit_mode {
331 MODE_COUNT = 0,
332 MODE_EMIT,
333 MODE_LAST,
334 };
335
336 /**
337 * For a TMU general store instruction:
338 *
339 * In MODE_COUNT mode, records the number of TMU writes required and flushes
340 * any outstanding TMU operations the instruction depends on, but it doesn't
341 * emit any actual register writes.
342 *
343 * In MODE_EMIT mode, emits the data register writes required by the
344 * instruction.
345 */
346 static void
emit_tmu_general_store_writes(struct v3d_compile * c,enum emit_mode mode,nir_intrinsic_instr * instr,uint32_t base_const_offset,uint32_t * writemask,uint32_t * const_offset,uint32_t * type_size,uint32_t * tmu_writes)347 emit_tmu_general_store_writes(struct v3d_compile *c,
348 enum emit_mode mode,
349 nir_intrinsic_instr *instr,
350 uint32_t base_const_offset,
351 uint32_t *writemask,
352 uint32_t *const_offset,
353 uint32_t *type_size,
354 uint32_t *tmu_writes)
355 {
356 struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
357
358 /* Find the first set of consecutive components that
359 * are enabled in the writemask and emit the TMUD
360 * instructions for them.
361 */
362 assert(*writemask != 0);
363 uint32_t first_component = ffs(*writemask) - 1;
364 uint32_t last_component = first_component;
365 while (*writemask & BITFIELD_BIT(last_component + 1))
366 last_component++;
367
368 assert(first_component <= last_component &&
369 last_component < instr->num_components);
370
371 for (int i = first_component; i <= last_component; i++) {
372 struct qreg data = ntq_get_src(c, instr->src[0], i);
373 if (mode == MODE_COUNT)
374 (*tmu_writes)++;
375 else
376 vir_MOV_dest(c, tmud, data);
377 }
378
379 if (mode == MODE_EMIT) {
380 /* Update the offset for the TMU write based on the
381 * the first component we are writing.
382 */
383 *type_size = nir_src_bit_size(instr->src[0]) / 8;
384 *const_offset =
385 base_const_offset + first_component * (*type_size);
386
387 /* Clear these components from the writemask */
388 uint32_t written_mask =
389 BITFIELD_RANGE(first_component, *tmu_writes);
390 (*writemask) &= ~written_mask;
391 }
392 }
393
394 /**
395 * For a TMU general atomic instruction:
396 *
397 * In MODE_COUNT mode, records the number of TMU writes required and flushes
398 * any outstanding TMU operations the instruction depends on, but it doesn't
399 * emit any actual register writes.
400 *
401 * In MODE_EMIT mode, emits the data register writes required by the
402 * instruction.
403 */
404 static void
emit_tmu_general_atomic_writes(struct v3d_compile * c,enum emit_mode mode,nir_intrinsic_instr * instr,uint32_t tmu_op,bool has_index,uint32_t * tmu_writes)405 emit_tmu_general_atomic_writes(struct v3d_compile *c,
406 enum emit_mode mode,
407 nir_intrinsic_instr *instr,
408 uint32_t tmu_op,
409 bool has_index,
410 uint32_t *tmu_writes)
411 {
412 struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
413
414 struct qreg data = ntq_get_src(c, instr->src[1 + has_index], 0);
415 if (mode == MODE_COUNT)
416 (*tmu_writes)++;
417 else
418 vir_MOV_dest(c, tmud, data);
419
420 if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
421 data = ntq_get_src(c, instr->src[2 + has_index], 0);
422 if (mode == MODE_COUNT)
423 (*tmu_writes)++;
424 else
425 vir_MOV_dest(c, tmud, data);
426 }
427 }
428
429 /**
430 * For any TMU general instruction:
431 *
432 * In MODE_COUNT mode, records the number of TMU writes required to emit the
433 * address parameter and flushes any outstanding TMU operations the instruction
434 * depends on, but it doesn't emit any actual register writes.
435 *
436 * In MODE_EMIT mode, emits register writes required to emit the address.
437 */
438 static void
emit_tmu_general_address_write(struct v3d_compile * c,enum emit_mode mode,nir_intrinsic_instr * instr,uint32_t config,bool dynamic_src,int offset_src,struct qreg base_offset,uint32_t const_offset,uint32_t dest_components,uint32_t * tmu_writes)439 emit_tmu_general_address_write(struct v3d_compile *c,
440 enum emit_mode mode,
441 nir_intrinsic_instr *instr,
442 uint32_t config,
443 bool dynamic_src,
444 int offset_src,
445 struct qreg base_offset,
446 uint32_t const_offset,
447 uint32_t dest_components,
448 uint32_t *tmu_writes)
449 {
450 if (mode == MODE_COUNT) {
451 (*tmu_writes)++;
452 if (dynamic_src)
453 ntq_get_src(c, instr->src[offset_src], 0);
454 return;
455 }
456
457 if (vir_in_nonuniform_control_flow(c)) {
458 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
459 V3D_QPU_PF_PUSHZ);
460 }
461
462 struct qreg tmua;
463 if (config == ~0)
464 tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
465 else
466 tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
467
468 struct qinst *tmu;
469 if (dynamic_src) {
470 struct qreg offset = base_offset;
471 if (const_offset != 0) {
472 offset = vir_ADD(c, offset,
473 vir_uniform_ui(c, const_offset));
474 }
475 struct qreg data = ntq_get_src(c, instr->src[offset_src], 0);
476 tmu = vir_ADD_dest(c, tmua, offset, data);
477 } else {
478 if (const_offset != 0) {
479 tmu = vir_ADD_dest(c, tmua, base_offset,
480 vir_uniform_ui(c, const_offset));
481 } else {
482 tmu = vir_MOV_dest(c, tmua, base_offset);
483 }
484 }
485
486 if (config != ~0) {
487 tmu->uniform =
488 vir_get_uniform_index(c, QUNIFORM_CONSTANT, config);
489 }
490
491 if (vir_in_nonuniform_control_flow(c))
492 vir_set_cond(tmu, V3D_QPU_COND_IFA);
493
494 tmu->ldtmu_count = dest_components;
495 }
496
497 /**
498 * Implements indirect uniform loads and SSBO accesses through the TMU general
499 * memory access interface.
500 */
501 static void
ntq_emit_tmu_general(struct v3d_compile * c,nir_intrinsic_instr * instr,bool is_shared_or_scratch,bool is_global)502 ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
503 bool is_shared_or_scratch, bool is_global)
504 {
505 uint32_t tmu_op = v3d_general_tmu_op(instr);
506
507 /* If we were able to replace atomic_add for an inc/dec, then we
508 * need/can to do things slightly different, like not loading the
509 * amount to add/sub, as that is implicit.
510 */
511 bool atomic_add_replaced =
512 (instr->intrinsic == nir_intrinsic_ssbo_atomic ||
513 instr->intrinsic == nir_intrinsic_shared_atomic ||
514 instr->intrinsic == nir_intrinsic_global_atomic) &&
515 nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd &&
516 (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
517 tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC);
518
519 bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
520 instr->intrinsic == nir_intrinsic_store_scratch ||
521 instr->intrinsic == nir_intrinsic_store_shared ||
522 instr->intrinsic == nir_intrinsic_store_global);
523
524 bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
525 instr->intrinsic == nir_intrinsic_load_ubo ||
526 instr->intrinsic == nir_intrinsic_load_ssbo ||
527 instr->intrinsic == nir_intrinsic_load_scratch ||
528 instr->intrinsic == nir_intrinsic_load_shared ||
529 instr->intrinsic == nir_intrinsic_load_global ||
530 instr->intrinsic == nir_intrinsic_load_global_constant);
531
532 if (!is_load)
533 c->tmu_dirty_rcl = true;
534
535 if (is_global)
536 c->has_global_address = true;
537
538 bool has_index = !is_shared_or_scratch && !is_global;
539
540 int offset_src;
541 if (instr->intrinsic == nir_intrinsic_load_uniform) {
542 offset_src = 0;
543 } else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
544 instr->intrinsic == nir_intrinsic_load_ubo ||
545 instr->intrinsic == nir_intrinsic_load_scratch ||
546 instr->intrinsic == nir_intrinsic_load_shared ||
547 instr->intrinsic == nir_intrinsic_load_global ||
548 instr->intrinsic == nir_intrinsic_load_global_constant ||
549 atomic_add_replaced) {
550 offset_src = 0 + has_index;
551 } else if (is_store) {
552 offset_src = 1 + has_index;
553 } else {
554 offset_src = 0 + has_index;
555 }
556
557 bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);
558 uint32_t const_offset = 0;
559 if (!dynamic_src)
560 const_offset = nir_src_as_uint(instr->src[offset_src]);
561
562 struct qreg base_offset;
563 if (instr->intrinsic == nir_intrinsic_load_uniform) {
564 const_offset += nir_intrinsic_base(instr);
565 base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
566 v3d_unit_data_create(0, const_offset));
567 const_offset = 0;
568 } else if (instr->intrinsic == nir_intrinsic_load_ubo) {
569 /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 (0
570 * is gallium's constant buffer 0 in GL and push constants
571 * in Vulkan)).
572 */
573 uint32_t index = nir_src_as_uint(instr->src[0]) + 1;
574 base_offset =
575 vir_uniform(c, QUNIFORM_UBO_ADDR,
576 v3d_unit_data_create(index, const_offset));
577 const_offset = 0;
578 } else if (is_shared_or_scratch) {
579 /* Shared and scratch variables have no buffer index, and all
580 * start from a common base that we set up at the start of
581 * dispatch.
582 */
583 if (instr->intrinsic == nir_intrinsic_load_scratch ||
584 instr->intrinsic == nir_intrinsic_store_scratch) {
585 base_offset = c->spill_base;
586 } else {
587 base_offset = c->cs_shared_offset;
588 const_offset += nir_intrinsic_base(instr);
589 }
590 } else if (is_global) {
591 /* Global load/store intrinsics use gloal addresses, so the
592 * offset is the target address and we don't need to add it
593 * to a base offset.
594 */
595 base_offset = vir_uniform_ui(c, 0);
596 } else {
597 uint32_t idx = is_store ? 1 : 0;
598 base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
599 nir_src_comp_as_uint(instr->src[idx], 0));
600 }
601
602 /* We are ready to emit TMU register writes now, but before we actually
603 * emit them we need to flush outstanding TMU operations if any of our
604 * writes reads from the result of an outstanding TMU operation before
605 * we start the TMU sequence for this operation, since otherwise the
606 * flush could happen in the middle of the TMU sequence we are about to
607 * emit, which is illegal. To do this we run this logic twice, the
608 * first time it will count required register writes and flush pending
609 * TMU requests if necessary due to a dependency, and the second one
610 * will emit the actual TMU writes.
611 */
612 const uint32_t dest_components = nir_intrinsic_dest_components(instr);
613 uint32_t base_const_offset = const_offset;
614 uint32_t writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
615 uint32_t tmu_writes = 0;
616 for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) {
617 assert(mode == MODE_COUNT || tmu_writes > 0);
618
619 uint32_t type_size = 4;
620
621 if (is_store) {
622 emit_tmu_general_store_writes(c, mode, instr,
623 base_const_offset,
624 &writemask,
625 &const_offset,
626 &type_size,
627 &tmu_writes);
628 } else if (!is_load && !atomic_add_replaced) {
629 emit_tmu_general_atomic_writes(c, mode, instr,
630 tmu_op, has_index,
631 &tmu_writes);
632 } else if (is_load) {
633 type_size = instr->def.bit_size / 8;
634 }
635
636 /* For atomics we use 32bit except for CMPXCHG, that we need
637 * to use VEC2. For the rest of the cases we use the number of
638 * tmud writes we did to decide the type. For cache operations
639 * the type is ignored.
640 */
641 uint32_t config = 0;
642 if (mode == MODE_EMIT) {
643 uint32_t num_components;
644 if (is_load || atomic_add_replaced) {
645 num_components = instr->num_components;
646 } else {
647 assert(tmu_writes > 0);
648 num_components = tmu_writes - 1;
649 }
650 bool is_atomic =
651 v3d_tmu_get_type_from_op(tmu_op, !is_load) ==
652 V3D_TMU_OP_TYPE_ATOMIC;
653
654 /* Only load per-quad if we can be certain that all
655 * lines in the quad are active. Notice that demoted
656 * invocations, unlike terminated ones, are still
657 * active: we want to skip memory writes for them but
658 * loads should still work.
659 */
660 uint32_t perquad =
661 is_load && !vir_in_nonuniform_control_flow(c) &&
662 ((c->s->info.stage == MESA_SHADER_FRAGMENT &&
663 c->s->info.fs.needs_quad_helper_invocations &&
664 !c->emitted_discard) ||
665 c->s->info.uses_wide_subgroup_intrinsics) ?
666 GENERAL_TMU_LOOKUP_PER_QUAD :
667 GENERAL_TMU_LOOKUP_PER_PIXEL;
668 config = 0xffffff00 | tmu_op << 3 | perquad;
669
670 if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
671 config |= GENERAL_TMU_LOOKUP_TYPE_VEC2;
672 } else if (is_atomic || num_components == 1) {
673 switch (type_size) {
674 case 4:
675 config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
676 break;
677 case 2:
678 config |= GENERAL_TMU_LOOKUP_TYPE_16BIT_UI;
679 break;
680 case 1:
681 config |= GENERAL_TMU_LOOKUP_TYPE_8BIT_UI;
682 break;
683 default:
684 unreachable("Unsupported bitsize");
685 }
686 } else {
687 assert(type_size == 4);
688 config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
689 num_components - 2;
690 }
691 }
692
693 emit_tmu_general_address_write(c, mode, instr, config,
694 dynamic_src, offset_src,
695 base_offset, const_offset,
696 dest_components, &tmu_writes);
697
698 assert(tmu_writes > 0);
699 if (mode == MODE_COUNT) {
700 /* Make sure we won't exceed the 16-entry TMU
701 * fifo if each thread is storing at the same
702 * time.
703 */
704 while (tmu_writes > 16 / c->threads)
705 c->threads /= 2;
706
707 /* If pipelining this TMU operation would
708 * overflow TMU fifos, we need to flush.
709 */
710 if (ntq_tmu_fifo_overflow(c, dest_components))
711 ntq_flush_tmu(c);
712 } else {
713 /* Delay emission of the thread switch and
714 * LDTMU/TMUWT until we really need to do it to
715 * improve pipelining.
716 */
717 const uint32_t component_mask =
718 (1 << dest_components) - 1;
719 ntq_add_pending_tmu_flush(c, &instr->def,
720 component_mask);
721 }
722 }
723
724 /* nir_lower_wrmasks should've ensured that any writemask on a store
725 * operation only has consecutive bits set, in which case we should've
726 * processed the full writemask above.
727 */
728 assert(writemask == 0);
729 }
730
731 static struct qreg *
ntq_init_ssa_def(struct v3d_compile * c,nir_def * def)732 ntq_init_ssa_def(struct v3d_compile *c, nir_def *def)
733 {
734 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
735 def->num_components);
736 _mesa_hash_table_insert(c->def_ht, def, qregs);
737 return qregs;
738 }
739
740 static bool
is_ld_signal(const struct v3d_qpu_sig * sig)741 is_ld_signal(const struct v3d_qpu_sig *sig)
742 {
743 return (sig->ldunif ||
744 sig->ldunifa ||
745 sig->ldunifrf ||
746 sig->ldunifarf ||
747 sig->ldtmu ||
748 sig->ldvary ||
749 sig->ldvpm ||
750 sig->ldtlb ||
751 sig->ldtlbu);
752 }
753
754 static inline bool
is_ldunif_signal(const struct v3d_qpu_sig * sig)755 is_ldunif_signal(const struct v3d_qpu_sig *sig)
756 {
757 return sig->ldunif || sig->ldunifrf;
758 }
759
760 /**
761 * This function is responsible for getting VIR results into the associated
762 * storage for a NIR instruction.
763 *
764 * If it's a NIR SSA def, then we just set the associated hash table entry to
765 * the new result.
766 *
767 * If it's a NIR reg, then we need to update the existing qreg assigned to the
768 * NIR destination with the incoming value. To do that without introducing
769 * new MOVs, we require that the incoming qreg either be a uniform, or be
770 * SSA-defined by the previous VIR instruction in the block and rewritable by
771 * this function. That lets us sneak ahead and insert the SF flag beforehand
772 * (knowing that the previous instruction doesn't depend on flags) and rewrite
773 * its destination to be the NIR reg's destination
774 */
775 void
ntq_store_def(struct v3d_compile * c,nir_def * def,int chan,struct qreg result)776 ntq_store_def(struct v3d_compile *c, nir_def *def, int chan,
777 struct qreg result)
778 {
779 struct qinst *last_inst = NULL;
780 if (!list_is_empty(&c->cur_block->instructions))
781 last_inst = (struct qinst *)c->cur_block->instructions.prev;
782
783 bool is_reused_uniform =
784 is_ldunif_signal(&c->defs[result.index]->qpu.sig) &&
785 last_inst != c->defs[result.index];
786
787 assert(result.file == QFILE_TEMP && last_inst &&
788 (last_inst == c->defs[result.index] || is_reused_uniform));
789
790 nir_intrinsic_instr *store = nir_store_reg_for_def(def);
791 if (store == NULL) {
792 assert(chan < def->num_components);
793
794 struct qreg *qregs;
795 struct hash_entry *entry =
796 _mesa_hash_table_search(c->def_ht, def);
797
798 if (entry)
799 qregs = entry->data;
800 else
801 qregs = ntq_init_ssa_def(c, def);
802
803 qregs[chan] = result;
804 } else {
805 nir_def *reg = store->src[1].ssa;
806 ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
807 assert(nir_intrinsic_base(store) == 0);
808 assert(nir_intrinsic_num_array_elems(decl) == 0);
809 struct hash_entry *entry =
810 _mesa_hash_table_search(c->def_ht, reg);
811 struct qreg *qregs = entry->data;
812
813 /* If the previous instruction can't be predicated for
814 * the store into the nir_register, then emit a MOV
815 * that can be.
816 */
817 if (is_reused_uniform ||
818 (vir_in_nonuniform_control_flow(c) &&
819 is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig))) {
820 result = vir_MOV(c, result);
821 last_inst = c->defs[result.index];
822 }
823
824 /* We know they're both temps, so just rewrite index. */
825 c->defs[last_inst->dst.index] = NULL;
826 last_inst->dst.index = qregs[chan].index;
827
828 /* If we're in control flow, then make this update of the reg
829 * conditional on the execution mask.
830 */
831 if (vir_in_nonuniform_control_flow(c)) {
832 last_inst->dst.index = qregs[chan].index;
833
834 /* Set the flags to the current exec mask.
835 */
836 c->cursor = vir_before_inst(last_inst);
837 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
838 V3D_QPU_PF_PUSHZ);
839 c->cursor = vir_after_inst(last_inst);
840
841 vir_set_cond(last_inst, V3D_QPU_COND_IFA);
842 }
843 }
844 }
845
846 /**
847 * This looks up the qreg associated with a particular ssa/reg used as a source
848 * in any instruction.
849 *
850 * It is expected that the definition for any NIR value read as a source has
851 * been emitted by a previous instruction, however, in the case of TMU
852 * operations we may have postponed emission of the thread switch and LDTMUs
853 * required to read the TMU results until the results are actually used to
854 * improve pipelining, which then would lead to us not finding them here
855 * (for SSA defs) or finding them in the list of registers awaiting a TMU flush
856 * (for registers), meaning that we need to flush outstanding TMU operations
857 * to read the correct value.
858 */
859 struct qreg
ntq_get_src(struct v3d_compile * c,nir_src src,int i)860 ntq_get_src(struct v3d_compile *c, nir_src src, int i)
861 {
862 struct hash_entry *entry;
863
864 nir_intrinsic_instr *load = nir_load_reg_for_def(src.ssa);
865 if (load == NULL) {
866 assert(i < src.ssa->num_components);
867
868 entry = _mesa_hash_table_search(c->def_ht, src.ssa);
869 if (!entry) {
870 ntq_flush_tmu(c);
871 entry = _mesa_hash_table_search(c->def_ht, src.ssa);
872 }
873 } else {
874 nir_def *reg = load->src[0].ssa;
875 ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
876 assert(nir_intrinsic_base(load) == 0);
877 assert(nir_intrinsic_num_array_elems(decl) == 0);
878 assert(i < nir_intrinsic_num_components(decl));
879
880 if (_mesa_set_search(c->tmu.outstanding_regs, reg))
881 ntq_flush_tmu(c);
882 entry = _mesa_hash_table_search(c->def_ht, reg);
883 }
884 assert(entry);
885
886 struct qreg *qregs = entry->data;
887 return qregs[i];
888 }
889
890 static struct qreg
ntq_get_alu_src(struct v3d_compile * c,nir_alu_instr * instr,unsigned src)891 ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
892 unsigned src)
893 {
894 struct qreg r = ntq_get_src(c, instr->src[src].src,
895 instr->src[src].swizzle[0]);
896
897 return r;
898 };
899
900 static struct qreg
ntq_minify(struct v3d_compile * c,struct qreg size,struct qreg level)901 ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)
902 {
903 return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1));
904 }
905
906 static void
ntq_emit_txs(struct v3d_compile * c,nir_tex_instr * instr)907 ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
908 {
909 unsigned unit = instr->texture_index;
910 int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod);
911 int dest_size = nir_tex_instr_dest_size(instr);
912
913 struct qreg lod = c->undef;
914 if (lod_index != -1)
915 lod = ntq_get_src(c, instr->src[lod_index].src, 0);
916
917 for (int i = 0; i < dest_size; i++) {
918 assert(i < 3);
919 enum quniform_contents contents;
920
921 if (instr->is_array && i == dest_size - 1)
922 contents = QUNIFORM_TEXTURE_ARRAY_SIZE;
923 else
924 contents = QUNIFORM_TEXTURE_WIDTH + i;
925
926 struct qreg size = vir_uniform(c, contents, unit);
927
928 switch (instr->sampler_dim) {
929 case GLSL_SAMPLER_DIM_1D:
930 case GLSL_SAMPLER_DIM_2D:
931 case GLSL_SAMPLER_DIM_MS:
932 case GLSL_SAMPLER_DIM_3D:
933 case GLSL_SAMPLER_DIM_CUBE:
934 case GLSL_SAMPLER_DIM_BUF:
935 case GLSL_SAMPLER_DIM_EXTERNAL:
936 /* Don't minify the array size. */
937 if (!(instr->is_array && i == dest_size - 1)) {
938 size = ntq_minify(c, size, lod);
939 }
940 break;
941
942 case GLSL_SAMPLER_DIM_RECT:
943 /* There's no LOD field for rects */
944 break;
945
946 default:
947 unreachable("Bad sampler type");
948 }
949
950 ntq_store_def(c, &instr->def, i, size);
951 }
952 }
953
954 static void
ntq_emit_tex(struct v3d_compile * c,nir_tex_instr * instr)955 ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
956 {
957 unsigned unit = instr->texture_index;
958
959 /* Since each texture sampling op requires uploading uniforms to
960 * reference the texture, there's no HW support for texture size and
961 * you just upload uniforms containing the size.
962 */
963 switch (instr->op) {
964 case nir_texop_query_levels:
965 ntq_store_def(c, &instr->def, 0,
966 vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
967 return;
968 case nir_texop_texture_samples:
969 ntq_store_def(c, &instr->def, 0,
970 vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit));
971 return;
972 case nir_texop_txs:
973 ntq_emit_txs(c, instr);
974 return;
975 default:
976 break;
977 }
978
979 v3d_vir_emit_tex(c, instr);
980 }
981
982 static struct qreg
ntq_fsincos(struct v3d_compile * c,struct qreg src,bool is_cos)983 ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)
984 {
985 struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI));
986 if (is_cos)
987 input = vir_FADD(c, input, vir_uniform_f(c, 0.5));
988
989 struct qreg periods = vir_FROUND(c, input);
990 struct qreg sin_output = vir_SIN(c, vir_FSUB(c, input, periods));
991 return vir_XOR(c, sin_output, vir_SHL(c,
992 vir_FTOIN(c, periods),
993 vir_uniform_ui(c, -1)));
994 }
995
996 static struct qreg
ntq_fsign(struct v3d_compile * c,struct qreg src)997 ntq_fsign(struct v3d_compile *c, struct qreg src)
998 {
999 struct qreg t = vir_get_temp(c);
1000
1001 vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
1002 vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHZ);
1003 vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
1004 vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHN);
1005 vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
1006 return vir_MOV(c, t);
1007 }
1008
1009 static void
emit_fragcoord_input(struct v3d_compile * c,int attr)1010 emit_fragcoord_input(struct v3d_compile *c, int attr)
1011 {
1012 c->inputs[attr * 4 + 0] = vir_FXCD(c);
1013 c->inputs[attr * 4 + 1] = vir_FYCD(c);
1014 c->inputs[attr * 4 + 2] = c->payload_z;
1015 c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w);
1016 }
1017
1018 static struct qreg
emit_smooth_varying(struct v3d_compile * c,struct qreg vary,struct qreg w,struct qreg c_reg)1019 emit_smooth_varying(struct v3d_compile *c,
1020 struct qreg vary, struct qreg w, struct qreg c_reg)
1021 {
1022 return vir_FADD(c, vir_FMUL(c, vary, w), c_reg);
1023 }
1024
1025 static struct qreg
emit_noperspective_varying(struct v3d_compile * c,struct qreg vary,struct qreg c_reg)1026 emit_noperspective_varying(struct v3d_compile *c,
1027 struct qreg vary, struct qreg c_reg)
1028 {
1029 return vir_FADD(c, vir_MOV(c, vary), c_reg);
1030 }
1031
1032 static struct qreg
emit_flat_varying(struct v3d_compile * c,struct qreg vary,struct qreg c_reg)1033 emit_flat_varying(struct v3d_compile *c,
1034 struct qreg vary, struct qreg c_reg)
1035 {
1036 vir_MOV_dest(c, c->undef, vary);
1037 return vir_MOV(c, c_reg);
1038 }
1039
1040 static struct qreg
emit_fragment_varying(struct v3d_compile * c,nir_variable * var,int8_t input_idx,uint8_t swizzle,int array_index)1041 emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
1042 int8_t input_idx, uint8_t swizzle, int array_index)
1043 {
1044 struct qreg c_reg; /* C coefficient */
1045
1046 if (c->devinfo->has_accumulators)
1047 c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
1048 else
1049 c_reg = vir_reg(QFILE_REG, 0);
1050
1051 struct qinst *ldvary = NULL;
1052 struct qreg vary;
1053 ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
1054 c->undef, c->undef);
1055 ldvary->qpu.sig.ldvary = true;
1056 vary = vir_emit_def(c, ldvary);
1057
1058 /* Store the input value before interpolation so we can implement
1059 * GLSL's interpolateAt functions if the shader uses them.
1060 */
1061 if (input_idx >= 0) {
1062 assert(var);
1063 c->interp[input_idx].vp = vary;
1064 c->interp[input_idx].C = vir_MOV(c, c_reg);
1065 c->interp[input_idx].mode = var->data.interpolation;
1066 }
1067
1068 /* For gl_PointCoord input or distance along a line, we'll be called
1069 * with no nir_variable, and we don't count toward VPM size so we
1070 * don't track an input slot.
1071 */
1072 if (!var) {
1073 assert(input_idx < 0);
1074 return emit_smooth_varying(c, vary, c->payload_w, c_reg);
1075 }
1076
1077 int i = c->num_inputs++;
1078 c->input_slots[i] =
1079 v3d_slot_from_slot_and_component(var->data.location +
1080 array_index, swizzle);
1081
1082 struct qreg result;
1083 switch (var->data.interpolation) {
1084 case INTERP_MODE_NONE:
1085 case INTERP_MODE_SMOOTH:
1086 if (var->data.centroid) {
1087 BITSET_SET(c->centroid_flags, i);
1088 result = emit_smooth_varying(c, vary,
1089 c->payload_w_centroid, c_reg);
1090 } else {
1091 result = emit_smooth_varying(c, vary, c->payload_w, c_reg);
1092 }
1093 break;
1094
1095 case INTERP_MODE_NOPERSPECTIVE:
1096 BITSET_SET(c->noperspective_flags, i);
1097 result = emit_noperspective_varying(c, vary, c_reg);
1098 break;
1099
1100 case INTERP_MODE_FLAT:
1101 BITSET_SET(c->flat_shade_flags, i);
1102 result = emit_flat_varying(c, vary, c_reg);
1103 break;
1104
1105 default:
1106 unreachable("Bad interp mode");
1107 }
1108
1109 if (input_idx >= 0)
1110 c->inputs[input_idx] = result;
1111 return result;
1112 }
1113
1114 static void
emit_fragment_input(struct v3d_compile * c,int base_attr,nir_variable * var,int array_index,unsigned nelem)1115 emit_fragment_input(struct v3d_compile *c, int base_attr, nir_variable *var,
1116 int array_index, unsigned nelem)
1117 {
1118 for (int i = 0; i < nelem ; i++) {
1119 int chan = var->data.location_frac + i;
1120 int input_idx = (base_attr + array_index) * 4 + chan;
1121 emit_fragment_varying(c, var, input_idx, chan, array_index);
1122 }
1123 }
1124
1125 static void
emit_compact_fragment_input(struct v3d_compile * c,int attr,nir_variable * var,int array_index)1126 emit_compact_fragment_input(struct v3d_compile *c, int attr, nir_variable *var,
1127 int array_index)
1128 {
1129 /* Compact variables are scalar arrays where each set of 4 elements
1130 * consumes a single location.
1131 */
1132 int loc_offset = array_index / 4;
1133 int chan = var->data.location_frac + array_index % 4;
1134 int input_idx = (attr + loc_offset) * 4 + chan;
1135 emit_fragment_varying(c, var, input_idx, chan, loc_offset);
1136 }
1137
1138 static void
add_output(struct v3d_compile * c,uint32_t decl_offset,uint8_t slot,uint8_t swizzle)1139 add_output(struct v3d_compile *c,
1140 uint32_t decl_offset,
1141 uint8_t slot,
1142 uint8_t swizzle)
1143 {
1144 uint32_t old_array_size = c->outputs_array_size;
1145 resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
1146 decl_offset + 1);
1147
1148 if (old_array_size != c->outputs_array_size) {
1149 c->output_slots = reralloc(c,
1150 c->output_slots,
1151 struct v3d_varying_slot,
1152 c->outputs_array_size);
1153 }
1154
1155 c->output_slots[decl_offset] =
1156 v3d_slot_from_slot_and_component(slot, swizzle);
1157 }
1158
1159 /**
1160 * If compare_instr is a valid comparison instruction, emits the
1161 * compare_instr's comparison and returns the sel_instr's return value based
1162 * on the compare_instr's result.
1163 */
1164 static bool
ntq_emit_comparison(struct v3d_compile * c,nir_alu_instr * compare_instr,enum v3d_qpu_cond * out_cond)1165 ntq_emit_comparison(struct v3d_compile *c,
1166 nir_alu_instr *compare_instr,
1167 enum v3d_qpu_cond *out_cond)
1168 {
1169 struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
1170 struct qreg src1;
1171 if (nir_op_infos[compare_instr->op].num_inputs > 1)
1172 src1 = ntq_get_alu_src(c, compare_instr, 1);
1173 bool cond_invert = false;
1174 struct qreg nop = vir_nop_reg();
1175
1176 switch (compare_instr->op) {
1177 case nir_op_feq32:
1178 case nir_op_seq:
1179 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1180 break;
1181 case nir_op_ieq32:
1182 vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1183 break;
1184
1185 case nir_op_fneu32:
1186 case nir_op_sne:
1187 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1188 cond_invert = true;
1189 break;
1190 case nir_op_ine32:
1191 vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1192 cond_invert = true;
1193 break;
1194
1195 case nir_op_fge32:
1196 case nir_op_sge:
1197 vir_set_pf(c, vir_FCMP_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1198 break;
1199 case nir_op_ige32:
1200 vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1201 cond_invert = true;
1202 break;
1203 case nir_op_uge32:
1204 vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
1205 cond_invert = true;
1206 break;
1207
1208 case nir_op_slt:
1209 case nir_op_flt32:
1210 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHN);
1211 break;
1212 case nir_op_ilt32:
1213 vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1214 break;
1215 case nir_op_ult32:
1216 vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
1217 break;
1218
1219 default:
1220 return false;
1221 }
1222
1223 *out_cond = cond_invert ? V3D_QPU_COND_IFNA : V3D_QPU_COND_IFA;
1224
1225 return true;
1226 }
1227
1228 /* Finds an ALU instruction that generates our src value that could
1229 * (potentially) be greedily emitted in the consuming instruction.
1230 */
1231 static struct nir_alu_instr *
ntq_get_alu_parent(nir_src src)1232 ntq_get_alu_parent(nir_src src)
1233 {
1234 if (src.ssa->parent_instr->type != nir_instr_type_alu)
1235 return NULL;
1236 nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr);
1237 if (!instr)
1238 return NULL;
1239
1240 /* If the ALU instr's srcs are non-SSA, then we would have to avoid
1241 * moving emission of the ALU instr down past another write of the
1242 * src.
1243 */
1244 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1245 if (nir_load_reg_for_def(instr->src[i].src.ssa))
1246 return NULL;
1247 }
1248
1249 return instr;
1250 }
1251
1252 /* Turns a NIR bool into a condition code to predicate on. */
1253 static enum v3d_qpu_cond
ntq_emit_bool_to_cond(struct v3d_compile * c,nir_src src)1254 ntq_emit_bool_to_cond(struct v3d_compile *c, nir_src src)
1255 {
1256 struct qreg qsrc = ntq_get_src(c, src, 0);
1257 /* skip if we already have src in the flags */
1258 if (qsrc.file == QFILE_TEMP && c->flags_temp == qsrc.index)
1259 return c->flags_cond;
1260
1261 nir_alu_instr *compare = ntq_get_alu_parent(src);
1262 if (!compare)
1263 goto out;
1264
1265 enum v3d_qpu_cond cond;
1266 if (ntq_emit_comparison(c, compare, &cond))
1267 return cond;
1268
1269 out:
1270
1271 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), ntq_get_src(c, src, 0)),
1272 V3D_QPU_PF_PUSHZ);
1273 return V3D_QPU_COND_IFNA;
1274 }
1275
1276 static struct qreg
ntq_emit_cond_to_bool(struct v3d_compile * c,enum v3d_qpu_cond cond)1277 ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond)
1278 {
1279 struct qreg result =
1280 vir_MOV(c, vir_SEL(c, cond,
1281 vir_uniform_ui(c, ~0),
1282 vir_uniform_ui(c, 0)));
1283 c->flags_temp = result.index;
1284 c->flags_cond = cond;
1285 return result;
1286 }
1287
1288 static struct qreg
ntq_emit_cond_to_int(struct v3d_compile * c,enum v3d_qpu_cond cond)1289 ntq_emit_cond_to_int(struct v3d_compile *c, enum v3d_qpu_cond cond)
1290 {
1291 struct qreg result =
1292 vir_MOV(c, vir_SEL(c, cond,
1293 vir_uniform_ui(c, 1),
1294 vir_uniform_ui(c, 0)));
1295 c->flags_temp = result.index;
1296 c->flags_cond = cond;
1297 return result;
1298 }
1299
1300 static struct qreg
f2f16_rtz(struct v3d_compile * c,struct qreg f32)1301 f2f16_rtz(struct v3d_compile *c, struct qreg f32)
1302 {
1303 /* The GPU doesn't provide a mechanism to modify the f32->f16 rounding
1304 * method and seems to be using RTE by default, so we need to implement
1305 * RTZ rounding in software.
1306 */
1307 struct qreg rf16 = vir_FMOV(c, f32);
1308 vir_set_pack(c->defs[rf16.index], V3D_QPU_PACK_L);
1309
1310 struct qreg rf32 = vir_FMOV(c, rf16);
1311 vir_set_unpack(c->defs[rf32.index], 0, V3D_QPU_UNPACK_L);
1312
1313 struct qreg f32_abs = vir_FMOV(c, f32);
1314 vir_set_unpack(c->defs[f32_abs.index], 0, V3D_QPU_UNPACK_ABS);
1315
1316 struct qreg rf32_abs = vir_FMOV(c, rf32);
1317 vir_set_unpack(c->defs[rf32_abs.index], 0, V3D_QPU_UNPACK_ABS);
1318
1319 vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), f32_abs, rf32_abs),
1320 V3D_QPU_PF_PUSHN);
1321 return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
1322 vir_SUB(c, rf16, vir_uniform_ui(c, 1)), rf16));
1323 }
1324
1325 /**
1326 * Takes the result value of a signed integer width conversion from a smaller
1327 * type to a larger type and if needed, it applies sign extension to it.
1328 */
1329 static struct qreg
sign_extend(struct v3d_compile * c,struct qreg value,uint32_t src_bit_size,uint32_t dst_bit_size)1330 sign_extend(struct v3d_compile *c,
1331 struct qreg value,
1332 uint32_t src_bit_size,
1333 uint32_t dst_bit_size)
1334 {
1335 assert(src_bit_size < dst_bit_size);
1336
1337 struct qreg tmp = vir_MOV(c, value);
1338
1339 /* Do we need to sign-extend? */
1340 uint32_t sign_mask = 1 << (src_bit_size - 1);
1341 struct qinst *sign_check =
1342 vir_AND_dest(c, vir_nop_reg(),
1343 tmp, vir_uniform_ui(c, sign_mask));
1344 vir_set_pf(c, sign_check, V3D_QPU_PF_PUSHZ);
1345
1346 /* If so, fill in leading sign bits */
1347 uint32_t extend_bits = ~(((1 << src_bit_size) - 1)) &
1348 ((1ull << dst_bit_size) - 1);
1349 struct qinst *extend_inst =
1350 vir_OR_dest(c, tmp, tmp,
1351 vir_uniform_ui(c, extend_bits));
1352 vir_set_cond(extend_inst, V3D_QPU_COND_IFNA);
1353
1354 return tmp;
1355 }
1356
1357 static void
ntq_emit_alu(struct v3d_compile * c,nir_alu_instr * instr)1358 ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
1359 {
1360 /* Vectors are special in that they have non-scalarized writemasks,
1361 * and just take the first swizzle channel for each argument in order
1362 * into each writemask channel.
1363 */
1364 if (instr->op == nir_op_vec2 ||
1365 instr->op == nir_op_vec3 ||
1366 instr->op == nir_op_vec4) {
1367 struct qreg srcs[4];
1368 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1369 srcs[i] = ntq_get_src(c, instr->src[i].src,
1370 instr->src[i].swizzle[0]);
1371 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1372 ntq_store_def(c, &instr->def, i,
1373 vir_MOV(c, srcs[i]));
1374 return;
1375 }
1376
1377 /* General case: We can just grab the one used channel per src. */
1378 struct qreg src[nir_op_infos[instr->op].num_inputs];
1379 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1380 src[i] = ntq_get_alu_src(c, instr, i);
1381 }
1382
1383 struct qreg result;
1384
1385 switch (instr->op) {
1386 case nir_op_mov:
1387 result = vir_MOV(c, src[0]);
1388 break;
1389
1390 case nir_op_fneg:
1391 result = vir_XOR(c, src[0], vir_uniform_ui(c, UINT32_C(1) << 31));
1392 break;
1393 case nir_op_ineg:
1394 result = vir_NEG(c, src[0]);
1395 break;
1396
1397 case nir_op_fmul:
1398 result = vir_FMUL(c, src[0], src[1]);
1399 break;
1400 case nir_op_fadd:
1401 result = vir_FADD(c, src[0], src[1]);
1402 break;
1403 case nir_op_fsub:
1404 result = vir_FSUB(c, src[0], src[1]);
1405 break;
1406 case nir_op_fmin:
1407 result = vir_FMIN(c, src[0], src[1]);
1408 break;
1409 case nir_op_fmax:
1410 result = vir_FMAX(c, src[0], src[1]);
1411 break;
1412
1413 case nir_op_f2i32: {
1414 nir_alu_instr *src0_alu = ntq_get_alu_parent(instr->src[0].src);
1415 if (src0_alu && src0_alu->op == nir_op_fround_even) {
1416 result = vir_FTOIN(c, ntq_get_alu_src(c, src0_alu, 0));
1417 } else {
1418 result = vir_FTOIZ(c, src[0]);
1419 }
1420 if (nir_src_bit_size(instr->src[0].src) == 16)
1421 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1422 break;
1423 }
1424
1425 case nir_op_f2u32:
1426 result = vir_FTOUZ(c, src[0]);
1427 if (nir_src_bit_size(instr->src[0].src) == 16)
1428 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1429 break;
1430 case nir_op_i2f32: {
1431 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1432 assert(bit_size <= 32);
1433 result = src[0];
1434 if (bit_size < 32) {
1435 uint32_t mask = bit_size == 16 ? 0xffff : 0xff;
1436 result = vir_AND(c, result, vir_uniform_ui(c, mask));
1437 result = sign_extend(c, result, bit_size, 32);
1438 }
1439 result = vir_ITOF(c, result);
1440 break;
1441 }
1442 case nir_op_u2f32: {
1443 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1444 assert(bit_size <= 32);
1445 result = src[0];
1446 if (bit_size < 32) {
1447 uint32_t mask = bit_size == 16 ? 0xffff : 0xff;
1448 result = vir_AND(c, result, vir_uniform_ui(c, mask));
1449 }
1450 result = vir_UTOF(c, result);
1451 break;
1452 }
1453 case nir_op_b2f16:
1454 result = vir_AND(c, src[0], vir_uniform_ui(c, 0x3c00));
1455 break;
1456 case nir_op_b2f32:
1457 result = vir_AND(c, src[0], vir_uniform_f(c, 1.0));
1458 break;
1459 case nir_op_b2i8:
1460 case nir_op_b2i16:
1461 case nir_op_b2i32:
1462 result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
1463 break;
1464
1465 case nir_op_i2f16: {
1466 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1467 assert(bit_size <= 32);
1468 if (bit_size < 32) {
1469 uint32_t mask = bit_size == 16 ? 0xffff : 0xff;
1470 result = vir_AND(c, src[0], vir_uniform_ui(c, mask));
1471 result = sign_extend(c, result, bit_size, 32);
1472 } else {
1473 result = src[0];
1474 }
1475 result = vir_ITOF(c, result);
1476 vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L);
1477 break;
1478 }
1479
1480 case nir_op_u2f16: {
1481 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1482 assert(bit_size <= 32);
1483 if (bit_size < 32) {
1484 uint32_t mask = bit_size == 16 ? 0xffff : 0xff;
1485 result = vir_AND(c, src[0], vir_uniform_ui(c, mask));
1486 } else {
1487 result = src[0];
1488 }
1489 result = vir_UTOF(c, result);
1490 vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L);
1491 break;
1492 }
1493
1494 case nir_op_f2f16:
1495 case nir_op_f2f16_rtne:
1496 assert(nir_src_bit_size(instr->src[0].src) == 32);
1497 result = vir_FMOV(c, src[0]);
1498 vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L);
1499 break;
1500
1501 case nir_op_f2f16_rtz:
1502 assert(nir_src_bit_size(instr->src[0].src) == 32);
1503 result = f2f16_rtz(c, src[0]);
1504 break;
1505
1506 case nir_op_f2f32:
1507 assert(nir_src_bit_size(instr->src[0].src) == 16);
1508 result = vir_FMOV(c, src[0]);
1509 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1510 break;
1511
1512 case nir_op_i2i16: {
1513 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1514 assert(bit_size == 32 || bit_size == 8);
1515 if (bit_size == 32) {
1516 /* We don't have integer pack/unpack methods for
1517 * converting between 16-bit and 32-bit, so we implement
1518 * the conversion manually by truncating the src.
1519 */
1520 result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff));
1521 } else {
1522 struct qreg tmp = vir_AND(c, src[0],
1523 vir_uniform_ui(c, 0xff));
1524 result = vir_MOV(c, sign_extend(c, tmp, bit_size, 16));
1525 }
1526 break;
1527 }
1528
1529 case nir_op_u2u16: {
1530 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1531 assert(bit_size == 32 || bit_size == 8);
1532
1533 /* We don't have integer pack/unpack methods for converting
1534 * between 16-bit and 32-bit, so we implement the conversion
1535 * manually by truncating the src. For the 8-bit case, we
1536 * want to make sure we don't copy garbage from any of the
1537 * 24 MSB bits.
1538 */
1539 if (bit_size == 32)
1540 result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff));
1541 else
1542 result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff));
1543 break;
1544 }
1545
1546 case nir_op_i2i8:
1547 case nir_op_u2u8:
1548 assert(nir_src_bit_size(instr->src[0].src) == 32 ||
1549 nir_src_bit_size(instr->src[0].src) == 16);
1550 /* We don't have integer pack/unpack methods for converting
1551 * between 8-bit and 32-bit, so we implement the conversion
1552 * manually by truncating the src.
1553 */
1554 result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff));
1555 break;
1556
1557 case nir_op_u2u32: {
1558 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1559 assert(bit_size == 16 || bit_size == 8);
1560
1561 /* we don't have a native 8-bit/16-bit MOV so we copy all 32-bit
1562 * from the src but we make sure to clear any garbage bits that
1563 * may be present in the invalid src bits.
1564 */
1565 uint32_t mask = (1 << bit_size) - 1;
1566 result = vir_AND(c, src[0], vir_uniform_ui(c, mask));
1567 break;
1568 }
1569
1570 case nir_op_i2i32: {
1571 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1572 assert(bit_size == 16 || bit_size == 8);
1573
1574 uint32_t mask = (1 << bit_size) - 1;
1575 struct qreg tmp = vir_AND(c, src[0],
1576 vir_uniform_ui(c, mask));
1577
1578 result = vir_MOV(c, sign_extend(c, tmp, bit_size, 32));
1579 break;
1580 }
1581
1582 case nir_op_iadd:
1583 result = vir_ADD(c, src[0], src[1]);
1584 break;
1585 case nir_op_ushr:
1586 result = vir_SHR(c, src[0], src[1]);
1587 break;
1588 case nir_op_isub:
1589 result = vir_SUB(c, src[0], src[1]);
1590 break;
1591 case nir_op_ishr:
1592 result = vir_ASR(c, src[0], src[1]);
1593 break;
1594 case nir_op_ishl:
1595 result = vir_SHL(c, src[0], src[1]);
1596 break;
1597 case nir_op_imin:
1598 result = vir_MIN(c, src[0], src[1]);
1599 break;
1600 case nir_op_umin:
1601 result = vir_UMIN(c, src[0], src[1]);
1602 break;
1603 case nir_op_imax:
1604 result = vir_MAX(c, src[0], src[1]);
1605 break;
1606 case nir_op_umax:
1607 result = vir_UMAX(c, src[0], src[1]);
1608 break;
1609 case nir_op_iand:
1610 result = vir_AND(c, src[0], src[1]);
1611 break;
1612 case nir_op_ior:
1613 result = vir_OR(c, src[0], src[1]);
1614 break;
1615 case nir_op_ixor:
1616 result = vir_XOR(c, src[0], src[1]);
1617 break;
1618 case nir_op_inot:
1619 result = vir_NOT(c, src[0]);
1620 break;
1621
1622 case nir_op_uclz:
1623 result = vir_CLZ(c, src[0]);
1624 break;
1625
1626 case nir_op_imul:
1627 result = vir_UMUL(c, src[0], src[1]);
1628 break;
1629
1630 case nir_op_seq:
1631 case nir_op_sne:
1632 case nir_op_sge:
1633 case nir_op_slt: {
1634 enum v3d_qpu_cond cond;
1635 ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
1636 assert(ok);
1637 result = vir_MOV(c, vir_SEL(c, cond,
1638 vir_uniform_f(c, 1.0),
1639 vir_uniform_f(c, 0.0)));
1640 c->flags_temp = result.index;
1641 c->flags_cond = cond;
1642 break;
1643 }
1644
1645 case nir_op_feq32:
1646 case nir_op_fneu32:
1647 case nir_op_fge32:
1648 case nir_op_flt32:
1649 case nir_op_ieq32:
1650 case nir_op_ine32:
1651 case nir_op_ige32:
1652 case nir_op_uge32:
1653 case nir_op_ilt32:
1654 case nir_op_ult32: {
1655 enum v3d_qpu_cond cond;
1656 ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
1657 assert(ok);
1658 result = ntq_emit_cond_to_bool(c, cond);
1659 break;
1660 }
1661
1662 case nir_op_b32csel:
1663 result = vir_MOV(c,
1664 vir_SEL(c,
1665 ntq_emit_bool_to_cond(c, instr->src[0].src),
1666 src[1], src[2]));
1667 break;
1668
1669 case nir_op_fcsel:
1670 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), src[0]),
1671 V3D_QPU_PF_PUSHZ);
1672 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
1673 src[1], src[2]));
1674 break;
1675
1676 case nir_op_frcp:
1677 result = vir_RECIP(c, src[0]);
1678 break;
1679 case nir_op_frsq:
1680 result = vir_RSQRT(c, src[0]);
1681 break;
1682 case nir_op_fexp2:
1683 result = vir_EXP(c, src[0]);
1684 break;
1685 case nir_op_flog2:
1686 result = vir_LOG(c, src[0]);
1687 break;
1688
1689 case nir_op_fceil:
1690 result = vir_FCEIL(c, src[0]);
1691 break;
1692 case nir_op_ffloor:
1693 result = vir_FFLOOR(c, src[0]);
1694 break;
1695 case nir_op_fround_even:
1696 result = vir_FROUND(c, src[0]);
1697 break;
1698 case nir_op_ftrunc:
1699 result = vir_FTRUNC(c, src[0]);
1700 break;
1701
1702 case nir_op_fsin:
1703 result = ntq_fsincos(c, src[0], false);
1704 break;
1705 case nir_op_fcos:
1706 result = ntq_fsincos(c, src[0], true);
1707 break;
1708
1709 case nir_op_fsign:
1710 result = ntq_fsign(c, src[0]);
1711 break;
1712
1713 case nir_op_fabs: {
1714 result = vir_FMOV(c, src[0]);
1715 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS);
1716 break;
1717 }
1718
1719 case nir_op_iabs:
1720 result = vir_MAX(c, src[0], vir_NEG(c, src[0]));
1721 break;
1722
1723 case nir_op_uadd_carry:
1724 vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]),
1725 V3D_QPU_PF_PUSHC);
1726 result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA);
1727 break;
1728
1729 case nir_op_usub_borrow:
1730 vir_set_pf(c, vir_SUB_dest(c, vir_nop_reg(), src[0], src[1]),
1731 V3D_QPU_PF_PUSHC);
1732 result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA);
1733 break;
1734
1735 case nir_op_pack_half_2x16_split:
1736 result = vir_VFPACK(c, src[0], src[1]);
1737 break;
1738
1739 case nir_op_pack_2x32_to_2x16_v3d:
1740 result = vir_VPACK(c, src[0], src[1]);
1741 break;
1742
1743 case nir_op_pack_32_to_r11g11b10_v3d:
1744 result = vir_V11FPACK(c, src[0], src[1]);
1745 break;
1746
1747 case nir_op_pack_uint_32_to_r10g10b10a2_v3d:
1748 result = vir_V10PACK(c, src[0], src[1]);
1749 break;
1750
1751 case nir_op_pack_4x16_to_4x8_v3d:
1752 result = vir_V8PACK(c, src[0], src[1]);
1753 break;
1754
1755 case nir_op_unpack_half_2x16_split_x:
1756 result = vir_FMOV(c, src[0]);
1757 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1758 break;
1759
1760 case nir_op_unpack_half_2x16_split_y:
1761 result = vir_FMOV(c, src[0]);
1762 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H);
1763 break;
1764
1765 case nir_op_pack_2x16_to_unorm_2x8_v3d:
1766 result = vir_VFTOUNORM8(c, src[0]);
1767 break;
1768
1769 case nir_op_pack_2x16_to_snorm_2x8_v3d:
1770 result = vir_VFTOSNORM8(c, src[0]);
1771 break;
1772
1773 case nir_op_pack_2x16_to_unorm_2x10_v3d:
1774 result = vir_VFTOUNORM10LO(c, src[0]);
1775 break;
1776
1777 case nir_op_pack_2x16_to_unorm_10_2_v3d:
1778 result = vir_VFTOUNORM10HI(c, src[0]);
1779 break;
1780
1781 case nir_op_f2unorm_16_v3d:
1782 result = vir_FTOUNORM16(c, src[0]);
1783 break;
1784
1785 case nir_op_f2snorm_16_v3d:
1786 result = vir_FTOSNORM16(c, src[0]);
1787 break;
1788
1789 case nir_op_fsat:
1790 assert(v3d_device_has_unpack_sat(c->devinfo));
1791 result = vir_FMOV(c, src[0]);
1792 vir_set_unpack(c->defs[result.index], 0, V3D71_QPU_UNPACK_SAT);
1793 break;
1794
1795 case nir_op_fsat_signed:
1796 assert(v3d_device_has_unpack_sat(c->devinfo));
1797 result = vir_FMOV(c, src[0]);
1798 vir_set_unpack(c->defs[result.index], 0, V3D71_QPU_UNPACK_NSAT);
1799 break;
1800
1801 case nir_op_fclamp_pos:
1802 assert(v3d_device_has_unpack_max0(c->devinfo));
1803 result = vir_FMOV(c, src[0]);
1804 vir_set_unpack(c->defs[result.index], 0, V3D71_QPU_UNPACK_MAX0);
1805 break;
1806
1807 default:
1808 fprintf(stderr, "unknown NIR ALU inst: ");
1809 nir_print_instr(&instr->instr, stderr);
1810 fprintf(stderr, "\n");
1811 abort();
1812 }
1813
1814 ntq_store_def(c, &instr->def, 0, result);
1815 }
1816
1817 /* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
1818 * specifier. They come from a register that's preloaded with 0xffffffff
1819 * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low
1820 * 8 bits are shifted off the bottom and 0xff shifted in from the top.
1821 */
1822 #define TLB_TYPE_F16_COLOR (3 << 6)
1823 #define TLB_TYPE_I32_COLOR (1 << 6)
1824 #define TLB_TYPE_F32_COLOR (0 << 6)
1825 #define TLB_RENDER_TARGET_SHIFT 3 /* Reversed! 7 = RT 0, 0 = RT 7. */
1826 #define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2)
1827 #define TLB_SAMPLE_MODE_PER_PIXEL (1 << 2)
1828 #define TLB_F16_SWAP_HI_LO (1 << 1)
1829 #define TLB_VEC_SIZE_4_F16 (1 << 0)
1830 #define TLB_VEC_SIZE_2_F16 (0 << 0)
1831 #define TLB_VEC_SIZE_MINUS_1_SHIFT 0
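/* Worked example (added, derived from the defines above): the 0xff default
 * mentioned in the comment decomposes as
 *
 *   TLB_TYPE_F16_COLOR                 (3 << 6) = 0xc0
 * | (7 - 0) << TLB_RENDER_TARGET_SHIFT          = 0x38  (render target 0)
 * | TLB_SAMPLE_MODE_PER_PIXEL          (1 << 2) = 0x04
 * | TLB_F16_SWAP_HI_LO                 (1 << 1) = 0x02
 * | TLB_VEC_SIZE_4_F16                 (1 << 0) = 0x01
 *                                                 ----
 *                                                 0xff
 *
 * i.e. a per-pixel, hi/lo-swapped, vec4 f16 write to RT0.
 */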
1832
1833 /* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z"
1834 * flag is set.
1835 */
1836 #define TLB_TYPE_DEPTH ((2 << 6) | (0 << 4))
1837 #define TLB_DEPTH_TYPE_INVARIANT (0 << 2) /* Unmodified sideband input used */
1838 #define TLB_DEPTH_TYPE_PER_PIXEL (1 << 2) /* QPU result used */
1839 #define TLB_V42_DEPTH_TYPE_INVARIANT (0 << 3) /* Unmodified sideband input used */
1840 #define TLB_V42_DEPTH_TYPE_PER_PIXEL (1 << 3) /* QPU result used */
1841
1842 /* Stencil is a single 32-bit write. */
1843 #define TLB_TYPE_STENCIL_ALPHA ((2 << 6) | (1 << 4))
1844
1845 static void
1846 vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)
1847 {
1848 if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt])
1849 return;
1850
1851 struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB);
1852 struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
1853
1854 nir_variable *var = c->output_color_var[rt];
1855 int num_components = glsl_get_vector_elements(var->type);
1856 uint32_t conf = 0xffffff00;
1857 struct qinst *inst;
1858
1859 conf |= c->msaa_per_sample_output ? TLB_SAMPLE_MODE_PER_SAMPLE :
1860 TLB_SAMPLE_MODE_PER_PIXEL;
1861 conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
1862
1863 if (c->fs_key->swap_color_rb & (1 << rt))
1864 num_components = MAX2(num_components, 3);
1865 assert(num_components != 0);
1866
1867 enum glsl_base_type type = glsl_get_base_type(var->type);
1868 bool is_int_format = type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT;
1869 bool is_32b_tlb_format = is_int_format ||
1870 (c->fs_key->f32_color_rb & (1 << rt));
1871
1872 if (is_int_format) {
1873 /* The F32 vs I32 distinction was dropped in 4.2. */
1874 if (c->devinfo->ver < 42)
1875 conf |= TLB_TYPE_I32_COLOR;
1876 else
1877 conf |= TLB_TYPE_F32_COLOR;
1878 conf |= ((num_components - 1) << TLB_VEC_SIZE_MINUS_1_SHIFT);
1879 } else {
1880 if (c->fs_key->f32_color_rb & (1 << rt)) {
1881 conf |= TLB_TYPE_F32_COLOR;
1882 conf |= ((num_components - 1) <<
1883 TLB_VEC_SIZE_MINUS_1_SHIFT);
1884 } else {
1885 conf |= TLB_TYPE_F16_COLOR;
1886 conf |= TLB_F16_SWAP_HI_LO;
1887 if (num_components >= 3)
1888 conf |= TLB_VEC_SIZE_4_F16;
1889 else
1890 conf |= TLB_VEC_SIZE_2_F16;
1891 }
1892 }
1893
1894 int num_samples = c->msaa_per_sample_output ? V3D_MAX_SAMPLES : 1;
1895 for (int i = 0; i < num_samples; i++) {
1896 struct qreg *color = c->msaa_per_sample_output ?
1897 &c->sample_colors[(rt * V3D_MAX_SAMPLES + i) * 4] :
1898 &c->outputs[var->data.driver_location * 4];
1899
1900 struct qreg r = color[0];
1901 struct qreg g = color[1];
1902 struct qreg b = color[2];
1903 struct qreg a = color[3];
1904
1905 if (c->fs_key->swap_color_rb & (1 << rt)) {
1906 r = color[2];
1907 b = color[0];
1908 }
1909
1910 if (c->fs_key->sample_alpha_to_one)
1911 a = vir_uniform_f(c, 1.0);
1912
1913 if (is_32b_tlb_format) {
1914 if (i == 0) {
1915 inst = vir_MOV_dest(c, tlbu_reg, r);
1916 inst->uniform =
1917 vir_get_uniform_index(c,
1918 QUNIFORM_CONSTANT,
1919 conf);
1920 } else {
1921 vir_MOV_dest(c, tlb_reg, r);
1922 }
1923
1924 if (num_components >= 2)
1925 vir_MOV_dest(c, tlb_reg, g);
1926 if (num_components >= 3)
1927 vir_MOV_dest(c, tlb_reg, b);
1928 if (num_components >= 4)
1929 vir_MOV_dest(c, tlb_reg, a);
1930 } else {
1931 inst = vir_VFPACK_dest(c, tlb_reg, r, g);
1932 if (conf != ~0 && i == 0) {
1933 inst->dst = tlbu_reg;
1934 inst->uniform =
1935 vir_get_uniform_index(c,
1936 QUNIFORM_CONSTANT,
1937 conf);
1938 }
1939
1940 if (num_components >= 3)
1941 vir_VFPACK_dest(c, tlb_reg, b, a);
1942 }
1943 }
1944 }
1945
1946 static void
1947 emit_frag_end(struct v3d_compile *c)
1948 {
1949 if (c->output_sample_mask_index != -1) {
1950 vir_SETMSF_dest(c, vir_nop_reg(),
1951 vir_AND(c,
1952 vir_MSF(c),
1953 c->outputs[c->output_sample_mask_index]));
1954 }
1955
1956 bool has_any_tlb_color_write = false;
1957 for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) {
1958 if (c->fs_key->cbufs & (1 << rt) && c->output_color_var[rt])
1959 has_any_tlb_color_write = true;
1960 }
1961
1962 if (c->fs_key->sample_alpha_to_coverage && c->output_color_var[0]) {
1963 struct nir_variable *var = c->output_color_var[0];
1964 struct qreg *color = &c->outputs[var->data.driver_location * 4];
1965
1966 vir_SETMSF_dest(c, vir_nop_reg(),
1967 vir_AND(c,
1968 vir_MSF(c),
1969 vir_FTOC(c, color[3])));
1970 }
1971
1972 struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
1973
1974 /* If the shader has no non-TLB side effects and doesn't write Z
1975 * we can promote it to enabling early_fragment_tests even
1976 * if the user didn't.
1977 */
1978 if (c->output_position_index == -1 &&
1979 !(c->s->info.num_images || c->s->info.num_ssbos) &&
1980 !c->fs_key->sample_alpha_to_coverage &&
1981 c->output_sample_mask_index == -1 &&
1982 has_any_tlb_color_write) {
1983 c->s->info.fs.early_fragment_tests =
1984 !c->s->info.fs.uses_discard ||
1985 c->fs_key->can_earlyz_with_discard;
1986 }
1987
1988 /* By default, Z buffer writes are implicit using the Z values produced
1989 * from FEP (Z value produced from rasterization). When this is not
1990 * desirable (shader writes Z explicitly, has discards, etc) we need
1991 * to let the hardware know by setting c->writes_z to true, in which
1992 * case we always need to write a Z value from the QPU, even if it is
1993 * just the passthrough Z value produced from FEP.
1994 *
1995 * Also, from the V3D 4.2 spec:
1996 *
1997 * "If a shader performs a Z read the “Fragment shader does Z writes”
1998 * bit in the shader record must be enabled to ensure deterministic
1999 * results"
2000 *
2001 * So if c->reads_z is set we always need to write Z, even if it is
2002 * a passthrough from the Z value produced from FEP.
2003 */
2004 if (!c->s->info.fs.early_fragment_tests || c->reads_z) {
2005 c->writes_z = true;
2006 uint8_t tlb_specifier = TLB_TYPE_DEPTH;
2007 struct qinst *inst;
2008
2009 if (c->output_position_index != -1) {
2010 /* Shader writes to gl_FragDepth, use that */
2011 inst = vir_MOV_dest(c, tlbu_reg,
2012 c->outputs[c->output_position_index]);
2013
2014 tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL |
2015 TLB_SAMPLE_MODE_PER_PIXEL);
2016 } else {
2017 /* Shader doesn't write to gl_FragDepth, take Z from
2018 * FEP.
2019 */
2020 c->writes_z_from_fep = true;
2021 inst = vir_MOV_dest(c, tlbu_reg, vir_nop_reg());
2022
2023 /* The spec says the PER_PIXEL flag is ignored for
2024 * invariant writes, but the simulator demands it.
2025 */
2026 tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT |
2027 TLB_SAMPLE_MODE_PER_PIXEL);
2028
2029 /* Since (single-threaded) fragment shaders always need
2030                          * a TLB write, if we don't have any we emit a
2031                          * passthrough Z and flag ourselves as potentially discarding,
2032 * so that we can use Z as the required TLB write.
2033 */
2034 if (!has_any_tlb_color_write)
2035 c->s->info.fs.uses_discard = true;
2036 }
2037
2038 inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
2039 tlb_specifier |
2040 0xffffff00);
2041 inst->is_tlb_z_write = true;
2042 }
2043
2044 /* XXX: Performance improvement: Merge Z write and color writes TLB
2045 * uniform setup
2046 */
2047 for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++)
2048 vir_emit_tlb_color_write(c, rt);
2049 }
2050
2051 static inline void
2052 vir_VPM_WRITE_indirect(struct v3d_compile *c,
2053 struct qreg val,
2054 struct qreg vpm_index,
2055 bool uniform_vpm_index)
2056 {
2057 if (uniform_vpm_index)
2058 vir_STVPMV(c, vpm_index, val);
2059 else
2060 vir_STVPMD(c, vpm_index, val);
2061 }
2062
2063 static void
2064 vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
2065 {
2066 vir_VPM_WRITE_indirect(c, val,
2067 vir_uniform_ui(c, vpm_index), true);
2068 }
2069
2070 static void
2071 emit_vert_end(struct v3d_compile *c)
2072 {
2073 /* GFXH-1684: VPM writes need to be complete by the end of the shader.
2074 */
2075 if (c->devinfo->ver == 42)
2076 vir_VPMWT(c);
2077 }
2078
2079 static void
2080 emit_geom_end(struct v3d_compile *c)
2081 {
2082 /* GFXH-1684: VPM writes need to be complete by the end of the shader.
2083 */
2084 if (c->devinfo->ver == 42)
2085 vir_VPMWT(c);
2086 }
2087
2088 static bool
2089 mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
2090 unsigned bit_size,
2091 unsigned num_components,
2092 int64_t hole_size,
2093 nir_intrinsic_instr *low,
2094 nir_intrinsic_instr *high,
2095 void *data)
2096 {
2097 if (hole_size > 0 || !nir_num_components_valid(num_components))
2098 return false;
2099
2100 /* TMU general access only supports 32-bit vectors */
2101 if (bit_size > 32)
2102 return false;
2103
2104 if ((bit_size == 8 || bit_size == 16) && num_components > 1)
2105 return false;
2106
2107 if (align_mul % 4 != 0 || align_offset % 4 != 0)
2108 return false;
2109
2110 /* Vector accesses wrap at 16-byte boundaries so we can't vectorize
2111 * if the resulting vector crosses a 16-byte boundary.
2112 */
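        /* Worked example (added): with align_mul = 8 and align_offset = 4 the
         * worst-case start within a 16-byte region is 16 - 8 + 4 = 12 bytes
         * in, so even a 2-component (8-byte) vector could reach byte 20 and
         * must be rejected by the check below.
         */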
2113 assert(util_is_power_of_two_nonzero(align_mul));
2114 align_mul = MIN2(align_mul, 16);
2115 align_offset &= 0xf;
2116 if (16 - align_mul + align_offset + num_components * 4 > 16)
2117 return false;
2118
2119 return true;
2120 }
2121
2122 void
2123 v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
2124 {
2125 bool progress;
2126 unsigned lower_flrp =
2127 (s->options->lower_flrp16 ? 16 : 0) |
2128 (s->options->lower_flrp32 ? 32 : 0) |
2129 (s->options->lower_flrp64 ? 64 : 0);
2130
2131 do {
2132 progress = false;
2133
2134 NIR_PASS(progress, s, nir_split_array_vars, nir_var_function_temp);
2135 NIR_PASS(progress, s, nir_shrink_vec_array_vars, nir_var_function_temp);
2136 NIR_PASS(progress, s, nir_opt_deref);
2137
2138 NIR_PASS(progress, s, nir_lower_vars_to_ssa);
2139 if (!s->info.var_copies_lowered) {
2140 /* Only run this pass if nir_lower_var_copies was not called
2141 * yet. That would lower away any copy_deref instructions and we
2142 * don't want to introduce any more.
2143 */
2144 NIR_PASS(progress, s, nir_opt_find_array_copies);
2145 }
2146
2147 NIR_PASS(progress, s, nir_opt_copy_prop_vars);
2148 NIR_PASS(progress, s, nir_opt_dead_write_vars);
2149 NIR_PASS(progress, s, nir_opt_combine_stores, nir_var_all);
2150
2151 NIR_PASS(progress, s, nir_remove_dead_variables,
2152 (nir_variable_mode)(nir_var_function_temp |
2153 nir_var_shader_temp |
2154 nir_var_mem_shared),
2155 NULL);
2156
2157 NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
2158 NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);
2159 NIR_PASS(progress, s, nir_copy_prop);
2160 NIR_PASS(progress, s, nir_opt_remove_phis);
2161 NIR_PASS(progress, s, nir_opt_dce);
2162 NIR_PASS(progress, s, nir_opt_dead_cf);
2163 NIR_PASS(progress, s, nir_opt_cse);
2164 /* before peephole_select as it can generate 64 bit bcsels */
2165 NIR_PASS(progress, s, nir_lower_64bit_phis);
2166 NIR_PASS(progress, s, nir_opt_peephole_select, 0, false, false);
2167 NIR_PASS(progress, s, nir_opt_peephole_select, 24, true, true);
2168 NIR_PASS(progress, s, nir_opt_algebraic);
2169 NIR_PASS(progress, s, nir_opt_constant_folding);
2170
2171 NIR_PASS(progress, s, nir_opt_intrinsics);
2172 NIR_PASS(progress, s, nir_opt_idiv_const, 32);
2173 NIR_PASS(progress, s, nir_lower_alu);
2174
2175 if (nir_opt_loop(s)) {
2176 progress = true;
2177 NIR_PASS(progress, s, nir_copy_prop);
2178 NIR_PASS(progress, s, nir_opt_dce);
2179 }
2180
2181 NIR_PASS(progress, s, nir_opt_conditional_discard);
2182
2183 NIR_PASS(progress, s, nir_opt_remove_phis);
2184 NIR_PASS(progress, s, nir_opt_if, false);
2185 if (c && !c->disable_gcm) {
2186 bool local_progress = false;
2187 NIR_PASS(local_progress, s, nir_opt_gcm, false);
2188 c->gcm_progress |= local_progress;
2189 progress |= local_progress;
2190 }
2191
2192 /* Note that vectorization may undo the load/store scalarization
2193 * pass we run for non 32-bit TMU general load/store by
2194 * converting, for example, 2 consecutive 16-bit loads into a
2195 * single 32-bit load. This is fine (and desirable) as long as
2196 * the resulting 32-bit load meets 32-bit alignment requirements,
2197 * which mem_vectorize_callback() should be enforcing.
2198 */
2199 nir_load_store_vectorize_options vectorize_opts = {
2200 .modes = nir_var_mem_ssbo | nir_var_mem_ubo |
2201 nir_var_mem_push_const | nir_var_mem_shared |
2202 nir_var_mem_global,
2203 .callback = mem_vectorize_callback,
2204 .robust_modes = 0,
2205 };
2206 bool vectorize_progress = false;
2207
2208
2209 /* This requires that we have called
2210 * nir_lower_vars_to_explicit_types / nir_lower_explicit_io
2211 * first, which we may not have done yet if we call here too
2212                          * early during NIR pre-processing. We can detect this because
2213                          * in that case we won't have a compile object.
2214 */
2215 if (c) {
2216 NIR_PASS(vectorize_progress, s, nir_opt_load_store_vectorize,
2217 &vectorize_opts);
2218 if (vectorize_progress) {
2219 NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
2220 NIR_PASS(progress, s, nir_lower_pack);
2221 progress = true;
2222 }
2223 }
2224
2225 if (lower_flrp != 0) {
2226 bool lower_flrp_progress = false;
2227
2228 NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,
2229 lower_flrp,
2230 false /* always_precise */);
2231 if (lower_flrp_progress) {
2232 NIR_PASS(progress, s, nir_opt_constant_folding);
2233 progress = true;
2234 }
2235
2236 /* Nothing should rematerialize any flrps, so we only
2237 * need to do this lowering once.
2238 */
2239 lower_flrp = 0;
2240 }
2241
2242 NIR_PASS(progress, s, nir_opt_undef);
2243 NIR_PASS(progress, s, nir_lower_undef_to_zero);
2244
2245 if (c && !c->disable_loop_unrolling &&
2246 s->options->max_unroll_iterations > 0) {
2247 bool local_progress = false;
2248 NIR_PASS(local_progress, s, nir_opt_loop_unroll);
2249 c->unrolled_any_loops |= local_progress;
2250 progress |= local_progress;
2251 }
2252 } while (progress);
2253
2254 /* needs to be outside of optimization loop, otherwise it fights with
2255 * opt_algebraic optimizing the conversion lowering
2256 */
2257 NIR_PASS(progress, s, v3d_nir_lower_algebraic, c);
2258 NIR_PASS(progress, s, nir_opt_cse);
2259
2260 nir_move_options sink_opts =
2261 nir_move_const_undef | nir_move_comparisons | nir_move_copies |
2262 nir_move_load_ubo | nir_move_load_ssbo | nir_move_load_uniform;
2263 NIR_PASS(progress, s, nir_opt_sink, sink_opts);
2264 }
2265
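/* Comment added: qsort-style comparator used with nir_sort_variables_with_modes
 * below; it orders variables by driver_location, breaking ties with
 * location_frac (the component).
 */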
2266 static int
2267 driver_location_compare(const nir_variable *a, const nir_variable *b)
2268 {
2269 return a->data.driver_location == b->data.driver_location ?
2270 a->data.location_frac - b->data.location_frac :
2271 a->data.driver_location - b->data.driver_location;
2272 }
2273
2274 static struct qreg
2275 ntq_emit_vpm_read(struct v3d_compile *c, uint32_t num_components)
2276 {
2277 return vir_LDVPMV_IN(c,
2278 vir_uniform_ui(c, num_components));
2279 }
2280
2281 static void
2282 ntq_setup_vs_inputs(struct v3d_compile *c)
2283 {
2284 /* Figure out how many components of each vertex attribute the shader
2285 * uses. Each variable should have been split to individual
2286 * components and unused ones DCEed. The vertex fetcher will load
2287 * from the start of the attribute to the number of components we
2288 * declare we need in c->vattr_sizes[].
2289 *
2290 * BGRA vertex attributes are a bit special: since we implement these
2291 * as RGBA swapping R/B components we always need at least 3 components
2292 * if component 0 is read.
2293 */
2294 nir_foreach_shader_in_variable(var, c->s) {
2295 /* No VS attribute array support. */
2296 assert(MAX2(glsl_get_length(var->type), 1) == 1);
2297
2298 unsigned loc = var->data.driver_location;
2299 int start_component = var->data.location_frac;
2300 int num_components = glsl_get_components(var->type);
2301
2302 c->vattr_sizes[loc] = MAX2(c->vattr_sizes[loc],
2303 start_component + num_components);
2304
2305 /* Handle BGRA inputs */
2306 if (start_component == 0 &&
2307 c->vs_key->va_swap_rb_mask & (1 << var->data.location)) {
2308 c->vattr_sizes[loc] = MAX2(3, c->vattr_sizes[loc]);
2309 }
2310 }
2311
2312 uint32_t vpm_components = 0;
2313 bool uses_iid = BITSET_TEST(c->s->info.system_values_read,
2314 SYSTEM_VALUE_INSTANCE_ID) ||
2315 BITSET_TEST(c->s->info.system_values_read,
2316 SYSTEM_VALUE_INSTANCE_INDEX);
2317 bool uses_biid = BITSET_TEST(c->s->info.system_values_read,
2318 SYSTEM_VALUE_BASE_INSTANCE);
2319 bool uses_vid = BITSET_TEST(c->s->info.system_values_read,
2320 SYSTEM_VALUE_VERTEX_ID) ||
2321 BITSET_TEST(c->s->info.system_values_read,
2322 SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
2323
2324 if (uses_iid)
2325 c->iid = ntq_emit_vpm_read(c, vpm_components++);
2326
2327 if (uses_biid)
2328 c->biid = ntq_emit_vpm_read(c, vpm_components++);
2329
2330 if (uses_vid)
2331 c->vid = ntq_emit_vpm_read(c, vpm_components++);
2332
2333 /* The actual loads will happen directly in nir_intrinsic_load_input
2334 */
2335 return;
2336 }
2337
2338 static bool
2339 program_reads_point_coord(struct v3d_compile *c)
2340 {
2341 nir_foreach_shader_in_variable(var, c->s) {
2342 if (util_varying_is_point_coord(var->data.location,
2343 c->fs_key->point_sprite_mask)) {
2344 return true;
2345 }
2346 }
2347
2348 return false;
2349 }
2350
2351 static void
2352 ntq_setup_gs_inputs(struct v3d_compile *c)
2353 {
2354 nir_sort_variables_with_modes(c->s, driver_location_compare,
2355 nir_var_shader_in);
2356
2357 nir_foreach_shader_in_variable(var, c->s) {
2358 /* All GS inputs are arrays with as many entries as vertices
2359 * in the input primitive, but here we only care about the
2360 * per-vertex input type.
2361 */
2362 assert(glsl_type_is_array(var->type));
2363 const struct glsl_type *type = glsl_get_array_element(var->type);
2364 unsigned var_len = glsl_count_vec4_slots(type, false, false);
2365 unsigned loc = var->data.driver_location;
2366
2367 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2368 (loc + var_len) * 4);
2369
2370 if (var->data.compact) {
2371 for (unsigned j = 0; j < var_len; j++) {
2372 unsigned input_idx = c->num_inputs++;
2373 unsigned loc_frac = var->data.location_frac + j;
2374 unsigned loc = var->data.location + loc_frac / 4;
2375 unsigned comp = loc_frac % 4;
2376 c->input_slots[input_idx] =
2377 v3d_slot_from_slot_and_component(loc, comp);
2378 }
2379 continue;
2380 }
2381
2382 for (unsigned j = 0; j < var_len; j++) {
2383 unsigned num_elements =
2384 glsl_type_is_struct(glsl_without_array(type)) ?
2385 4 : glsl_get_vector_elements(type);
2386 for (unsigned k = 0; k < num_elements; k++) {
2387 unsigned chan = var->data.location_frac + k;
2388 unsigned input_idx = c->num_inputs++;
2389 struct v3d_varying_slot slot =
2390 v3d_slot_from_slot_and_component(var->data.location + j, chan);
2391 c->input_slots[input_idx] = slot;
2392 }
2393 }
2394 }
2395 }
2396
2397
2398 static void
2399 ntq_setup_fs_inputs(struct v3d_compile *c)
2400 {
2401 nir_sort_variables_with_modes(c->s, driver_location_compare,
2402 nir_var_shader_in);
2403
2404 nir_foreach_shader_in_variable(var, c->s) {
2405 unsigned var_len = glsl_count_vec4_slots(var->type, false, false);
2406 unsigned loc = var->data.driver_location;
2407
2408 uint32_t inputs_array_size = c->inputs_array_size;
2409 uint32_t inputs_array_required_size = (loc + var_len) * 4;
2410 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2411 inputs_array_required_size);
2412 resize_interp_array(c, &c->interp, &inputs_array_size,
2413 inputs_array_required_size);
2414
2415 if (var->data.location == VARYING_SLOT_POS) {
2416 emit_fragcoord_input(c, loc);
2417 } else if (var->data.location == VARYING_SLOT_PRIMITIVE_ID &&
2418 !c->fs_key->has_gs) {
2419 /* If the fragment shader reads gl_PrimitiveID and we
2420 * don't have a geometry shader in the pipeline to write
2421 * it then we program the hardware to inject it as
2422 * an implicit varying. Take it from there.
2423 */
2424 c->inputs[loc * 4] = c->primitive_id;
2425 } else if (util_varying_is_point_coord(var->data.location,
2426 c->fs_key->point_sprite_mask)) {
2427 c->inputs[loc * 4 + 0] = c->point_x;
2428 c->inputs[loc * 4 + 1] = c->point_y;
2429 } else if (var->data.compact) {
2430 for (int j = 0; j < var_len; j++)
2431 emit_compact_fragment_input(c, loc, var, j);
2432 } else if (glsl_type_is_struct(glsl_without_array(var->type))) {
2433 for (int j = 0; j < var_len; j++) {
2434 emit_fragment_input(c, loc, var, j, 4);
2435 }
2436 } else {
2437 for (int j = 0; j < var_len; j++) {
2438 emit_fragment_input(c, loc, var, j, glsl_get_vector_elements(var->type));
2439 }
2440 }
2441 }
2442 }
2443
2444 static void
2445 ntq_setup_outputs(struct v3d_compile *c)
2446 {
2447 if (c->s->info.stage != MESA_SHADER_FRAGMENT)
2448 return;
2449
2450 nir_foreach_shader_out_variable(var, c->s) {
2451 assert(glsl_type_is_vector_or_scalar(var->type));
2452 unsigned loc = var->data.driver_location * 4;
2453
2454 for (int i = 0; i < 4 - var->data.location_frac; i++) {
2455 add_output(c, loc + var->data.location_frac + i,
2456 var->data.location,
2457 var->data.location_frac + i);
2458 }
2459
2460 switch (var->data.location) {
2461 case FRAG_RESULT_COLOR:
2462 for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++)
2463 c->output_color_var[i] = var;
2464 break;
2465 case FRAG_RESULT_DATA0:
2466 case FRAG_RESULT_DATA1:
2467 case FRAG_RESULT_DATA2:
2468 case FRAG_RESULT_DATA3:
2469 case FRAG_RESULT_DATA4:
2470 case FRAG_RESULT_DATA5:
2471 case FRAG_RESULT_DATA6:
2472 case FRAG_RESULT_DATA7:
2473 c->output_color_var[var->data.location -
2474 FRAG_RESULT_DATA0] = var;
2475 break;
2476 case FRAG_RESULT_DEPTH:
2477 c->output_position_index = loc;
2478 break;
2479 case FRAG_RESULT_SAMPLE_MASK:
2480 c->output_sample_mask_index = loc;
2481 break;
2482 }
2483 }
2484 }
2485
2486 /**
2487 * Sets up the mapping from nir_register to struct qreg *.
2488 *
2489 * Each nir_register gets a struct qreg per 32-bit component being stored.
2490 */
2491 static void
2492 ntq_setup_registers(struct v3d_compile *c, nir_function_impl *impl)
2493 {
2494 nir_foreach_reg_decl(decl, impl) {
2495 unsigned num_components = nir_intrinsic_num_components(decl);
2496 unsigned array_len = nir_intrinsic_num_array_elems(decl);
2497 array_len = MAX2(array_len, 1);
2498 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
2499 array_len * num_components);
2500
2501 nir_def *nir_reg = &decl->def;
2502 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
2503
2504 for (int i = 0; i < array_len * num_components; i++)
2505 qregs[i] = vir_get_temp(c);
2506 }
2507 }
2508
2509 static void
2510 ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
2511 {
2512 /* XXX perf: Experiment with using immediate loads to avoid having
2513 * these end up in the uniform stream. Watch out for breaking the
2514 * small immediates optimization in the process!
2515 */
2516 struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
2517 for (int i = 0; i < instr->def.num_components; i++)
2518 qregs[i] = vir_uniform_ui(c, instr->value[i].u32);
2519
2520 _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
2521 }
2522
2523 static void
2524 ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)
2525 {
2526 unsigned image_index = nir_src_as_uint(instr->src[0]);
2527 bool is_array = nir_intrinsic_image_array(instr);
2528
2529 assert(nir_src_as_uint(instr->src[1]) == 0);
2530
2531 ntq_store_def(c, &instr->def, 0,
2532 vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index));
2533 if (instr->num_components > 1) {
2534 ntq_store_def(c, &instr->def, 1,
2535 vir_uniform(c,
2536 instr->num_components == 2 && is_array ?
2537 QUNIFORM_IMAGE_ARRAY_SIZE :
2538 QUNIFORM_IMAGE_HEIGHT,
2539 image_index));
2540 }
2541 if (instr->num_components > 2) {
2542 ntq_store_def(c, &instr->def, 2,
2543 vir_uniform(c,
2544 is_array ?
2545 QUNIFORM_IMAGE_ARRAY_SIZE :
2546 QUNIFORM_IMAGE_DEPTH,
2547 image_index));
2548 }
2549 }
2550
2551 static void
2552 vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
2553 {
2554 assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
2555
2556 int rt = nir_src_as_uint(instr->src[0]);
2557 assert(rt < V3D_MAX_DRAW_BUFFERS);
2558
2559         int sample_index = nir_intrinsic_base(instr);
2560 assert(sample_index < V3D_MAX_SAMPLES);
2561
2562 int component = nir_intrinsic_component(instr);
2563 assert(component < 4);
2564
2565 /* We need to emit our TLB reads after we have acquired the scoreboard
2566 * lock, or the GPU will hang. Usually, we do our scoreboard locking on
2567 * the last thread switch to improve parallelism, however, that is only
2568 * guaranteed to happen before the tlb color writes.
2569 *
2570 * To fix that, we make sure we always emit a thread switch before the
2571 * first tlb color read. If that happens to be the last thread switch
2572 * we emit, then everything is fine, but otherwise, if any code after
2573 * this point needs to emit additional thread switches, then we will
2574 * switch the strategy to locking the scoreboard on the first thread
2575 * switch instead -- see vir_emit_thrsw().
2576 */
2577 if (!c->emitted_tlb_load) {
2578 if (!c->last_thrsw_at_top_level)
2579 vir_emit_thrsw(c);
2580
2581 c->emitted_tlb_load = true;
2582 }
2583
2584 struct qreg *color_reads_for_sample =
2585 &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4];
2586
2587 if (color_reads_for_sample[component].file == QFILE_NULL) {
2588 enum pipe_format rt_format = c->fs_key->color_fmt[rt].format;
2589 int num_components =
2590 util_format_get_nr_components(rt_format);
2591
2592 const bool swap_rb = c->fs_key->swap_color_rb & (1 << rt);
2593 if (swap_rb)
2594 num_components = MAX2(num_components, 3);
2595
2596 nir_variable *var = c->output_color_var[rt];
2597 enum glsl_base_type type = glsl_get_base_type(var->type);
2598
2599 bool is_int_format = type == GLSL_TYPE_INT ||
2600 type == GLSL_TYPE_UINT;
2601
2602 bool is_32b_tlb_format = is_int_format ||
2603 (c->fs_key->f32_color_rb & (1 << rt));
2604
2605 int num_samples = c->fs_key->msaa ? V3D_MAX_SAMPLES : 1;
2606
2607 uint32_t conf = 0xffffff00;
2608 conf |= c->fs_key->msaa ? TLB_SAMPLE_MODE_PER_SAMPLE :
2609 TLB_SAMPLE_MODE_PER_PIXEL;
2610 conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
2611
2612 if (is_32b_tlb_format) {
2613 /* The F32 vs I32 distinction was dropped in 4.2. */
2614 conf |= (c->devinfo->ver < 42 && is_int_format) ?
2615 TLB_TYPE_I32_COLOR : TLB_TYPE_F32_COLOR;
2616
2617 conf |= ((num_components - 1) <<
2618 TLB_VEC_SIZE_MINUS_1_SHIFT);
2619 } else {
2620 conf |= TLB_TYPE_F16_COLOR;
2621 conf |= TLB_F16_SWAP_HI_LO;
2622
2623 if (num_components >= 3)
2624 conf |= TLB_VEC_SIZE_4_F16;
2625 else
2626 conf |= TLB_VEC_SIZE_2_F16;
2627 }
2628
2629
2630 for (int i = 0; i < num_samples; i++) {
2631 struct qreg r, g, b, a;
2632 if (is_32b_tlb_format) {
2633                                 r = conf != 0xffffffff && i == 0 ?
2634 vir_TLBU_COLOR_READ(c, conf) :
2635 vir_TLB_COLOR_READ(c);
2636 if (num_components >= 2)
2637 g = vir_TLB_COLOR_READ(c);
2638 if (num_components >= 3)
2639 b = vir_TLB_COLOR_READ(c);
2640 if (num_components >= 4)
2641 a = vir_TLB_COLOR_READ(c);
2642 } else {
2643 struct qreg rg = conf != 0xffffffff && i == 0 ?
2644 vir_TLBU_COLOR_READ(c, conf) :
2645 vir_TLB_COLOR_READ(c);
2646 r = vir_FMOV(c, rg);
2647 vir_set_unpack(c->defs[r.index], 0,
2648 V3D_QPU_UNPACK_L);
2649 g = vir_FMOV(c, rg);
2650 vir_set_unpack(c->defs[g.index], 0,
2651 V3D_QPU_UNPACK_H);
2652
2653 if (num_components > 2) {
2654 struct qreg ba = vir_TLB_COLOR_READ(c);
2655 b = vir_FMOV(c, ba);
2656 vir_set_unpack(c->defs[b.index], 0,
2657 V3D_QPU_UNPACK_L);
2658 a = vir_FMOV(c, ba);
2659 vir_set_unpack(c->defs[a.index], 0,
2660 V3D_QPU_UNPACK_H);
2661 }
2662 }
2663
2664 struct qreg *color_reads =
2665 &c->color_reads[(rt * V3D_MAX_SAMPLES + i) * 4];
2666
2667 color_reads[0] = swap_rb ? b : r;
2668 if (num_components >= 2)
2669 color_reads[1] = g;
2670 if (num_components >= 3)
2671 color_reads[2] = swap_rb ? r : b;
2672 if (num_components >= 4)
2673 color_reads[3] = a;
2674 }
2675 }
2676
2677 assert(color_reads_for_sample[component].file != QFILE_NULL);
2678 ntq_store_def(c, &instr->def, 0,
2679 vir_MOV(c, color_reads_for_sample[component]));
2680 }
2681
2682 static bool
2683 ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr);
2684
2685 static bool
2686 try_emit_uniform(struct v3d_compile *c,
2687 int offset,
2688 int num_components,
2689 nir_def *def,
2690 enum quniform_contents contents)
2691 {
2692 /* Even though ldunif is strictly 32-bit we can still use it
2693 * to load scalar 8-bit/16-bit uniforms so long as their offset
2694 * is 32-bit aligned. In this case, ldunif would still load
2695 * 32-bit into the destination with the 8-bit/16-bit uniform
2696 * data in the LSB and garbage in the MSB, but that is fine
2697 * because we should only be accessing the valid bits of the
2698 * destination.
2699 *
2700 * FIXME: if in the future we improve our register allocator to
2701 * pack 2 16-bit variables in the MSB and LSB of the same
2702 * register then this optimization would not be valid as is,
2703 * since the load clobbers the MSB.
2704 */
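        /* Illustrative example (added): a 16-bit uniform at byte offset 8 is
         * loaded as uniform dword 2 (the upper bits of the destination are
         * simply not accessed), while one at byte offset 6 is rejected below
         * because it is not 32-bit aligned.
         */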
2705 if (offset % 4 != 0)
2706 return false;
2707
2708 /* We need dwords */
2709 offset = offset / 4;
2710
2711 for (int i = 0; i < num_components; i++) {
2712 ntq_store_def(c, def, i, vir_uniform(c, contents, offset + i));
2713 }
2714
2715 return true;
2716 }
2717
2718 static void
2719 ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
2720 {
2721 /* We scalarize general TMU access for anything that is not 32-bit. */
2722 assert(instr->def.bit_size == 32 ||
2723 instr->num_components == 1);
2724
2725 /* Try to emit ldunif if possible, otherwise fallback to general TMU */
2726 if (nir_src_is_const(instr->src[0])) {
2727 int offset = (nir_intrinsic_base(instr) +
2728 nir_src_as_uint(instr->src[0]));
2729
2730 if (try_emit_uniform(c, offset, instr->num_components,
2731 &instr->def, QUNIFORM_UNIFORM)) {
2732 return;
2733 }
2734 }
2735
2736 if (!ntq_emit_load_unifa(c, instr)) {
2737 ntq_emit_tmu_general(c, instr, false, false);
2738 c->has_general_tmu_load = true;
2739 }
2740 }
2741
2742 static bool
2743 ntq_emit_inline_ubo_load(struct v3d_compile *c, nir_intrinsic_instr *instr)
2744 {
2745 if (c->compiler->max_inline_uniform_buffers <= 0)
2746 return false;
2747
2748 /* Regular UBOs start after inline UBOs */
2749 uint32_t index = nir_src_as_uint(instr->src[0]);
2750 if (index >= c->compiler->max_inline_uniform_buffers)
2751 return false;
2752
2753 /* We scalarize general TMU access for anything that is not 32-bit */
2754 assert(instr->def.bit_size == 32 ||
2755 instr->num_components == 1);
2756
2757 if (nir_src_is_const(instr->src[1])) {
2758 int offset = nir_src_as_uint(instr->src[1]);
2759 if (try_emit_uniform(c, offset, instr->num_components,
2760 &instr->def,
2761 QUNIFORM_INLINE_UBO_0 + index)) {
2762 return true;
2763 }
2764 }
2765
2766 /* Fallback to regular UBO load */
2767 return false;
2768 }
2769
2770 static void
2771 ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
2772 {
2773 /* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset).
2774 *
2775 * Right now the driver sets support_indirect_inputs even
2776 * if we don't support non-uniform offsets because we also set the
2777 * lower_all_io_to_temps option in the NIR compiler. This ensures that
2778 * any indirect indexing on in/out variables is turned into indirect
2779 * indexing on temporary variables instead, that we handle by lowering
2780 * to scratch. If we implement non-uniform offset here we might be able
2781 * to avoid the temp and scratch lowering, which involves copying from
2782 * the input to the temp variable, possibly making code more optimal.
2783 */
2784 unsigned offset =
2785 nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]);
2786
2787 if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
2788 /* Emit the LDVPM directly now, rather than at the top
2789 * of the shader like we did for V3D 3.x (which needs
2790 * vpmsetup when not just taking the next offset).
2791 *
2792 * Note that delaying like this may introduce stalls,
2793 * as LDVPMV takes a minimum of 1 instruction but may
2794 * be slower if the VPM unit is busy with another QPU.
2795 */
2796 int index = 0;
2797 if (BITSET_TEST(c->s->info.system_values_read,
2798 SYSTEM_VALUE_INSTANCE_ID)) {
2799 index++;
2800 }
2801 if (BITSET_TEST(c->s->info.system_values_read,
2802 SYSTEM_VALUE_BASE_INSTANCE)) {
2803 index++;
2804 }
2805 if (BITSET_TEST(c->s->info.system_values_read,
2806 SYSTEM_VALUE_VERTEX_ID)) {
2807 index++;
2808 }
2809
2810 for (int i = 0; i < offset; i++) {
2811 /* GFXH-1602: if any builtins (vid, iid, etc) are read then
2812 * attribute 0 must be active (size > 0). When we hit this,
2813 * the driver is expected to program attribute 0 to have a
2814 * size of 1, so here we need to add that.
2815 */
2816 if (i == 0 && c->vs_key->is_coord &&
2817 c->vattr_sizes[i] == 0 && index > 0) {
2818 index++;
2819 } else {
2820 index += c->vattr_sizes[i];
2821 }
2822 }
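                /* Illustrative example (added): a shader reading InstanceID
                 * and VertexID starts with index = 2; if attributes 0 and 1
                 * use 3 and 2 components, a load of attribute 2, component 1
                 * ends up at VPM word 2 + 3 + 2 + 1 = 8.
                 */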
2823
2824 index += nir_intrinsic_component(instr);
2825 for (int i = 0; i < instr->num_components; i++) {
2826 struct qreg vpm_offset = vir_uniform_ui(c, index++);
2827 ntq_store_def(c, &instr->def, i,
2828 vir_LDVPMV_IN(c, vpm_offset));
2829 }
2830 } else {
2831 for (int i = 0; i < instr->num_components; i++) {
2832 int comp = nir_intrinsic_component(instr) + i;
2833 struct qreg input = c->inputs[offset * 4 + comp];
2834 ntq_store_def(c, &instr->def, i, vir_MOV(c, input));
2835
2836 if (c->s->info.stage == MESA_SHADER_FRAGMENT &&
2837 input.file == c->payload_z.file &&
2838 input.index == c->payload_z.index) {
2839 c->reads_z = true;
2840 }
2841 }
2842 }
2843 }
2844
2845 static void
2846 ntq_emit_per_sample_color_write(struct v3d_compile *c,
2847 nir_intrinsic_instr *instr)
2848 {
2849 assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d);
2850
2851 unsigned rt = nir_src_as_uint(instr->src[1]);
2852 assert(rt < V3D_MAX_DRAW_BUFFERS);
2853
2854 unsigned sample_idx = nir_intrinsic_base(instr);
2855 assert(sample_idx < V3D_MAX_SAMPLES);
2856
2857 unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4;
2858 for (int i = 0; i < instr->num_components; i++) {
2859 c->sample_colors[offset + i] =
2860 vir_MOV(c, ntq_get_src(c, instr->src[0], i));
2861 }
2862 }
2863
2864 static void
2865 ntq_emit_color_write(struct v3d_compile *c,
2866 nir_intrinsic_instr *instr)
2867 {
2868 unsigned offset = (nir_intrinsic_base(instr) +
2869 nir_src_as_uint(instr->src[1])) * 4 +
2870 nir_intrinsic_component(instr);
2871 for (int i = 0; i < instr->num_components; i++) {
2872 c->outputs[offset + i] =
2873 vir_MOV(c, ntq_get_src(c, instr->src[0], i));
2874 }
2875 }
2876
2877 static void
2878 emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr)
2879 {
2880 assert(instr->num_components == 1);
2881
2882 struct qreg offset = ntq_get_src(c, instr->src[1], 0);
2883
2884 uint32_t base_offset = nir_intrinsic_base(instr);
2885
2886 if (base_offset)
2887 offset = vir_ADD(c, vir_uniform_ui(c, base_offset), offset);
2888
2889 /* Usually, for VS or FS, we only emit outputs once at program end so
2890 * our VPM writes are never in non-uniform control flow, but this
2891 * is not true for GS, where we are emitting multiple vertices.
2892 */
2893 if (vir_in_nonuniform_control_flow(c)) {
2894 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
2895 V3D_QPU_PF_PUSHZ);
2896 }
2897
2898 struct qreg val = ntq_get_src(c, instr->src[0], 0);
2899
2900 /* The offset isn’t necessarily dynamically uniform for a geometry
2901 * shader. This can happen if the shader sometimes doesn’t emit one of
2902 * the vertices. In that case subsequent vertices will be written to
2903 * different offsets in the VPM and we need to use the scatter write
2904 * instruction to have a different offset for each lane.
2905 */
2906 bool is_uniform_offset =
2907 !vir_in_nonuniform_control_flow(c) &&
2908 !nir_src_is_divergent(&instr->src[1]);
2909 vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset);
2910
2911 if (vir_in_nonuniform_control_flow(c)) {
2912 struct qinst *last_inst =
2913 (struct qinst *)c->cur_block->instructions.prev;
2914 vir_set_cond(last_inst, V3D_QPU_COND_IFA);
2915 }
2916 }
2917
2918 static void
2919 emit_store_output_vs(struct v3d_compile *c, nir_intrinsic_instr *instr)
2920 {
2921 assert(c->s->info.stage == MESA_SHADER_VERTEX);
2922 assert(instr->num_components == 1);
2923
2924 uint32_t base = nir_intrinsic_base(instr);
2925 struct qreg val = ntq_get_src(c, instr->src[0], 0);
2926
2927 if (nir_src_is_const(instr->src[1])) {
2928 vir_VPM_WRITE(c, val,
2929 base + nir_src_as_uint(instr->src[1]));
2930 } else {
2931 struct qreg offset = vir_ADD(c,
2932 ntq_get_src(c, instr->src[1], 1),
2933 vir_uniform_ui(c, base));
2934 bool is_uniform_offset =
2935 !vir_in_nonuniform_control_flow(c) &&
2936 !nir_src_is_divergent(&instr->src[1]);
2937 vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset);
2938 }
2939 }
2940
2941 static void
2942 ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr)
2943 {
2944 if (c->s->info.stage == MESA_SHADER_FRAGMENT)
2945 ntq_emit_color_write(c, instr);
2946 else if (c->s->info.stage == MESA_SHADER_GEOMETRY)
2947 emit_store_output_gs(c, instr);
2948 else
2949 emit_store_output_vs(c, instr);
2950 }
2951
2952 /**
2953 * This implementation is based on v3d_sample_{x,y}_offset() from
2954 * v3d_sample_offset.h.
2955 */
2956 static void
2957 ntq_get_sample_offset(struct v3d_compile *c, struct qreg sample_idx,
2958 struct qreg *sx, struct qreg *sy)
2959 {
2960 sample_idx = vir_ITOF(c, sample_idx);
2961
2962 struct qreg offset_x =
2963 vir_FADD(c, vir_uniform_f(c, -0.125f),
2964 vir_FMUL(c, sample_idx,
2965 vir_uniform_f(c, 0.5f)));
2966 vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(),
2967 vir_uniform_f(c, 2.0f), sample_idx),
2968 V3D_QPU_PF_PUSHC);
2969 offset_x = vir_SEL(c, V3D_QPU_COND_IFA,
2970 vir_FSUB(c, offset_x, vir_uniform_f(c, 1.25f)),
2971 offset_x);
2972
2973 struct qreg offset_y =
2974 vir_FADD(c, vir_uniform_f(c, -0.375f),
2975 vir_FMUL(c, sample_idx,
2976 vir_uniform_f(c, 0.25f)));
2977 *sx = offset_x;
2978 *sy = offset_y;
2979 }
2980
2981 /**
2982 * This implementation is based on get_centroid_offset() from fep.c.
2983 */
2984 static void
2985 ntq_get_barycentric_centroid(struct v3d_compile *c,
2986 struct qreg *out_x,
2987 struct qreg *out_y)
2988 {
2989 struct qreg sample_mask;
2990 if (c->output_sample_mask_index != -1)
2991 sample_mask = c->outputs[c->output_sample_mask_index];
2992 else
2993 sample_mask = vir_MSF(c);
2994
2995 struct qreg i0 = vir_uniform_ui(c, 0);
2996 struct qreg i1 = vir_uniform_ui(c, 1);
2997 struct qreg i2 = vir_uniform_ui(c, 2);
2998 struct qreg i3 = vir_uniform_ui(c, 3);
2999 struct qreg i4 = vir_uniform_ui(c, 4);
3000 struct qreg i8 = vir_uniform_ui(c, 8);
3001
3002 /* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */
3003 struct qreg F = vir_uniform_ui(c, 0);
3004 struct qreg T = vir_uniform_ui(c, ~0);
3005 struct qreg s0 = vir_AND(c, sample_mask, i1);
3006 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
3007 s0 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
3008 struct qreg s1 = vir_AND(c, sample_mask, i2);
3009 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
3010 s1 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
3011 struct qreg s2 = vir_AND(c, sample_mask, i4);
3012 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
3013 s2 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
3014 struct qreg s3 = vir_AND(c, sample_mask, i8);
3015 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ);
3016 s3 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
3017
3018 /* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */
3019 struct qreg sample_idx = i3;
3020 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
3021 sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i1, sample_idx);
3022 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
3023 sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i2, sample_idx);
3024 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
3025 sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i0, sample_idx);
3026
3027 /* Get offset at selected sample index */
3028 struct qreg offset_x, offset_y;
3029 ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
3030
3031 /* Select pixel center [offset=(0,0)] if two opposing samples (or none)
3032 * are selected.
3033 */
3034 struct qreg s0_and_s3 = vir_AND(c, s0, s3);
3035 struct qreg s1_and_s2 = vir_AND(c, s1, s2);
3036
3037 struct qreg use_center = vir_XOR(c, sample_mask, vir_uniform_ui(c, 0));
3038 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);
3039 use_center = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
3040 use_center = vir_OR(c, use_center, s0_and_s3);
3041 use_center = vir_OR(c, use_center, s1_and_s2);
3042
3043 struct qreg zero = vir_uniform_f(c, 0.0f);
3044 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);
3045 offset_x = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_x);
3046 offset_y = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_y);
3047
3048 *out_x = offset_x;
3049 *out_y = offset_y;
3050 }
3051
3052 static struct qreg
3053 ntq_emit_load_interpolated_input(struct v3d_compile *c,
3054 struct qreg p,
3055 struct qreg C,
3056 struct qreg offset_x,
3057 struct qreg offset_y,
3058 unsigned mode)
3059 {
3060 if (mode == INTERP_MODE_FLAT)
3061 return C;
3062
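        /* Descriptive note (added): the code below reconstructs the attribute
         * at the requested offset from its value p at the current sample
         * position using screen-space derivatives: scaleX/scaleY measure the
         * distance from the sample position to the pixel center plus the
         * caller's offset, and p is moved along FDX/FDY by that amount. For
         * modes other than NOPERSPECTIVE the W payload is interpolated the
         * same way and multiplied back in before adding the constant term C.
         */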
3063 struct qreg sample_offset_x =
3064 vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));
3065 struct qreg sample_offset_y =
3066 vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
3067
3068 struct qreg scaleX =
3069 vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_x),
3070 offset_x);
3071 struct qreg scaleY =
3072 vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_y),
3073 offset_y);
3074
3075 struct qreg pInterp =
3076 vir_FADD(c, p, vir_FADD(c, vir_FMUL(c, vir_FDX(c, p), scaleX),
3077 vir_FMUL(c, vir_FDY(c, p), scaleY)));
3078
3079 if (mode == INTERP_MODE_NOPERSPECTIVE)
3080 return vir_FADD(c, pInterp, C);
3081
3082 struct qreg w = c->payload_w;
3083 struct qreg wInterp =
3084 vir_FADD(c, w, vir_FADD(c, vir_FMUL(c, vir_FDX(c, w), scaleX),
3085 vir_FMUL(c, vir_FDY(c, w), scaleY)));
3086
3087 return vir_FADD(c, vir_FMUL(c, pInterp, wInterp), C);
3088 }
3089
3090 static void
3091 emit_ldunifa(struct v3d_compile *c, struct qreg *result)
3092 {
3093 struct qinst *ldunifa =
3094 vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef);
3095 ldunifa->qpu.sig.ldunifa = true;
3096 if (result)
3097 *result = vir_emit_def(c, ldunifa);
3098 else
3099 vir_emit_nondef(c, ldunifa);
3100 c->current_unifa_offset += 4;
3101 }
3102
3103 /* Checks if the value of a nir src is derived from a nir register */
3104 static bool
3105 nir_src_derived_from_reg(nir_src src)
3106 {
3107 nir_def *def = src.ssa;
3108 if (nir_load_reg_for_def(def))
3109 return true;
3110
3111 nir_instr *parent = def->parent_instr;
3112 switch (parent->type) {
3113 case nir_instr_type_alu: {
3114 nir_alu_instr *alu = nir_instr_as_alu(parent);
3115 int num_srcs = nir_op_infos[alu->op].num_inputs;
3116 for (int i = 0; i < num_srcs; i++) {
3117 if (nir_src_derived_from_reg(alu->src[i].src))
3118 return true;
3119 }
3120 return false;
3121 }
3122 case nir_instr_type_intrinsic: {
3123 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
3124 int num_srcs = nir_intrinsic_infos[intr->intrinsic].num_srcs;
3125 for (int i = 0; i < num_srcs; i++) {
3126 if (nir_src_derived_from_reg(intr->src[i]))
3127 return true;
3128 }
3129 return false;
3130 }
3131 case nir_instr_type_load_const:
3132 case nir_instr_type_undef:
3133 return false;
3134 default:
3135                 /* By default we assume it may come from a register; the above
3136 * cases should be able to handle the majority of situations
3137 * though.
3138 */
3139 return true;
3140 };
3141 }
3142
3143 static bool
3144 ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
3145 {
3146 assert(instr->intrinsic == nir_intrinsic_load_ubo ||
3147 instr->intrinsic == nir_intrinsic_load_ssbo ||
3148 instr->intrinsic == nir_intrinsic_load_uniform);
3149
3150 bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform;
3151 bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo;
3152 bool is_ssbo = instr->intrinsic == nir_intrinsic_load_ssbo;
3153
3154 /* Every ldunifa auto-increments the unifa address by 4 bytes, so our
3155 * current unifa offset is 4 bytes ahead of the offset of the last load.
3156 */
3157 static const int32_t max_unifa_skip_dist =
3158 MAX_UNIFA_SKIP_DISTANCE - 4;
3159
3160 /* We can only use unifa if the offset is uniform */
3161 nir_src offset = is_uniform ? instr->src[0] : instr->src[1];
3162 if (nir_src_is_divergent(&offset))
3163 return false;
3164
3165 /* Emitting loads from unifa may not be safe under non-uniform control
3166 * flow. It seems the address that is used to write to the unifa
3167 * register is taken from the first lane and if that lane is disabled
3168 * by control flow then the value we read may be bogus and lead to
3169 * invalid memory accesses on follow-up ldunifa instructions. However,
3170 * ntq_store_def only emits conditional writes for nir registersas long
3171          * ntq_store_def only emits conditional writes for nir registers, so
3172          * as long as we can be certain that the offset isn't derived from a
3173          * load_reg we should be fine.
3174 * The following CTS test can be used to trigger the problem, which
3175 * causes a GMP violations in the sim without this check:
3176 * dEQP-VK.subgroups.ballot_broadcast.graphics.subgroupbroadcastfirst_int
3177 */
3178 if (vir_in_nonuniform_control_flow(c) &&
3179 nir_src_derived_from_reg(offset)) {
3180 return false;
3181 }
3182
3183 /* We can only use unifa with SSBOs if they are read-only. Otherwise
3184 * ldunifa won't see the shader writes to that address (possibly
3185 * because ldunifa doesn't read from the L2T cache).
3186 */
3187 if (is_ssbo && !(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE))
3188 return false;
3189
3190 /* Just as with SSBOs, we can't use ldunifa to read indirect uniforms
3191 * that we may have been written to scratch using the TMU.
3192 */
3193 bool dynamic_src = !nir_src_is_const(offset);
3194 if (is_uniform && dynamic_src && c->s->scratch_size > 0)
3195 return false;
3196
3197 uint32_t const_offset = dynamic_src ? 0 : nir_src_as_uint(offset);
3198 if (is_uniform)
3199 const_offset += nir_intrinsic_base(instr);
3200
3201 /* ldunifa is a 32-bit load instruction so we can only use it with
3202 * 32-bit aligned addresses. We always produce 32-bit aligned addresses
3203 * except for types smaller than 32-bit, so in these cases we can only
3204 * use ldunifa if we can verify alignment, which we can only do for
3205 * loads with a constant offset.
3206 */
3207 uint32_t bit_size = instr->def.bit_size;
3208 uint32_t value_skips = 0;
3209 if (bit_size < 32) {
3210 if (dynamic_src) {
3211 return false;
3212 } else if (const_offset % 4 != 0) {
3213 /* If we are loading from an unaligned offset, fix
3214 * alignment and skip over unused elements in result.
3215 */
3216 value_skips = (const_offset % 4) / (bit_size / 8);
3217 const_offset &= ~0x3;
3218 }
3219 }
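        /* Worked example (added): a 16-bit load at const_offset 6 gives
         * value_skips = (6 % 4) / 2 = 1 and const_offset = 4, so the first
         * 16-bit half of the ldunifa result is skipped when unpacking below.
         */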
3220
3221 assert((bit_size == 32 && value_skips == 0) ||
3222 (bit_size == 16 && value_skips <= 1) ||
3223 (bit_size == 8 && value_skips <= 3));
3224
3225 /* Both Vulkan and OpenGL reserve index 0 for uniforms / push
3226 * constants.
3227 */
3228 uint32_t index = is_uniform ? 0 : nir_src_as_uint(instr->src[0]);
3229
3230 /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 since we use
3231 * index 0 for Gallium's constant buffer (GL) or push constants
3232 * (Vulkan).
3233 */
3234 if (is_ubo)
3235 index++;
3236
3237 /* We can only keep track of the last unifa address we used with
3238 * constant offset loads. If the new load targets the same buffer and
3239 * is close enough to the previous load, we can skip the unifa register
3240 * write by emitting dummy ldunifa instructions to update the unifa
3241 * address.
3242 */
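        /* Illustrative example (added): if the previous constant-offset load
         * from this buffer left current_unifa_offset at 16 and the new load
         * reads offset 24, we emit (24 - 16) / 4 = 2 dummy ldunifa
         * instructions instead of writing the unifa register again.
         */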
3243 bool skip_unifa = false;
3244 uint32_t ldunifa_skips = 0;
3245 if (dynamic_src) {
3246 c->current_unifa_block = NULL;
3247 } else if (c->cur_block == c->current_unifa_block &&
3248 c->current_unifa_is_ubo == !is_ssbo &&
3249 c->current_unifa_index == index &&
3250 c->current_unifa_offset <= const_offset &&
3251 c->current_unifa_offset + max_unifa_skip_dist >= const_offset) {
3252 skip_unifa = true;
3253 ldunifa_skips = (const_offset - c->current_unifa_offset) / 4;
3254 } else {
3255 c->current_unifa_block = c->cur_block;
3256 c->current_unifa_is_ubo = !is_ssbo;
3257 c->current_unifa_index = index;
3258 c->current_unifa_offset = const_offset;
3259 }
3260
3261 if (!skip_unifa) {
3262 struct qreg base_offset = !is_ssbo ?
3263 vir_uniform(c, QUNIFORM_UBO_ADDR,
3264 v3d_unit_data_create(index, const_offset)) :
3265 vir_uniform(c, QUNIFORM_SSBO_OFFSET, index);
3266
3267 struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA);
3268 if (!dynamic_src) {
3269 if (!is_ssbo) {
3270 /* Avoid the extra MOV to UNIFA by making
3271 * ldunif load directly into it. We can't
3272 * do this if we have not actually emitted
3273 * ldunif and are instead reusing a previous
3274 * one.
3275 */
3276 struct qinst *inst =
3277 (struct qinst *)c->cur_block->instructions.prev;
3278 if (inst == c->defs[base_offset.index]) {
3279 inst->dst = unifa;
3280 c->defs[base_offset.index] = NULL;
3281 } else {
3282 vir_MOV_dest(c, unifa, base_offset);
3283 }
3284 } else {
3285 if (const_offset != 0) {
3286 vir_ADD_dest(c, unifa, base_offset,
3287 vir_uniform_ui(c, const_offset));
3288 } else {
3289 vir_MOV_dest(c, unifa, base_offset);
3290 }
3291 }
3292 } else {
3293 vir_ADD_dest(c, unifa, base_offset,
3294 ntq_get_src(c, offset, 0));
3295 }
3296 } else {
3297 for (int i = 0; i < ldunifa_skips; i++)
3298 emit_ldunifa(c, NULL);
3299 }
3300
3301 uint32_t num_components = nir_intrinsic_dest_components(instr);
3302 for (uint32_t i = 0; i < num_components; ) {
3303 struct qreg data;
3304 emit_ldunifa(c, &data);
3305
3306 if (bit_size == 32) {
3307 assert(value_skips == 0);
3308 ntq_store_def(c, &instr->def, i, vir_MOV(c, data));
3309 i++;
3310 } else {
3311 assert((bit_size == 16 && value_skips <= 1) ||
3312 (bit_size == 8 && value_skips <= 3));
3313
3314 /* If we have any values to skip, shift to the first
3315 * valid value in the ldunifa result.
3316 */
3317 if (value_skips > 0) {
3318 data = vir_SHR(c, data,
3319 vir_uniform_ui(c, bit_size *
3320 value_skips));
3321 }
3322
3323 /* Check how many valid components we have discounting
3324 * read components to skip.
3325 */
3326 uint32_t valid_count = (32 / bit_size) - value_skips;
3327 assert((bit_size == 16 && valid_count <= 2) ||
3328 (bit_size == 8 && valid_count <= 4));
3329 assert(valid_count > 0);
3330
3331 /* Process the valid components */
3332 do {
3333 struct qreg tmp;
3334 uint32_t mask = (1 << bit_size) - 1;
3335 tmp = vir_AND(c, vir_MOV(c, data),
3336 vir_uniform_ui(c, mask));
3337 ntq_store_def(c, &instr->def, i,
3338 vir_MOV(c, tmp));
3339 i++;
3340 valid_count--;
3341
3342 /* Shift to next component */
3343 if (i < num_components && valid_count > 0) {
3344 data = vir_SHR(c, data,
3345 vir_uniform_ui(c, bit_size));
3346 }
3347 } while (i < num_components && valid_count > 0);
3348 }
3349 }
3350
3351 return true;
3352 }
3353
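/* Comment added: the local invocation index lives in the top
 * c->local_invocation_index_bits bits of cs_payload[1], so it is extracted by
 * shifting it down to the least significant bits.
 */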
3354 static inline struct qreg
emit_load_local_invocation_index(struct v3d_compile * c)3355 emit_load_local_invocation_index(struct v3d_compile *c)
3356 {
3357 return vir_SHR(c, c->cs_payload[1],
3358 vir_uniform_ui(c, 32 - c->local_invocation_index_bits));
3359 }
3360
3361 /* For the purposes of reduction operations (ballot, alleq, allfeq, bcastf) in
3362 * fragment shaders, a lane is considered active if any sample flags are set
3363 * for *any* lane in the same quad; however, we still need to ensure that
3364 * terminated lanes (OpTerminate) are not included. Further, we also need to
3365 * disable lanes that may have been disabled because of non-uniform control
3366 * flow.
3367 */
3368 static enum v3d_qpu_cond
3369 setup_subgroup_control_flow_condition(struct v3d_compile *c)
3370 {
3371 assert(c->s->info.stage == MESA_SHADER_FRAGMENT ||
3372 c->s->info.stage == MESA_SHADER_COMPUTE);
3373
3374 enum v3d_qpu_cond cond = V3D_QPU_COND_NONE;
3375
3376 /* We need to make sure that terminated lanes in fragment shaders are
3377 * not included. We can identify these lanes by comparing the initial
3378 * sample mask with the current one. This fixes:
3379 * dEQP-VK.spirv_assembly.instruction.terminate_invocation.terminate.subgroup_*
3380 */
3381 if (c->s->info.stage == MESA_SHADER_FRAGMENT && c->emitted_discard) {
3382 vir_set_pf(c, vir_AND_dest(c, vir_nop_reg(), c->start_msf,
3383 vir_NOT(c, vir_XOR(c, c->start_msf,
3384 vir_MSF(c)))),
3385 V3D_QPU_PF_PUSHZ);
3386 cond = V3D_QPU_COND_IFNA;
3387 }
3388
3389 /* If we are in non-uniform control-flow update the condition to
3390 * also limit lanes to those in the current execution mask.
3391 */
3392 if (vir_in_nonuniform_control_flow(c)) {
3393 if (cond == V3D_QPU_COND_IFNA) {
3394 vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3395 V3D_QPU_UF_NORNZ);
3396 } else {
3397 assert(cond == V3D_QPU_COND_NONE);
3398 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3399 V3D_QPU_PF_PUSHZ);
3400 }
3401 cond = V3D_QPU_COND_IFA;
3402 }
3403
3404 return cond;
3405 }
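/* Typical usage (as in the subgroup intrinsics below): wrap the
 * reduction op with the returned condition, e.g.
 *
 *    enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
 *    vir_set_cond(vir_BALLOT_dest(c, res, value), cond);
 *
 * so only the lanes deemed active contribute to the result.
 */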
3406
3407 static void
3408 emit_compute_barrier(struct v3d_compile *c)
3409 {
3410 /* Ensure we flag the use of the control barrier. NIR's
3411 * gather info pass usually takes care of this, but that
3412 * requires calling that pass after any other pass that
3413 * may emit a control barrier, so this is safer.
3414 */
3415 c->s->info.uses_control_barrier = true;
3416
3417 /* Emit a TSY op to get all invocations in the workgroup
3418 * (actually supergroup) to block until the last
3419 * invocation reaches the TSY op.
3420 */
3421 vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB));
3422 }
3423
3424 static void
3425 emit_barrier(struct v3d_compile *c)
3426 {
3427 struct qreg eidx = vir_EIDX(c);
3428
3429 /* The config for the TSY op should be set up like this:
3430 * - Lane 0: Quorum
3431 * - Lane 2: TSO id
3432 * - Lane 3: TSY opcode
3433 */
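/* A sketch of what the code below builds, assuming a 16-lane QPU:
 * tsy_conf ends up as { 1, 1, TIDX & 0x3f, 16, 1, 1, ... } per lane,
 * i.e. quorum = 1 in lane 0, the TSO id in lane 2 and the TSY opcode
 * (16, set_quorum_wait_inc_check) in lane 3; the remaining lanes keep
 * the quorum value and are presumably ignored by the TSY unit.
 */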
3434
3435 /* Lane 0: we want to synchronize across one subgroup. Here we write to
3436 * all lanes unconditionally and will overwrite other lanes below.
3437 */
3438 struct qreg tsy_conf = vir_uniform_ui(c, 1);
3439
3440 /* Lane 2: TSO id. We choose a general purpose TSO (id=0..63) using the
3441 * current QPU index and thread index to ensure we get a unique one for
3442 * this group of invocations in this core.
3443 */
3444 struct qreg tso_id =
3445 vir_AND(c, vir_TIDX(c), vir_uniform_ui(c, 0x0000003f));
3446 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 2)),
3447 V3D_QPU_PF_PUSHZ);
3448 vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tso_id);
3449
3450 /* Lane 3: TSY opcode (set_quorum_wait_inc_check) */
3451 struct qreg tsy_op = vir_uniform_ui(c, 16);
3452 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 3)),
3453 V3D_QPU_PF_PUSHZ);
3454 vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tsy_op);
3455
3456 /* Emit TSY sync */
3457 vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB), tsy_conf);
3458 }
3459
3460 static void
3461 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
3462 {
3463 switch (instr->intrinsic) {
3464 case nir_intrinsic_decl_reg:
3465 case nir_intrinsic_load_reg:
3466 case nir_intrinsic_store_reg:
3467 break; /* Ignore these */
3468
3469 case nir_intrinsic_load_uniform:
3470 ntq_emit_load_uniform(c, instr);
3471 break;
3472
3473 case nir_intrinsic_load_global:
3474 case nir_intrinsic_load_global_constant:
3475 ntq_emit_tmu_general(c, instr, false, true);
3476 c->has_general_tmu_load = true;
3477 break;
3478
3479 case nir_intrinsic_load_ubo:
3480 if (ntq_emit_inline_ubo_load(c, instr))
3481 break;
3482 FALLTHROUGH;
3483 case nir_intrinsic_load_ssbo:
3484 if (!ntq_emit_load_unifa(c, instr)) {
3485 ntq_emit_tmu_general(c, instr, false, false);
3486 c->has_general_tmu_load = true;
3487 }
3488 break;
3489
3490 case nir_intrinsic_store_ssbo:
3491 case nir_intrinsic_ssbo_atomic:
3492 case nir_intrinsic_ssbo_atomic_swap:
3493 ntq_emit_tmu_general(c, instr, false, false);
3494 break;
3495
3496 case nir_intrinsic_store_global:
3497 case nir_intrinsic_global_atomic:
3498 case nir_intrinsic_global_atomic_swap:
3499 ntq_emit_tmu_general(c, instr, false, true);
3500 break;
3501
3502 case nir_intrinsic_shared_atomic:
3503 case nir_intrinsic_shared_atomic_swap:
3504 case nir_intrinsic_store_shared:
3505 case nir_intrinsic_store_scratch:
3506 ntq_emit_tmu_general(c, instr, true, false);
3507 break;
3508
3509 case nir_intrinsic_load_scratch:
3510 case nir_intrinsic_load_shared:
3511 ntq_emit_tmu_general(c, instr, true, false);
3512 c->has_general_tmu_load = true;
3513 break;
3514
3515 case nir_intrinsic_image_store:
3516 case nir_intrinsic_image_atomic:
3517 case nir_intrinsic_image_atomic_swap:
3518 v3d_vir_emit_image_load_store(c, instr);
3519 break;
3520
3521 case nir_intrinsic_image_load:
3522 v3d_vir_emit_image_load_store(c, instr);
3523 /* Not really a general TMU load, but we only use this flag
3524 * for NIR scheduling and we do schedule these under the same
3525 * policy as general TMU.
3526 */
3527 c->has_general_tmu_load = true;
3528 break;
3529
3530 case nir_intrinsic_get_ssbo_size:
3531 ntq_store_def(c, &instr->def, 0,
3532 vir_uniform(c, QUNIFORM_GET_SSBO_SIZE,
3533 nir_src_comp_as_uint(instr->src[0], 0)));
3534 break;
3535
3536 case nir_intrinsic_get_ubo_size:
3537 ntq_store_def(c, &instr->def, 0,
3538 vir_uniform(c, QUNIFORM_GET_UBO_SIZE,
3539 nir_src_comp_as_uint(instr->src[0], 0)));
3540 break;
3541
3542 case nir_intrinsic_load_user_clip_plane:
3543 for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
3544 ntq_store_def(c, &instr->def, i,
3545 vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
3546 nir_intrinsic_ucp_id(instr) *
3547 4 + i));
3548 }
3549 break;
3550
3551 case nir_intrinsic_load_viewport_x_scale:
3552 ntq_store_def(c, &instr->def, 0,
3553 vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0));
3554 break;
3555
3556 case nir_intrinsic_load_viewport_y_scale:
3557 ntq_store_def(c, &instr->def, 0,
3558 vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0));
3559 break;
3560
3561 case nir_intrinsic_load_viewport_z_scale:
3562 ntq_store_def(c, &instr->def, 0,
3563 vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0));
3564 break;
3565
3566 case nir_intrinsic_load_viewport_z_offset:
3567 ntq_store_def(c, &instr->def, 0,
3568 vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0));
3569 break;
3570
3571 case nir_intrinsic_load_line_coord:
3572 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->line_x));
3573 break;
3574
3575 case nir_intrinsic_load_line_width:
3576 ntq_store_def(c, &instr->def, 0,
3577 vir_uniform(c, QUNIFORM_LINE_WIDTH, 0));
3578 break;
3579
3580 case nir_intrinsic_load_aa_line_width:
3581 ntq_store_def(c, &instr->def, 0,
3582 vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0));
3583 break;
3584
3585 case nir_intrinsic_load_sample_mask_in:
3586 ntq_store_def(c, &instr->def, 0, vir_MSF(c));
3587 break;
3588
3589 case nir_intrinsic_load_helper_invocation:
3590 vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
3591 struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
3592 ntq_store_def(c, &instr->def, 0, qdest);
3593 break;
3594
3595 case nir_intrinsic_load_front_face:
3596 /* The register contains 0 (front) or 1 (back), and we need to
3597 * turn it into a NIR bool where true means front.
3598 */
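/* ADD(-1, REVF): front (REVF = 0) yields ~0 (NIR true),
 * back (REVF = 1) yields 0 (NIR false).
 */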
3599 ntq_store_def(c, &instr->def, 0,
3600 vir_ADD(c,
3601 vir_uniform_ui(c, -1),
3602 vir_REVF(c)));
3603 break;
3604
3605 case nir_intrinsic_load_base_instance:
3606 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->biid));
3607 break;
3608
3609 case nir_intrinsic_load_instance_id:
3610 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->iid));
3611 break;
3612
3613 case nir_intrinsic_load_vertex_id:
3614 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->vid));
3615 break;
3616
3617 case nir_intrinsic_load_draw_id:
3618 ntq_store_def(c, &instr->def, 0, vir_uniform(c, QUNIFORM_DRAW_ID, 0));
3619 break;
3620
3621 case nir_intrinsic_load_tlb_color_brcm:
3622 vir_emit_tlb_color_read(c, instr);
3623 break;
3624
3625 case nir_intrinsic_load_fep_w_v3d:
3626 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->payload_w));
3627 break;
3628
3629 case nir_intrinsic_load_input:
3630 ntq_emit_load_input(c, instr);
3631 break;
3632
3633 case nir_intrinsic_store_tlb_sample_color_v3d:
3634 ntq_emit_per_sample_color_write(c, instr);
3635 break;
3636
3637 case nir_intrinsic_store_output:
3638 ntq_emit_store_output(c, instr);
3639 break;
3640
3641 case nir_intrinsic_image_size:
3642 ntq_emit_image_size(c, instr);
3643 break;
3644
3645 /* FIXME: the Vulkan and SPIR-V specs specify that OpTerminate (which
3646 * is intended to match the semantics of GLSL's discard) should
3647 * terminate the invocation immediately. Our implementation doesn't
3648 * do that. What we do is actually a demote by removing the invocations
3649 * from the sample mask. Maybe we could be more strict and force an
3650 * early termination by emitting a (maybe conditional) jump to the
3651 * end section of the fragment shader for affected invocations.
3652 */
3653 case nir_intrinsic_terminate:
3654 c->emitted_discard = true;
3655 FALLTHROUGH;
3656 case nir_intrinsic_demote:
3657 ntq_flush_tmu(c);
3658
3659 if (vir_in_nonuniform_control_flow(c)) {
3660 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3661 V3D_QPU_PF_PUSHZ);
3662 vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
3663 vir_uniform_ui(c, 0)),
3664 V3D_QPU_COND_IFA);
3665 } else {
3666 vir_SETMSF_dest(c, vir_nop_reg(),
3667 vir_uniform_ui(c, 0));
3668 }
3669 break;
3670
3671 case nir_intrinsic_terminate_if:
3672 c->emitted_discard = true;
3673 FALLTHROUGH;
3674 case nir_intrinsic_demote_if: {
3675 ntq_flush_tmu(c);
3676
3677 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]);
3678
3679 if (vir_in_nonuniform_control_flow(c)) {
3680 struct qinst *exec_flag = vir_MOV_dest(c, vir_nop_reg(),
3681 c->execute);
3682 if (cond == V3D_QPU_COND_IFA) {
3683 vir_set_uf(c, exec_flag, V3D_QPU_UF_ANDZ);
3684 } else {
3685 vir_set_uf(c, exec_flag, V3D_QPU_UF_NORNZ);
3686 cond = V3D_QPU_COND_IFA;
3687 }
3688 }
3689
3690 vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
3691 vir_uniform_ui(c, 0)), cond);
3692 break;
3693 }
3694
3695 case nir_intrinsic_barrier:
3696 /* Ensure that the TMU operations before the barrier are flushed
3697 * before the ones after the barrier.
3698 */
3699 ntq_flush_tmu(c);
3700
3701 if (nir_intrinsic_execution_scope(instr) != SCOPE_NONE) {
3702 if (c->s->info.stage == MESA_SHADER_COMPUTE)
3703 emit_compute_barrier(c);
3704 else
3705 emit_barrier(c);
3706
3707 /* The blocking of a TSY op only happens at the next
3708 * thread switch. No texturing may be outstanding at the
3709 * time of a TSY blocking operation.
3710 */
3711 vir_emit_thrsw(c);
3712 }
3713 break;
3714
3715 case nir_intrinsic_load_num_workgroups:
3716 for (int i = 0; i < 3; i++) {
3717 ntq_store_def(c, &instr->def, i,
3718 vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS,
3719 i));
3720 }
3721 break;
3722
3723 case nir_intrinsic_load_workgroup_id: {
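/* Per the unpacking below, cs_payload[0] holds gl_WorkGroupID.x in
 * bits 0..15 and .y in bits 16..31, while the low 16 bits of
 * cs_payload[1] hold .z.
 */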
3724 struct qreg x = vir_AND(c, c->cs_payload[0],
3725 vir_uniform_ui(c, 0xffff));
3726 ntq_store_def(c, &instr->def, 0, x);
3727
3728 struct qreg y = vir_SHR(c, c->cs_payload[0],
3729 vir_uniform_ui(c, 16));
3730 ntq_store_def(c, &instr->def, 1, y);
3731
3732 struct qreg z = vir_AND(c, c->cs_payload[1],
3733 vir_uniform_ui(c, 0xffff));
3734 ntq_store_def(c, &instr->def, 2, z);
3735 break;
3736 }
3737
3738 case nir_intrinsic_load_base_workgroup_id: {
3739 struct qreg x = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0);
3740 ntq_store_def(c, &instr->def, 0, x);
3741
3742 struct qreg y = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1);
3743 ntq_store_def(c, &instr->def, 1, y);
3744
3745 struct qreg z = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2);
3746 ntq_store_def(c, &instr->def, 2, z);
3747 break;
3748 }
3749
3750 case nir_intrinsic_load_workgroup_size: {
3751 struct qreg x = vir_uniform(c, QUNIFORM_WORK_GROUP_SIZE, 0);
3752 ntq_store_def(c, &instr->def, 0, x);
3753
3754 struct qreg y = vir_uniform(c, QUNIFORM_WORK_GROUP_SIZE, 1);
3755 ntq_store_def(c, &instr->def, 1, y);
3756
3757 struct qreg z = vir_uniform(c, QUNIFORM_WORK_GROUP_SIZE, 2);
3758 ntq_store_def(c, &instr->def, 2, z);
3759 break;
3760 }
3761
3762 case nir_intrinsic_load_local_invocation_index:
3763 ntq_store_def(c, &instr->def, 0,
3764 emit_load_local_invocation_index(c));
3765 break;
3766
3767 case nir_intrinsic_load_subgroup_id: {
3768 /* This is basically the batch index, which is the Local
3769 * Invocation Index divided by the SIMD width.
3770 */
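/* E.g., assuming V3D_CHANNELS == 16, divide_shift below is
 * ffs(16) - 1 = 4 and the subgroup id is simply
 * local_invocation_index >> 4.
 */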
3771 STATIC_ASSERT(IS_POT(V3D_CHANNELS) && V3D_CHANNELS > 0);
3772 const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1;
3773 struct qreg lii = emit_load_local_invocation_index(c);
3774 ntq_store_def(c, &instr->def, 0,
3775 vir_SHR(c, lii,
3776 vir_uniform_ui(c, divide_shift)));
3777 break;
3778 }
3779
3780 case nir_intrinsic_load_per_vertex_input: {
3781 /* The vertex shader writes all its used outputs into
3782 * consecutive VPM offsets, so if any output component is
3783 * unused, its VPM offset is used by the next used
3784 * component. This means that we can't assume that each
3785 * location will use 4 consecutive scalar offsets in the VPM
3786 * and we need to compute the VPM offset for each input by
3787 * going through the inputs and finding the one that matches
3788 * our location and component.
3789 *
3790 * col = vertex index, row = varying index
3791 */
3792 assert(nir_src_is_const(instr->src[1]));
3793 uint32_t location =
3794 nir_intrinsic_io_semantics(instr).location +
3795 nir_src_as_uint(instr->src[1]);
3796 uint32_t component = nir_intrinsic_component(instr);
3797
3798 int32_t row_idx = -1;
3799 for (int i = 0; i < c->num_inputs; i++) {
3800 struct v3d_varying_slot slot = c->input_slots[i];
3801 if (v3d_slot_get_slot(slot) == location &&
3802 v3d_slot_get_component(slot) == component) {
3803 row_idx = i;
3804 break;
3805 }
3806 }
3807
3808 assert(row_idx != -1);
3809
3810 struct qreg col = ntq_get_src(c, instr->src[0], 0);
3811 for (int i = 0; i < instr->num_components; i++) {
3812 struct qreg row = vir_uniform_ui(c, row_idx++);
3813 ntq_store_def(c, &instr->def, i,
3814 vir_LDVPMG_IN(c, row, col));
3815 }
3816 break;
3817 }
3818
3819 case nir_intrinsic_emit_vertex:
3820 case nir_intrinsic_end_primitive:
3821 unreachable("Should have been lowered in v3d_nir_lower_io");
3822 break;
3823
3824 case nir_intrinsic_load_primitive_id: {
3825 /* gl_PrimitiveIdIn is written by the GBG in the first word of
3826 * the VPM output header. According to the docs, we should read
3827 * this using ldvpm(v,d)_in (see Table 71).
3828 */
3829 assert(c->s->info.stage == MESA_SHADER_GEOMETRY);
3830 ntq_store_def(c, &instr->def, 0,
3831 vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));
3832 break;
3833 }
3834
3835 case nir_intrinsic_load_invocation_id:
3836 ntq_store_def(c, &instr->def, 0, vir_IID(c));
3837 break;
3838
3839 case nir_intrinsic_load_fb_layers_v3d:
3840 ntq_store_def(c, &instr->def, 0,
3841 vir_uniform(c, QUNIFORM_FB_LAYERS, 0));
3842 break;
3843
3844 case nir_intrinsic_load_sample_id:
3845 ntq_store_def(c, &instr->def, 0, vir_SAMPID(c));
3846 break;
3847
3848 case nir_intrinsic_load_sample_pos:
3849 ntq_store_def(c, &instr->def, 0,
3850 vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))));
3851 ntq_store_def(c, &instr->def, 1,
3852 vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))));
3853 break;
3854
3855 case nir_intrinsic_load_barycentric_at_offset:
3856 ntq_store_def(c, &instr->def, 0,
3857 vir_MOV(c, ntq_get_src(c, instr->src[0], 0)));
3858 ntq_store_def(c, &instr->def, 1,
3859 vir_MOV(c, ntq_get_src(c, instr->src[0], 1)));
3860 break;
3861
3862 case nir_intrinsic_load_barycentric_pixel:
3863 ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f));
3864 ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f));
3865 break;
3866
3867 case nir_intrinsic_load_barycentric_at_sample: {
3868 if (!c->fs_key->msaa) {
3869 ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f));
3870 ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f));
3871 return;
3872 }
3873
3874 struct qreg offset_x, offset_y;
3875 struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0);
3876 ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
3877
3878 ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x));
3879 ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y));
3880 break;
3881 }
3882
3883 case nir_intrinsic_load_barycentric_sample: {
3884 struct qreg offset_x =
3885 vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));
3886 struct qreg offset_y =
3887 vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
3888
3889 ntq_store_def(c, &instr->def, 0,
3890 vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f)));
3891 ntq_store_def(c, &instr->def, 1,
3892 vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f)));
3893 break;
3894 }
3895
3896 case nir_intrinsic_load_barycentric_centroid: {
3897 struct qreg offset_x, offset_y;
3898 ntq_get_barycentric_centroid(c, &offset_x, &offset_y);
3899 ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x));
3900 ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y));
3901 break;
3902 }
3903
3904 case nir_intrinsic_load_interpolated_input: {
3905 assert(nir_src_is_const(instr->src[1]));
3906 const uint32_t offset = nir_src_as_uint(instr->src[1]);
3907
3908 for (int i = 0; i < instr->num_components; i++) {
3909 const uint32_t input_idx =
3910 (nir_intrinsic_base(instr) + offset) * 4 +
3911 nir_intrinsic_component(instr) + i;
3912
3913 /* If we are not in MSAA or if we are not interpolating
3914 * a user varying, just return the pre-computed
3915 * interpolated input.
3916 */
3917 if (!c->fs_key->msaa ||
3918 c->interp[input_idx].vp.file == QFILE_NULL) {
3919 ntq_store_def(c, &instr->def, i,
3920 vir_MOV(c, c->inputs[input_idx]));
3921 continue;
3922 }
3923
3924 /* Otherwise compute interpolation at the specified
3925 * offset.
3926 */
3927 struct qreg p = c->interp[input_idx].vp;
3928 struct qreg C = c->interp[input_idx].C;
3929 unsigned interp_mode = c->interp[input_idx].mode;
3930
3931 struct qreg offset_x = ntq_get_src(c, instr->src[0], 0);
3932 struct qreg offset_y = ntq_get_src(c, instr->src[0], 1);
3933
3934 struct qreg result =
3935 ntq_emit_load_interpolated_input(c, p, C,
3936 offset_x, offset_y,
3937 interp_mode);
3938 ntq_store_def(c, &instr->def, i, result);
3939 }
3940 break;
3941 }
3942
3943 case nir_intrinsic_load_subgroup_size:
3944 ntq_store_def(c, &instr->def, 0,
3945 vir_uniform_ui(c, V3D_CHANNELS));
3946 break;
3947
3948 case nir_intrinsic_load_subgroup_invocation:
3949 ntq_store_def(c, &instr->def, 0, vir_EIDX(c));
3950 break;
3951
3952 case nir_intrinsic_ddx:
3953 case nir_intrinsic_ddx_coarse:
3954 case nir_intrinsic_ddx_fine: {
3955 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3956 ntq_store_def(c, &instr->def, 0, vir_FDX(c, value));
3957 break;
3958 }
3959
3960 case nir_intrinsic_ddy:
3961 case nir_intrinsic_ddy_coarse:
3962 case nir_intrinsic_ddy_fine: {
3963 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3964 ntq_store_def(c, &instr->def, 0, vir_FDY(c, value));
3965 break;
3966 }
3967
3968 case nir_intrinsic_elect: {
3969 struct qreg first;
3970 if (vir_in_nonuniform_control_flow(c)) {
3971 /* Sets A=1 for lanes enabled in the execution mask */
3972 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3973 V3D_QPU_PF_PUSHZ);
3974 /* Updates A ANDing with lanes enabled in MSF */
3975 vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()),
3976 V3D_QPU_UF_ANDNZ);
3977 first = vir_FLAFIRST(c);
3978 } else {
3979 /* Sets A=1 for inactive lanes */
3980 vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()),
3981 V3D_QPU_PF_PUSHZ);
3982 first = vir_FLNAFIRST(c);
3983 }
3984
3985 /* Produce a boolean result */
3986 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
3987 first, vir_uniform_ui(c, 1)),
3988 V3D_QPU_PF_PUSHZ);
3989 struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
3990 ntq_store_def(c, &instr->def, 0, result);
3991 break;
3992 }
3993
3994 case nir_intrinsic_ballot: {
3995 assert(c->devinfo->ver >= 71);
3996 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3997 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
3998 struct qreg res = vir_get_temp(c);
3999 vir_set_cond(vir_BALLOT_dest(c, res, value), cond);
4000 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
4001 break;
4002 }
4003
4004 case nir_intrinsic_read_invocation: {
4005 assert(c->devinfo->ver >= 71);
4006 struct qreg value = ntq_get_src(c, instr->src[0], 0);
4007 struct qreg index = ntq_get_src(c, instr->src[1], 0);
4008 struct qreg res = vir_SHUFFLE(c, value, index);
4009 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
4010 break;
4011 }
4012
4013 case nir_intrinsic_read_first_invocation: {
4014 assert(c->devinfo->ver >= 71);
4015 struct qreg value = ntq_get_src(c, instr->src[0], 0);
4016 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
4017 struct qreg res = vir_get_temp(c);
4018 vir_set_cond(vir_BCASTF_dest(c, res, value), cond);
4019 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
4020 break;
4021 }
4022
4023 case nir_intrinsic_shuffle: {
4024 assert(c->devinfo->ver >= 71);
4025 struct qreg value = ntq_get_src(c, instr->src[0], 0);
4026 struct qreg indices = ntq_get_src(c, instr->src[1], 0);
4027 struct qreg res = vir_SHUFFLE(c, value, indices);
4028 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
4029 break;
4030 }
4031
4032 case nir_intrinsic_vote_feq:
4033 case nir_intrinsic_vote_ieq: {
4034 assert(c->devinfo->ver >= 71);
4035 struct qreg value = ntq_get_src(c, instr->src[0], 0);
4036 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
4037 struct qreg res = vir_get_temp(c);
4038 vir_set_cond(instr->intrinsic == nir_intrinsic_vote_ieq ?
4039 vir_ALLEQ_dest(c, res, value) :
4040 vir_ALLFEQ_dest(c, res, value),
4041 cond);
4042
4043 /* Produce boolean result */
4044 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
4045 V3D_QPU_PF_PUSHZ);
4046 struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
4047 ntq_store_def(c, &instr->def, 0, result);
4048 break;
4049 }
4050
4051 case nir_intrinsic_vote_all: {
4052 assert(c->devinfo->ver >= 71);
4053 struct qreg value = ntq_get_src(c, instr->src[0], 0);
4054 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
4055 struct qreg res = vir_get_temp(c);
4056 vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
4057
4058 /* We want to check if 'all lanes are equal (alleq != 0) and
4059 * their value is True (value != 0)'.
4060 *
4061 * The first MOV.pushz generates predicate for 'alleq == 0'.
4062 * The second MOV.NORZ generates predicate for:
4063 * '!(alleq == 0) & !(value == 0)'.
4064 */
4065 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
4066 V3D_QPU_PF_PUSHZ);
4067 vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
4068 V3D_QPU_UF_NORZ);
4069 struct qreg result =
4070 ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
4071 ntq_store_def(c, &instr->def, 0, result);
4072 break;
4073 }
4074
4075 case nir_intrinsic_vote_any: {
4076 assert(c->devinfo->ver >= 71);
4077 struct qreg value = ntq_get_src(c, instr->src[0], 0);
4078 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
4079 struct qreg res = vir_get_temp(c);
4080 vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
4081
4082 /* We want to check 'not (all lanes are equal (alleq != 0) and
4083 * their value is False (value == 0))'.
4084 *
4085 * The first MOV.pushz generates the predicate for 'alleq == 0'.
4086 * The second MOV.NORNZ generates the predicate for:
4087 * '!(alleq == 0) & (value == 0)'.
4088 * The IFNA condition negates the predicate when evaluated:
4089 * '!(!(alleq == 0) & (value == 0))'.
4090 */
4091 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
4092 V3D_QPU_PF_PUSHZ);
4093 vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
4094 V3D_QPU_UF_NORNZ);
4095 struct qreg result =
4096 ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
4097 ntq_store_def(c, &instr->def, 0, result);
4098 break;
4099 }
4100
4101 case nir_intrinsic_load_num_subgroups:
4102 unreachable("Should have been lowered");
4103 break;
4104
4105 case nir_intrinsic_load_view_index:
4106 ntq_store_def(c, &instr->def, 0,
4107 vir_uniform(c, QUNIFORM_VIEW_INDEX, 0));
4108 break;
4109
4110 default:
4111 fprintf(stderr, "Unknown intrinsic: ");
4112 nir_print_instr(&instr->instr, stderr);
4113 fprintf(stderr, "\n");
4114 abort();
4115 }
4116 }
4117
4118 /* Clears (activates) the execute flags for any channels whose jump target
4119 * matches this block.
4120 *
4121 * XXX perf: Could we be using flpush/flpop somehow for our execution channel
4122 * enabling?
4123 *
4124 */
4125 static void
4126 ntq_activate_execute_for_block(struct v3d_compile *c)
4127 {
4128 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
4129 c->execute, vir_uniform_ui(c, c->cur_block->index)),
4130 V3D_QPU_PF_PUSHZ);
4131
4132 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
4133 }
4134
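/* Rough cost heuristic: a block is "cheap" if it contains at most two
 * ALU/undef/load_const instructions (register bookkeeping intrinsics
 * are free); any other instruction type disqualifies it. Used in
 * ntq_emit_nonuniform_if() to decide whether emitting a branch around
 * a tiny THEN/ELSE block is worth it.
 */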
4135 static bool
4136 is_cheap_block(nir_block *block)
4137 {
4138 int32_t cost = 3;
4139 nir_foreach_instr(instr, block) {
4140 switch (instr->type) {
4141 case nir_instr_type_alu:
4142 case nir_instr_type_undef:
4143 case nir_instr_type_load_const:
4144 if (--cost <= 0)
4145 return false;
4146 break;
4147 case nir_instr_type_intrinsic: {
4148 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
4149 switch (intr->intrinsic) {
4150 case nir_intrinsic_decl_reg:
4151 case nir_intrinsic_load_reg:
4152 case nir_intrinsic_store_reg:
4153 continue;
4154 default:
4155 return false;
4156 }
4157 }
4158 default:
4159 return false;
4160 }
4161 }
4162 return true;
4163 }
4164
4165 static void
4166 ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
4167 {
4168 nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
4169 bool empty_else_block =
4170 (nir_else_block == nir_if_last_else_block(if_stmt) &&
4171 exec_list_is_empty(&nir_else_block->instr_list));
4172
4173 struct qblock *then_block = vir_new_block(c);
4174 struct qblock *after_block = vir_new_block(c);
4175 struct qblock *else_block;
4176 if (empty_else_block)
4177 else_block = after_block;
4178 else
4179 else_block = vir_new_block(c);
4180
4181 /* Check if this if statement is really just a conditional jump with
4182 * the form:
4183 *
4184 * if (cond) {
4185 * break/continue;
4186 * } else {
4187 * }
4188 *
4189 * In which case we can skip the jump to ELSE we emit before the THEN
4190 * block and instead just emit the break/continue directly.
4191 */
4192 nir_jump_instr *conditional_jump = NULL;
4193 if (empty_else_block) {
4194 nir_block *nir_then_block = nir_if_first_then_block(if_stmt);
4195 struct nir_instr *inst = nir_block_first_instr(nir_then_block);
4196 if (inst && inst->type == nir_instr_type_jump)
4197 conditional_jump = nir_instr_as_jump(inst);
4198 }
4199
4200 /* Set up the flags for the IF condition (taking the THEN branch). */
4201 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
4202
4203 if (!conditional_jump) {
4204 /* Jump to ELSE. */
4205 struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
4206 V3D_QPU_BRANCH_COND_ANYNA :
4207 V3D_QPU_BRANCH_COND_ANYA);
4208 /* Pixels that were not dispatched or have been discarded
4209 * should not contribute to the ANYA/ANYNA condition.
4210 */
4211 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
4212
4213 vir_link_blocks(c->cur_block, else_block);
4214 vir_link_blocks(c->cur_block, then_block);
4215
4216 /* Process the THEN block. */
4217 vir_set_emit_block(c, then_block);
4218 ntq_emit_cf_list(c, &if_stmt->then_list);
4219
4220 if (!empty_else_block) {
4221 /* At the end of the THEN block, jump to ENDIF, unless
4222 * the block ended in a break or continue.
4223 */
4224 if (!c->cur_block->branch_emitted) {
4225 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4226 vir_link_blocks(c->cur_block, after_block);
4227 }
4228
4229 /* Emit the else block. */
4230 vir_set_emit_block(c, else_block);
4231 ntq_emit_cf_list(c, &if_stmt->else_list);
4232 }
4233 } else {
4234 /* Emit the conditional jump directly.
4235 *
4236 * Use ALL with breaks and ANY with continues to ensure that
4237 * we always break and never continue when all lanes have been
4238 * disabled (for example because of discards) to prevent
4239 * infinite loops.
4240 */
4241 assert(conditional_jump &&
4242 (conditional_jump->type == nir_jump_continue ||
4243 conditional_jump->type == nir_jump_break));
4244
4245 struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
4246 (conditional_jump->type == nir_jump_break ?
4247 V3D_QPU_BRANCH_COND_ALLA :
4248 V3D_QPU_BRANCH_COND_ANYA) :
4249 (conditional_jump->type == nir_jump_break ?
4250 V3D_QPU_BRANCH_COND_ALLNA :
4251 V3D_QPU_BRANCH_COND_ANYNA));
4252 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
4253
4254 vir_link_blocks(c->cur_block,
4255 conditional_jump->type == nir_jump_break ?
4256 c->loop_break_block :
4257 c->loop_cont_block);
4258 }
4259
4260 vir_link_blocks(c->cur_block, after_block);
4261
4262 vir_set_emit_block(c, after_block);
4263 }
4264
4265 static void
4266 ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
4267 {
4268 nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
4269 bool empty_else_block =
4270 (nir_else_block == nir_if_last_else_block(if_stmt) &&
4271 exec_list_is_empty(&nir_else_block->instr_list));
4272
4273 struct qblock *then_block = vir_new_block(c);
4274 struct qblock *after_block = vir_new_block(c);
4275 struct qblock *else_block;
4276 if (empty_else_block)
4277 else_block = after_block;
4278 else
4279 else_block = vir_new_block(c);
4280
4281 bool was_uniform_control_flow = false;
4282 if (!vir_in_nonuniform_control_flow(c)) {
4283 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
4284 was_uniform_control_flow = true;
4285 }
4286
4287 /* Set up the flags for the IF condition (taking the THEN branch). */
4288 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
4289
4290 /* Update the flags+cond to mean "Taking the ELSE branch (!cond) and
4291 * was previously active (execute Z)" for updating the exec flags.
4292 */
4293 if (was_uniform_control_flow) {
4294 cond = v3d_qpu_cond_invert(cond);
4295 } else {
4296 struct qinst *inst = vir_MOV_dest(c, vir_nop_reg(), c->execute);
4297 if (cond == V3D_QPU_COND_IFA) {
4298 vir_set_uf(c, inst, V3D_QPU_UF_NORNZ);
4299 } else {
4300 vir_set_uf(c, inst, V3D_QPU_UF_ANDZ);
4301 cond = V3D_QPU_COND_IFA;
4302 }
4303 }
4304
4305 vir_MOV_cond(c, cond,
4306 c->execute,
4307 vir_uniform_ui(c, else_block->index));
4308
4309 /* Set the flags for taking the THEN block */
4310 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4311 V3D_QPU_PF_PUSHZ);
4312
4313 /* Jump to ELSE if nothing is active for THEN (unless THEN block is
4314 * so small it won't pay off), otherwise fall through.
4315 */
4316 bool is_cheap = exec_list_is_singular(&if_stmt->then_list) &&
4317 is_cheap_block(nir_if_first_then_block(if_stmt));
4318 if (!is_cheap) {
4319 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
4320 vir_link_blocks(c->cur_block, else_block);
4321 }
4322 vir_link_blocks(c->cur_block, then_block);
4323
4324 /* Process the THEN block.
4325 *
4326 * Notice we don't call ntq_activate_execute_for_block here on purpose:
4327 * c->execute is already set up to be 0 for lanes that must take the
4328 * THEN block.
4329 */
4330 vir_set_emit_block(c, then_block);
4331 ntq_emit_cf_list(c, &if_stmt->then_list);
4332
4333 if (!empty_else_block) {
4334 /* Handle the end of the THEN block. First, all currently
4335 * active channels update their execute flags to point to
4336 * ENDIF
4337 */
4338 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4339 V3D_QPU_PF_PUSHZ);
4340 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
4341 vir_uniform_ui(c, after_block->index));
4342
4343 /* If everything points at ENDIF, then jump there immediately
4344 * (unless ELSE block is so small it won't pay off).
4345 */
4346 bool is_cheap = exec_list_is_singular(&if_stmt->else_list) &&
4347 is_cheap_block(nir_else_block);
4348 if (!is_cheap) {
4349 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
4350 c->execute,
4351 vir_uniform_ui(c, after_block->index)),
4352 V3D_QPU_PF_PUSHZ);
4353 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
4354 vir_link_blocks(c->cur_block, after_block);
4355 }
4356 vir_link_blocks(c->cur_block, else_block);
4357
4358 vir_set_emit_block(c, else_block);
4359 ntq_activate_execute_for_block(c);
4360 ntq_emit_cf_list(c, &if_stmt->else_list);
4361 }
4362
4363 vir_link_blocks(c->cur_block, after_block);
4364
4365 vir_set_emit_block(c, after_block);
4366 if (was_uniform_control_flow)
4367 c->execute = c->undef;
4368 else
4369 ntq_activate_execute_for_block(c);
4370 }
4371
4372 static void
4373 ntq_emit_if(struct v3d_compile *c, nir_if *nif)
4374 {
4375 bool was_in_control_flow = c->in_control_flow;
4376 c->in_control_flow = true;
4377 if (!vir_in_nonuniform_control_flow(c) &&
4378 !nir_src_is_divergent(&nif->condition)) {
4379 ntq_emit_uniform_if(c, nif);
4380 } else {
4381 ntq_emit_nonuniform_if(c, nif);
4382 }
4383 c->in_control_flow = was_in_control_flow;
4384 }
4385
4386 static void
4387 ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump)
4388 {
4389 switch (jump->type) {
4390 case nir_jump_break:
4391 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4392 V3D_QPU_PF_PUSHZ);
4393 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
4394 vir_uniform_ui(c, c->loop_break_block->index));
4395 break;
4396
4397 case nir_jump_continue:
4398 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4399 V3D_QPU_PF_PUSHZ);
4400 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
4401 vir_uniform_ui(c, c->loop_cont_block->index));
4402 break;
4403
4404 case nir_jump_return:
4405 unreachable("All returns should be lowered\n");
4406 break;
4407
4408 case nir_jump_halt:
4409 case nir_jump_goto:
4410 case nir_jump_goto_if:
4411 unreachable("not supported\n");
4412 break;
4413 }
4414 }
4415
4416 static void
4417 ntq_emit_uniform_jump(struct v3d_compile *c, nir_jump_instr *jump)
4418 {
4419 switch (jump->type) {
4420 case nir_jump_break:
4421 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4422 vir_link_blocks(c->cur_block, c->loop_break_block);
4423 c->cur_block->branch_emitted = true;
4424 break;
4425 case nir_jump_continue:
4426 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4427 vir_link_blocks(c->cur_block, c->loop_cont_block);
4428 c->cur_block->branch_emitted = true;
4429 break;
4430
4431 case nir_jump_return:
4432 unreachable("All returns should be lowered\n");
4433 break;
4434
4435 case nir_jump_halt:
4436 case nir_jump_goto:
4437 case nir_jump_goto_if:
4438 unreachable("not supported\n");
4439 break;
4440 }
4441 }
4442
4443 static void
4444 ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)
4445 {
4446 switch (instr->type) {
4447 case nir_instr_type_alu:
4448 ntq_emit_alu(c, nir_instr_as_alu(instr));
4449 break;
4450
4451 case nir_instr_type_intrinsic:
4452 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
4453 break;
4454
4455 case nir_instr_type_load_const:
4456 ntq_emit_load_const(c, nir_instr_as_load_const(instr));
4457 break;
4458
4459 case nir_instr_type_undef:
4460 unreachable("Should've been lowered by nir_lower_undef_to_zero");
4461 break;
4462
4463 case nir_instr_type_tex:
4464 ntq_emit_tex(c, nir_instr_as_tex(instr));
4465 break;
4466
4467 case nir_instr_type_jump:
4468 /* Always flush TMU before jumping to another block, for the
4469 * same reasons as in ntq_emit_block.
4470 */
4471 ntq_flush_tmu(c);
4472 if (vir_in_nonuniform_control_flow(c))
4473 ntq_emit_jump(c, nir_instr_as_jump(instr));
4474 else
4475 ntq_emit_uniform_jump(c, nir_instr_as_jump(instr));
4476 break;
4477
4478 default:
4479 fprintf(stderr, "Unknown NIR instr type: ");
4480 nir_print_instr(instr, stderr);
4481 fprintf(stderr, "\n");
4482 abort();
4483 }
4484 }
4485
4486 static void
4487 ntq_emit_block(struct v3d_compile *c, nir_block *block)
4488 {
4489 nir_foreach_instr(instr, block) {
4490 ntq_emit_instr(c, instr);
4491 }
4492
4493 /* Always process pending TMU operations in the same block they were
4494 * emitted: we can't emit TMU operations in a block and then emit a
4495 * thread switch and LDTMU/TMUWT for them in another block, possibly
4496 * under control flow.
4497 */
4498 ntq_flush_tmu(c);
4499 }
4500
4501 static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
4502
4503 static void
4504 ntq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop)
4505 {
4506 bool was_uniform_control_flow = false;
4507 if (!vir_in_nonuniform_control_flow(c)) {
4508 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
4509 was_uniform_control_flow = true;
4510 }
4511
4512 c->loop_cont_block = vir_new_block(c);
4513 c->loop_break_block = vir_new_block(c);
4514
4515 vir_link_blocks(c->cur_block, c->loop_cont_block);
4516 vir_set_emit_block(c, c->loop_cont_block);
4517 ntq_activate_execute_for_block(c);
4518
4519 ntq_emit_cf_list(c, &loop->body);
4520
4521 /* Re-enable any previous continues now, so our ANYA check below
4522 * works.
4523 *
4524 * XXX: Use the .ORZ flags update, instead.
4525 */
4526 vir_set_pf(c, vir_XOR_dest(c,
4527 vir_nop_reg(),
4528 c->execute,
4529 vir_uniform_ui(c, c->loop_cont_block->index)),
4530 V3D_QPU_PF_PUSHZ);
4531 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
4532
4533 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
4534
4535 struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA);
4536 /* Pixels that were not dispatched or have been discarded should not
4537 * contribute to looping again.
4538 */
4539 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
4540 vir_link_blocks(c->cur_block, c->loop_cont_block);
4541 vir_link_blocks(c->cur_block, c->loop_break_block);
4542
4543 vir_set_emit_block(c, c->loop_break_block);
4544 if (was_uniform_control_flow)
4545 c->execute = c->undef;
4546 else
4547 ntq_activate_execute_for_block(c);
4548 }
4549
4550 static void
4551 ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop)
4552 {
4553 c->loop_cont_block = vir_new_block(c);
4554 c->loop_break_block = vir_new_block(c);
4555
4556 vir_link_blocks(c->cur_block, c->loop_cont_block);
4557 vir_set_emit_block(c, c->loop_cont_block);
4558
4559 ntq_emit_cf_list(c, &loop->body);
4560
4561 if (!c->cur_block->branch_emitted) {
4562 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4563 vir_link_blocks(c->cur_block, c->loop_cont_block);
4564 }
4565
4566 vir_set_emit_block(c, c->loop_break_block);
4567 }
4568
4569 static void
4570 ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
4571 {
4572 assert(!nir_loop_has_continue_construct(loop));
4573
4574 /* Disable flags optimization for loop conditions. The problem here is
4575 * that we can have code like this:
4576 *
4577 * // block_0
4578 * vec1 32 con ssa_9 = ine32 ssa_8, ssa_2
4579 * loop {
4580 * // block_1
4581 * if ssa_9 {
4582 *
4583 * In this example we emit flags to compute ssa_9 and the optimization
4584 * will skip regenerating them again for the loop condition in the
4585 * loop continue block (block_1). However, this is not safe after the
4586 * first iteration because the loop body can stomp the flags if it has
4587 * any conditionals.
4588 */
4589 c->flags_temp = -1;
4590
4591 bool was_in_control_flow = c->in_control_flow;
4592 c->in_control_flow = true;
4593
4594 struct qblock *save_loop_cont_block = c->loop_cont_block;
4595 struct qblock *save_loop_break_block = c->loop_break_block;
4596
4597 if (vir_in_nonuniform_control_flow(c) || nir_loop_is_divergent(loop)) {
4598 ntq_emit_nonuniform_loop(c, loop);
4599 } else {
4600 ntq_emit_uniform_loop(c, loop);
4601 }
4602
4603 c->loop_break_block = save_loop_break_block;
4604 c->loop_cont_block = save_loop_cont_block;
4605
4606 c->loops++;
4607
4608 c->in_control_flow = was_in_control_flow;
4609 }
4610
4611 static void
4612 ntq_emit_function(struct v3d_compile *c, nir_function_impl *func)
4613 {
4614 fprintf(stderr, "FUNCTIONS not handled.\n");
4615 abort();
4616 }
4617
4618 static void
4619 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list)
4620 {
4621 foreach_list_typed(nir_cf_node, node, node, list) {
4622 switch (node->type) {
4623 case nir_cf_node_block:
4624 ntq_emit_block(c, nir_cf_node_as_block(node));
4625 break;
4626
4627 case nir_cf_node_if:
4628 ntq_emit_if(c, nir_cf_node_as_if(node));
4629 break;
4630
4631 case nir_cf_node_loop:
4632 ntq_emit_loop(c, nir_cf_node_as_loop(node));
4633 break;
4634
4635 case nir_cf_node_function:
4636 ntq_emit_function(c, nir_cf_node_as_function(node));
4637 break;
4638
4639 default:
4640 fprintf(stderr, "Unknown NIR node type\n");
4641 abort();
4642 }
4643 }
4644 }
4645
4646 static void
4647 ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
4648 {
4649 ntq_setup_registers(c, impl);
4650 ntq_emit_cf_list(c, &impl->body);
4651 }
4652
4653 static bool
4654 vir_inst_reads_reg(struct qinst *inst, struct qreg r)
4655 {
4656 for (int i = 0; i < vir_get_nsrc(inst); i++) {
4657 if (inst->src[i].file == r.file && inst->src[i].index == r.index)
4658 return true;
4659 }
4660 return false;
4661 }
4662
4663 static void
4664 sched_flags_in_block(struct v3d_compile *c, struct qblock *block)
4665 {
4666 struct qinst *flags_inst = NULL;
4667 list_for_each_entry_safe_rev(struct qinst, inst, &block->instructions, link) {
4668 /* Check for cases that would prevent us from moving a flags
4669 * instruction any earlier than this instruction:
4670 *
4671 * - The flags instruction reads the result of this instr.
4672 * - The instruction reads or writes flags.
4673 */
4674 if (flags_inst) {
4675 if (vir_inst_reads_reg(flags_inst, inst->dst) ||
4676 v3d_qpu_writes_flags(&inst->qpu) ||
4677 v3d_qpu_reads_flags(&inst->qpu)) {
4678 list_move_to(&flags_inst->link, &inst->link);
4679 flags_inst = NULL;
4680 }
4681 }
4682
4683 /* Skip if this instruction does more than just write flags */
4684 if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
4685 inst->dst.file != QFILE_NULL ||
4686 !v3d_qpu_writes_flags(&inst->qpu)) {
4687 continue;
4688 }
4689
4690 /* If we already had a flags_inst we should've moved it after
4691 * this instruction in the if (flags_inst) above.
4692 */
4693 assert(!flags_inst);
4694 flags_inst = inst;
4695 }
4696
4697 /* If we reached the beginning of the block and we still have a flags
4698 * instruction selected we can put it at the top of the block.
4699 */
4700 if (flags_inst) {
4701 list_move_to(&flags_inst->link, &block->instructions);
4702 flags_inst = NULL;
4703 }
4704 }
4705
4706 /**
4707 * The purpose of this pass is to move instructions that are only concerned
4708 * with producing flags as early as possible, to hopefully reduce the
4709 * liveness of their source arguments.
4710 */
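/* Illustrative (pseudo-VIR) example of the intent:
 *
 *    mov  t0, a
 *    add  t1, b, c          <- unrelated instruction
 *    mov.pushz -, t0        <- flags-only write
 *
 * becomes
 *
 *    mov  t0, a
 *    mov.pushz -, t0
 *    add  t1, b, c
 *
 * as long as no instruction in between reads or writes the flags or
 * defines a source of the flags instruction.
 */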
4711 static void
4712 sched_flags(struct v3d_compile *c)
4713 {
4714 vir_for_each_block(block, c)
4715 sched_flags_in_block(c, block);
4716 }
4717
4718 static void
4719 nir_to_vir(struct v3d_compile *c)
4720 {
4721 switch (c->s->info.stage) {
4722 case MESA_SHADER_FRAGMENT:
4723 c->start_msf = vir_MSF(c);
4724 if (c->devinfo->ver < 71)
4725 c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
4726 else
4727 c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3));
4728
4729 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
4730 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
4731
4732 /* V3D 4.x can disable implicit varyings if they are not used */
4733 c->fs_uses_primitive_id =
4734 nir_find_variable_with_location(c->s, nir_var_shader_in,
4735 VARYING_SLOT_PRIMITIVE_ID);
4736 if (c->fs_uses_primitive_id && !c->fs_key->has_gs) {
4737 c->primitive_id =
4738 emit_fragment_varying(c, NULL, -1, 0, 0);
4739 }
4740
4741 if (c->fs_key->is_points && program_reads_point_coord(c)) {
4742 c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0);
4743 c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0);
4744 c->uses_implicit_point_line_varyings = true;
4745 } else if (c->fs_key->is_lines &&
4746 (BITSET_TEST(c->s->info.system_values_read,
4747 SYSTEM_VALUE_LINE_COORD))) {
4748 c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0);
4749 c->uses_implicit_point_line_varyings = true;
4750 }
4751 break;
4752 case MESA_SHADER_COMPUTE:
4753 /* Set up the TSO for barriers, assuming we do some. */
4754 if (c->devinfo->ver < 42) {
4755 vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
4756 V3D_QPU_WADDR_SYNC));
4757 }
4758
4759 if (c->devinfo->ver == 42) {
4760 c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
4761 c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
4762 } else if (c->devinfo->ver >= 71) {
4763 c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3));
4764 c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
4765 }
4766
4767 /* Set up the division between gl_LocalInvocationIndex and
4768 * wg_in_mem in the payload reg.
4769 */
4770 int wg_size = (c->s->info.workgroup_size[0] *
4771 c->s->info.workgroup_size[1] *
4772 c->s->info.workgroup_size[2]);
4773 c->local_invocation_index_bits =
4774 ffs(util_next_power_of_two(MAX2(wg_size, 64))) - 1;
4775 assert(c->local_invocation_index_bits <= 8);
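/* E.g. a 10x10x1 workgroup (wg_size = 100) rounds up to 128, so
 * ffs(128) - 1 = 7 bits are reserved for the local invocation index;
 * the minimum (wg_size <= 64) reserves 6 bits.
 */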
4776
4777 if (c->s->info.shared_size || c->s->info.cs.has_variable_shared_mem) {
4778 struct qreg wg_in_mem = vir_SHR(c, c->cs_payload[1],
4779 vir_uniform_ui(c, 16));
4780 if (c->s->info.workgroup_size[0] != 1 ||
4781 c->s->info.workgroup_size[1] != 1 ||
4782 c->s->info.workgroup_size[2] != 1) {
4783 int wg_bits = (16 -
4784 c->local_invocation_index_bits);
4785 int wg_mask = (1 << wg_bits) - 1;
4786 wg_in_mem = vir_AND(c, wg_in_mem,
4787 vir_uniform_ui(c, wg_mask));
4788 }
4789
4790 struct qreg shared_per_wg;
4791 if (c->s->info.cs.has_variable_shared_mem) {
4792 shared_per_wg = vir_uniform(c, QUNIFORM_SHARED_SIZE, 0);
4793 } else {
4794 shared_per_wg = vir_uniform_ui(c, c->s->info.shared_size);
4795 }
4796
4797 c->cs_shared_offset =
4798 vir_ADD(c,
4799 vir_uniform(c, QUNIFORM_SHARED_OFFSET,0),
4800 vir_UMUL(c, wg_in_mem, shared_per_wg));
4801 }
4802 break;
4803 default:
4804 break;
4805 }
4806
4807 if (c->s->scratch_size) {
4808 v3d_setup_spill_base(c);
4809 c->spill_size += V3D_CHANNELS * c->s->scratch_size;
4810 }
4811
4812 switch (c->s->info.stage) {
4813 case MESA_SHADER_VERTEX:
4814 ntq_setup_vs_inputs(c);
4815 break;
4816 case MESA_SHADER_GEOMETRY:
4817 ntq_setup_gs_inputs(c);
4818 break;
4819 case MESA_SHADER_FRAGMENT:
4820 ntq_setup_fs_inputs(c);
4821 break;
4822 case MESA_SHADER_COMPUTE:
4823 break;
4824 default:
4825 unreachable("unsupported shader stage");
4826 }
4827
4828 ntq_setup_outputs(c);
4829
4830 /* Find the main function and emit the body. */
4831 nir_foreach_function(function, c->s) {
4832 assert(function->is_entrypoint);
4833 assert(function->impl);
4834 ntq_emit_impl(c, function->impl);
4835 }
4836 }
4837
4838 /**
4839 * When demoting a shader down to single-threaded, this removes the THRSW
4840 * instructions (one will still be inserted at v3d_vir_to_qpu() for the
4841 * program end).
4842 */
4843 static void
4844 vir_remove_thrsw(struct v3d_compile *c)
4845 {
4846 vir_for_each_block(block, c) {
4847 vir_for_each_inst_safe(inst, block) {
4848 if (inst->qpu.sig.thrsw)
4849 vir_remove_instruction(c, inst);
4850 }
4851 }
4852
4853 c->last_thrsw = NULL;
4854 }
4855
4856 /**
4857 * This makes sure we have a top-level last thread switch which signals the
4858 * start of the last thread section, which may include adding a new thrsw
4859 * instruction if needed. We don't allow spilling in the last thread section, so
4860 * if we need to do any spills that inject additional thread switches later on,
4861 * we ensure this thread switch will still be the last thread switch in the
4862 * program, which makes last thread switch signalling a lot easier when we have
4863 * spilling. If in the end we don't need to spill to compile the program and we
4864 * injected a new thread switch instruction here only for that, we will
4865 * eventually restore the previous last thread switch and remove the one we
4866 * added here.
4867 */
4868 static void
4869 vir_emit_last_thrsw(struct v3d_compile *c,
4870 struct qinst **restore_last_thrsw,
4871 bool *restore_scoreboard_lock)
4872 {
4873 *restore_last_thrsw = c->last_thrsw;
4874
4875 /* If we're threaded and the last THRSW was in conditional code, then
4876 * we need to emit another one so that we can flag it as the last
4877 * thrsw.
4878 */
4879 if (c->last_thrsw && !c->last_thrsw_at_top_level)
4880 vir_emit_thrsw(c);
4881
4882 /* If we're threaded, then we need to mark the last THRSW instruction
4883 * so we can emit a pair of them at QPU emit time.
4884 *
4885 * For V3D 4.x, we can spawn the non-fragment shaders already in the
4886 * post-last-THRSW state, so we can skip this.
4887 */
4888 if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT)
4889 vir_emit_thrsw(c);
4890
4891 /* If we have not inserted a last thread switch yet, do it now to ensure
4892 * any potential spilling we do happens before this. If we don't spill
4893 * in the end, we will restore the previous one.
4894 */
4895 if (*restore_last_thrsw == c->last_thrsw) {
4896 if (*restore_last_thrsw)
4897 (*restore_last_thrsw)->is_last_thrsw = false;
4898 *restore_scoreboard_lock = c->lock_scoreboard_on_first_thrsw;
4899 vir_emit_thrsw(c);
4900 } else {
4901 *restore_last_thrsw = c->last_thrsw;
4902 }
4903
4904 assert(c->last_thrsw);
4905 c->last_thrsw->is_last_thrsw = true;
4906 }
4907
4908 static void
4909 vir_restore_last_thrsw(struct v3d_compile *c,
4910 struct qinst *thrsw,
4911 bool scoreboard_lock)
4912 {
4913 assert(c->last_thrsw);
4914 vir_remove_instruction(c, c->last_thrsw);
4915 c->last_thrsw = thrsw;
4916 if (c->last_thrsw)
4917 c->last_thrsw->is_last_thrsw = true;
4918 c->lock_scoreboard_on_first_thrsw = scoreboard_lock;
4919 }
4920
4921 /* There's a flag in the shader for "center W is needed for reasons other than
4922 * non-centroid varyings", so we just walk the program after VIR optimization
4923 * to see if it's used. It should be harmless to set even if we only use
4924 * center W for varyings.
4925 */
4926 static void
4927 vir_check_payload_w(struct v3d_compile *c)
4928 {
4929 if (c->s->info.stage != MESA_SHADER_FRAGMENT)
4930 return;
4931
4932 vir_for_each_inst_inorder(inst, c) {
4933 for (int i = 0; i < vir_get_nsrc(inst); i++) {
4934 if (inst->src[i].file == c->payload_w.file &&
4935 inst->src[i].index == c->payload_w.index) {
4936 c->uses_center_w = true;
4937 return;
4938 }
4939 }
4940 }
4941 }
4942
4943 void
4944 v3d_nir_to_vir(struct v3d_compile *c)
4945 {
4946 if (V3D_DBG(NIR) ||
4947 v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
4948 fprintf(stderr, "%s prog %d/%d NIR:\n",
4949 vir_get_stage_name(c),
4950 c->program_id, c->variant_id);
4951 nir_print_shader(c->s, stderr);
4952 }
4953
4954 nir_to_vir(c);
4955
4956 bool restore_scoreboard_lock = false;
4957 struct qinst *restore_last_thrsw;
4958
4959 /* Emit the last THRSW before STVPM and TLB writes. */
4960 vir_emit_last_thrsw(c,
4961 &restore_last_thrsw,
4962 &restore_scoreboard_lock);
4963
4964
4965 switch (c->s->info.stage) {
4966 case MESA_SHADER_FRAGMENT:
4967 emit_frag_end(c);
4968 break;
4969 case MESA_SHADER_GEOMETRY:
4970 emit_geom_end(c);
4971 break;
4972 case MESA_SHADER_VERTEX:
4973 emit_vert_end(c);
4974 break;
4975 case MESA_SHADER_COMPUTE:
4976 break;
4977 default:
4978 unreachable("bad stage");
4979 }
4980
4981 if (V3D_DBG(VIR) ||
4982 v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
4983 fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
4984 vir_get_stage_name(c),
4985 c->program_id, c->variant_id);
4986 vir_dump(c);
4987 fprintf(stderr, "\n");
4988 }
4989
4990 vir_optimize(c);
4991 sched_flags(c);
4992
4993 vir_check_payload_w(c);
4994
4995 /* XXX perf: On VC4, we do a VIR-level instruction scheduling here.
4996 * We used that on that platform to pipeline TMU writes and reduce the
4997 * number of thread switches, as well as try (mostly successfully) to
4998 * reduce maximum register pressure to allow more threads. We should
4999 * do something of that sort for V3D -- either instruction scheduling
5000 * here, or delay the THRSW and LDTMUs from our texture
5001 * instructions until the results are needed.
5002 */
5003
5004 if (V3D_DBG(VIR) ||
5005 v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
5006 fprintf(stderr, "%s prog %d/%d VIR:\n",
5007 vir_get_stage_name(c),
5008 c->program_id, c->variant_id);
5009 vir_dump(c);
5010 fprintf(stderr, "\n");
5011 }
5012
5013 /* Attempt to allocate registers for the temporaries. If we fail,
5014 * reduce thread count and try again.
5015 */
5016 int min_threads = 2;
5017 struct qpu_reg *temp_registers;
5018 while (true) {
5019 temp_registers = v3d_register_allocate(c);
5020 if (temp_registers) {
5021 assert(c->spills + c->fills <= c->max_tmu_spills);
5022 break;
5023 }
5024
5025 if (c->threads == min_threads &&
5026 V3D_DBG(RA)) {
5027 fprintf(stderr,
5028 "Failed to register allocate using %s\n",
5029 c->fallback_scheduler ? "the fallback scheduler" :
5030 "the normal scheduler");
5031
5032 vir_dump(c);
5033
5034 char *shaderdb;
5035 int ret = v3d_shaderdb_dump(c, &shaderdb);
5036 if (ret > 0) {
5037 fprintf(stderr, "%s\n", shaderdb);
5038 free(shaderdb);
5039 }
5040 }
5041
5042 if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) {
5043 if (V3D_DBG(PERF)) {
5044 fprintf(stderr,
5045 "Failed to register allocate %s "
5046 "prog %d/%d at %d threads.\n",
5047 vir_get_stage_name(c),
5048 c->program_id, c->variant_id, c->threads);
5049 }
5050 c->compilation_result =
5051 V3D_COMPILATION_FAILED_REGISTER_ALLOCATION;
5052 return;
5053 }
5054
5055 c->spills = 0;
5056 c->fills = 0;
5057 c->threads /= 2;
5058
5059 if (c->threads == 1)
5060 vir_remove_thrsw(c);
5061 }
5062
5063 /* If we didn't spill, then remove the last thread switch we injected
5064 * artificially (if any) and restore the previous one.
5065 */
5066 if (!c->spills && c->last_thrsw != restore_last_thrsw)
5067 vir_restore_last_thrsw(c, restore_last_thrsw, restore_scoreboard_lock);
5068
5069 if (c->spills &&
5070 (V3D_DBG(VIR) ||
5071 v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
5072 fprintf(stderr, "%s prog %d/%d spilled VIR:\n",
5073 vir_get_stage_name(c),
5074 c->program_id, c->variant_id);
5075 vir_dump(c);
5076 fprintf(stderr, "\n");
5077 }
5078
5079 v3d_vir_to_qpu(c, temp_registers);
5080 }
5081