1 /*
2 * Copyright © 2014-2015 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "compiler/nir/nir.h"
25 #include "compiler/nir/nir_deref.h"
26 #include "compiler/nir/nir_legacy.h"
27 #include "compiler/nir/nir_worklist.h"
28 #include "nir/nir_to_tgsi.h"
29 #include "pipe/p_screen.h"
30 #include "pipe/p_state.h"
31 #include "tgsi/tgsi_dump.h"
32 #include "tgsi/tgsi_from_mesa.h"
33 #include "tgsi/tgsi_info.h"
34 #include "tgsi/tgsi_parse.h"
35 #include "tgsi/tgsi_ureg.h"
36 #include "tgsi/tgsi_util.h"
37 #include "util/u_debug.h"
38 #include "util/u_math.h"
39 #include "util/u_memory.h"
40 #include "util/u_dynarray.h"
41
42 struct ntt_insn {
43 enum tgsi_opcode opcode;
44 struct ureg_dst dst[2];
45 struct ureg_src src[4];
46 enum tgsi_texture_type tex_target;
47 enum tgsi_return_type tex_return_type;
48 struct tgsi_texture_offset tex_offset[4];
49
50 unsigned mem_qualifier;
51 enum pipe_format mem_format;
52
53 bool is_tex : 1;
54 bool is_mem : 1;
55 bool precise : 1;
56 };
57
58 struct ntt_block {
59 /* Array of struct ntt_insn */
60 struct util_dynarray insns;
61 int start_ip;
62 int end_ip;
63 };
64
65 struct ntt_reg_interval {
66 uint32_t start, end;
67 };
68
69 struct ntt_compile {
70 nir_shader *s;
71 nir_function_impl *impl;
72 const struct nir_to_tgsi_options *options;
73 struct pipe_screen *screen;
74 struct ureg_program *ureg;
75
76 bool needs_texcoord_semantic;
77 bool native_integers;
78 bool has_txf_lz;
79
80 bool addr_declared[3];
81 struct ureg_dst addr_reg[3];
82
83 /* if condition set up at the end of a block, for ntt_emit_if(). */
84 struct ureg_src if_cond;
85
89 /* TGSI temps for our NIR SSA and register values. */
90 struct ureg_dst *reg_temp;
91 struct ureg_src *ssa_temp;
92
93 struct ntt_reg_interval *liveness;
94
95 /* Map from nir_block to ntt_block */
96 struct hash_table *blocks;
97 struct ntt_block *cur_block;
98 unsigned current_if_else;
99 unsigned cf_label;
100
101 /* Whether we're currently emitting instructions for a precise NIR instruction. */
102 bool precise;
103
104 unsigned num_temps;
105 unsigned first_non_array_temp;
106
107 /* Mappings from driver_location to TGSI input/output number.
108 *
109 * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
110 * their numbers assigned incrementally, unlike inputs or constants.
111 */
112 struct ureg_src *input_index_map;
113 uint64_t centroid_inputs;
114
115 uint32_t first_ubo;
116 uint32_t first_ssbo;
117
118 struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
119 };
120
121 static struct ureg_dst
122 ntt_temp(struct ntt_compile *c)
123 {
124 return ureg_dst_register(TGSI_FILE_TEMPORARY, c->num_temps++);
125 }
126
127 static struct ntt_block *
128 ntt_block_from_nir(struct ntt_compile *c, struct nir_block *block)
129 {
130 struct hash_entry *entry = _mesa_hash_table_search(c->blocks, block);
131 return entry->data;
132 }
133
134 static void ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list);
135 static void ntt_emit_cf_list_ureg(struct ntt_compile *c, struct exec_list *list);
136
137 static struct ntt_insn *
138 ntt_insn(struct ntt_compile *c, enum tgsi_opcode opcode,
139 struct ureg_dst dst,
140 struct ureg_src src0, struct ureg_src src1,
141 struct ureg_src src2, struct ureg_src src3)
142 {
143 struct ntt_insn insn = {
144 .opcode = opcode,
145 .dst = { dst, ureg_dst_undef() },
146 .src = { src0, src1, src2, src3 },
147 .precise = c->precise,
148 };
149 util_dynarray_append(&c->cur_block->insns, struct ntt_insn, insn);
150 return util_dynarray_top_ptr(&c->cur_block->insns, struct ntt_insn);
151 }
152
153 #define OP00( op ) \
154 static inline void ntt_##op(struct ntt_compile *c) \
155 { \
156 ntt_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
157 }
158
159 #define OP01( op ) \
160 static inline void ntt_##op(struct ntt_compile *c, \
161 struct ureg_src src0) \
162 { \
163 ntt_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
164 }
165
166
167 #define OP10( op ) \
168 static inline void ntt_##op(struct ntt_compile *c, \
169 struct ureg_dst dst) \
170 { \
171 ntt_insn(c, TGSI_OPCODE_##op, dst, ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
172 }
173
174 #define OP11( op ) \
175 static inline void ntt_##op(struct ntt_compile *c, \
176 struct ureg_dst dst, \
177 struct ureg_src src0) \
178 { \
179 ntt_insn(c, TGSI_OPCODE_##op, dst, src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
180 }
181
182 #define OP12( op ) \
183 static inline void ntt_##op(struct ntt_compile *c, \
184 struct ureg_dst dst, \
185 struct ureg_src src0, \
186 struct ureg_src src1) \
187 { \
188 ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, ureg_src_undef(), ureg_src_undef()); \
189 }
190
191 #define OP13( op ) \
192 static inline void ntt_##op(struct ntt_compile *c, \
193 struct ureg_dst dst, \
194 struct ureg_src src0, \
195 struct ureg_src src1, \
196 struct ureg_src src2) \
197 { \
198 ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, ureg_src_undef()); \
199 }
200
201 #define OP14( op ) \
202 static inline void ntt_##op(struct ntt_compile *c, \
203 struct ureg_dst dst, \
204 struct ureg_src src0, \
205 struct ureg_src src1, \
206 struct ureg_src src2, \
207 struct ureg_src src3) \
208 { \
209 ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, src3); \
210 }
211
212 /* We hand-craft our tex instructions */
213 #define OP12_TEX(op)
214 #define OP14_TEX(op)
215
216 /* Use a template include to generate a correctly-typed ntt_OP()
217 * function for each TGSI opcode:
218 */
219 #include "gallium/auxiliary/tgsi/tgsi_opcode_tmp.h"
220
221 /**
222 * Interprets a nir_load_const used as a NIR src as a uint.
223 *
224 * For non-native-integers drivers, nir_load_const_instrs used by an integer ALU
225 * instruction (or in a phi-web used by an integer ALU instruction) were
226 * converted to floats and the ALU instruction swapped to the float equivalent.
227 * However, this means that integer load_consts used by intrinsics (which don't
228 * normally get that conversion) may have been reformatted to be floats. Given
229 * that all of our intrinsic nir_src_as_uint() calls are expected to be small,
230 * we can just look and see if they look like floats and convert them back to
231 * ints.
232 */
233 static uint32_t
234 ntt_src_as_uint(struct ntt_compile *c, nir_src src)
235 {
236 uint32_t val = nir_src_as_uint(src);
237 if (!c->native_integers && val >= fui(1.0))
238 val = (uint32_t)uif(val);
239 return val;
240 }
241
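/* Expands a (two-component max) 64-bit writemask into the pairs of 32-bit
 * TGSI channels backing each 64-bit channel: .x -> .xy, .y -> .zw.
 */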
242 static unsigned
243 ntt_64bit_write_mask(unsigned write_mask)
244 {
245 return ((write_mask & 1) ? 0x3 : 0) | ((write_mask & 2) ? 0xc : 0);
246 }
247
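/* Returns an immediate holding double-precision 1.0 (0x3ff0000000000000),
 * stored as lo/hi 32-bit pairs in .xy and .zw.
 */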
248 static struct ureg_src
249 ntt_64bit_1f(struct ntt_compile *c)
250 {
251 return ureg_imm4u(c->ureg,
252 0x00000000, 0x3ff00000,
253 0x00000000, 0x3ff00000);
254 }
255
256 /* Per-channel masks of def/use within the block, and the per-channel
257 * livein/liveout for the block as a whole.
258 */
259 struct ntt_live_reg_block_state {
260 uint8_t *def, *use, *livein, *liveout, *defin, *defout;
261 };
262
263 struct ntt_live_reg_state {
264 unsigned bitset_words;
265
266 struct ntt_reg_interval *regs;
267
268 /* Used in propagate_across_edge() */
269 BITSET_WORD *tmp_live;
270
271 struct ntt_live_reg_block_state *blocks;
272
273 nir_block_worklist worklist;
274 };
275
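/* Marks channels of a temp as used in this block, unless a def earlier in the
 * block already screens them off, and extends the temp's live interval to
 * cover this instruction.
 */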
276 static void
277 ntt_live_reg_mark_use(struct ntt_compile *c, struct ntt_live_reg_block_state *bs,
278 int ip, unsigned index, unsigned used_mask)
279 {
280 bs->use[index] |= used_mask & ~bs->def[index];
281
282 c->liveness[index].start = MIN2(c->liveness[index].start, ip);
283 c->liveness[index].end = MAX2(c->liveness[index].end, ip);
284
285 }
286 static void
287 ntt_live_reg_setup_def_use(struct ntt_compile *c, nir_function_impl *impl, struct ntt_live_reg_state *state)
288 {
289 for (int i = 0; i < impl->num_blocks; i++) {
290 state->blocks[i].def = rzalloc_array(state->blocks, uint8_t, c->num_temps);
291 state->blocks[i].defin = rzalloc_array(state->blocks, uint8_t, c->num_temps);
292 state->blocks[i].defout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
293 state->blocks[i].use = rzalloc_array(state->blocks, uint8_t, c->num_temps);
294 state->blocks[i].livein = rzalloc_array(state->blocks, uint8_t, c->num_temps);
295 state->blocks[i].liveout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
296 }
297
298 int ip = 0;
299 nir_foreach_block(block, impl) {
300 struct ntt_live_reg_block_state *bs = &state->blocks[block->index];
301 struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
302
303 ntt_block->start_ip = ip;
304
305 util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
306 const struct tgsi_opcode_info *opcode_info =
307 tgsi_get_opcode_info(insn->opcode);
308
309 /* Set up use[] for the srcs.
310 *
311 * Uses are the channels of the reg read in the block that don't have a
312 * preceding def to screen them off. Note that we don't do per-element
313 * tracking of array regs, so they're never screened off.
314 */
315 for (int i = 0; i < opcode_info->num_src; i++) {
316 if (insn->src[i].File != TGSI_FILE_TEMPORARY)
317 continue;
318 int index = insn->src[i].Index;
319
320 uint32_t used_mask = tgsi_util_get_src_usage_mask(insn->opcode, i,
321 insn->dst->WriteMask,
322 insn->src[i].SwizzleX,
323 insn->src[i].SwizzleY,
324 insn->src[i].SwizzleZ,
325 insn->src[i].SwizzleW,
326 insn->tex_target,
327 insn->tex_target);
328
329 assert(!insn->src[i].Indirect || index < c->first_non_array_temp);
330 ntt_live_reg_mark_use(c, bs, ip, index, used_mask);
331 }
332
333 if (insn->is_tex) {
334 for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
335 if (insn->tex_offset[i].File == TGSI_FILE_TEMPORARY)
336 ntt_live_reg_mark_use(c, bs, ip, insn->tex_offset[i].Index, 0xf);
337 }
338 }
339
340 /* Set up def[] for the dsts.
341 *
342 * Defs are the unconditionally-written (not R/M/W) channels of the reg in
343 * the block that don't have a preceding use.
344 */
345 for (int i = 0; i < opcode_info->num_dst; i++) {
346 if (insn->dst[i].File != TGSI_FILE_TEMPORARY)
347 continue;
348 int index = insn->dst[i].Index;
349 uint32_t writemask = insn->dst[i].WriteMask;
350
351 bs->def[index] |= writemask & ~bs->use[index];
352 bs->defout[index] |= writemask;
353
354 assert(!insn->dst[i].Indirect || index < c->first_non_array_temp);
355 c->liveness[index].start = MIN2(c->liveness[index].start, ip);
356 c->liveness[index].end = MAX2(c->liveness[index].end, ip);
357 }
358 ip++;
359 }
360
361 ntt_block->end_ip = ip;
362 }
363 }
364
365 static void
366 ntt_live_regs(struct ntt_compile *c, nir_function_impl *impl)
367 {
368 nir_metadata_require(impl, nir_metadata_block_index);
369
370 c->liveness = rzalloc_array(c, struct ntt_reg_interval, c->num_temps);
371
372 struct ntt_live_reg_state state = {
373 .blocks = rzalloc_array(impl, struct ntt_live_reg_block_state, impl->num_blocks),
374 };
375
376 /* The intervals start out with start > end (indicating unused) */
377 for (int i = 0; i < c->num_temps; i++)
378 c->liveness[i].start = ~0;
379
380 ntt_live_reg_setup_def_use(c, impl, &state);
381
382 /* Make a forward-order worklist of all the blocks. */
383 nir_block_worklist_init(&state.worklist, impl->num_blocks, NULL);
384 nir_foreach_block(block, impl) {
385 nir_block_worklist_push_tail(&state.worklist, block);
386 }
387
388 /* Propagate defin/defout down the CFG to calculate the live variables
389 * potentially defined along any possible control flow path. We'll use this
390 * to keep things like conditional defs of the reg (or array regs where we
391 * don't track defs!) from making the reg's live range extend back to the
392 * start of the program.
393 */
394 while (!nir_block_worklist_is_empty(&state.worklist)) {
395 nir_block *block = nir_block_worklist_pop_head(&state.worklist);
396 for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
397 nir_block *succ = block->successors[j];
398 if (!succ || succ->index == impl->num_blocks)
399 continue;
400
401 for (int i = 0; i < c->num_temps; i++) {
402 uint8_t new_def = state.blocks[block->index].defout[i] & ~state.blocks[succ->index].defin[i];
403
404 if (new_def) {
405 state.blocks[succ->index].defin[i] |= new_def;
406 state.blocks[succ->index].defout[i] |= new_def;
407 nir_block_worklist_push_tail(&state.worklist, succ);
408 }
409 }
410 }
411 }
412
413 /* Make a reverse-order worklist of all the blocks. */
414 nir_foreach_block(block, impl) {
415 nir_block_worklist_push_head(&state.worklist, block);
416 }
417
418 /* We're now ready to work through the worklist and update the liveness sets
419 * of each of the blocks. As long as we keep the worklist up-to-date as we
420 * go, everything will get covered.
421 */
422 while (!nir_block_worklist_is_empty(&state.worklist)) {
423 /* We pop them off in the reverse order we pushed them on. This way
424 * the first walk of the instructions is backwards so we only walk
425 * once in the case of no control flow.
426 */
427 nir_block *block = nir_block_worklist_pop_head(&state.worklist);
428 struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
429 struct ntt_live_reg_block_state *bs = &state.blocks[block->index];
430
431 for (int i = 0; i < c->num_temps; i++) {
432 /* Collect livein from our successors to include in our liveout. */
433 for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
434 nir_block *succ = block->successors[j];
435 if (!succ || succ->index == impl->num_blocks)
436 continue;
437 struct ntt_live_reg_block_state *sbs = &state.blocks[succ->index];
438
439 uint8_t new_liveout = sbs->livein[i] & ~bs->liveout[i];
440 if (new_liveout) {
441 if (state.blocks[block->index].defout[i])
442 c->liveness[i].end = MAX2(c->liveness[i].end, ntt_block->end_ip);
443 bs->liveout[i] |= sbs->livein[i];
444 }
445 }
446
447 /* Propagate use requests from either our block's uses or our
448 * non-screened-off liveout up to our predecessors.
449 */
450 uint8_t new_livein = ((bs->use[i] | (bs->liveout[i] & ~bs->def[i])) &
451 ~bs->livein[i]);
452 if (new_livein) {
453 bs->livein[i] |= new_livein;
454 set_foreach(block->predecessors, entry) {
455 nir_block *pred = (void *)entry->key;
456 nir_block_worklist_push_tail(&state.worklist, pred);
457 }
458
459 if (new_livein & state.blocks[block->index].defin[i])
460 c->liveness[i].start = MIN2(c->liveness[i].start, ntt_block->start_ip);
461 }
462 }
463 }
464
465 ralloc_free(state.blocks);
466 nir_block_worklist_fini(&state.worklist);
467 }
468
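/* Called for each temp at each instruction IP: declares a ureg temporary when
 * the temp's live interval starts, and releases it (once) when the interval
 * ends so the slot can be reused.  Array temps below first_non_array_temp are
 * pre-allocated and skipped.
 */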
469 static void
470 ntt_ra_check(struct ntt_compile *c, unsigned *ra_map, BITSET_WORD *released, int ip, unsigned index)
471 {
472 if (index < c->first_non_array_temp)
473 return;
474
475 if (c->liveness[index].start == ip && ra_map[index] == ~0)
476 ra_map[index] = ureg_DECL_temporary(c->ureg).Index;
477
478 if (c->liveness[index].end == ip && !BITSET_TEST(released, index)) {
479 ureg_release_temporary(c->ureg, ureg_dst_register(TGSI_FILE_TEMPORARY, ra_map[index]));
480 BITSET_SET(released, index);
481 }
482 }
483
484 static void
485 ntt_allocate_regs(struct ntt_compile *c, nir_function_impl *impl)
486 {
487 ntt_live_regs(c, impl);
488
489 unsigned *ra_map = ralloc_array(c, unsigned, c->num_temps);
490 unsigned *released = rzalloc_array(c, BITSET_WORD, BITSET_WORDS(c->num_temps));
491
492 /* No RA on NIR array regs */
493 for (int i = 0; i < c->first_non_array_temp; i++)
494 ra_map[i] = i;
495
496 for (int i = c->first_non_array_temp; i < c->num_temps; i++)
497 ra_map[i] = ~0;
498
499 int ip = 0;
500 nir_foreach_block(block, impl) {
501 struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
502
503 for (int i = 0; i < c->num_temps; i++)
504 ntt_ra_check(c, ra_map, released, ip, i);
505
506 util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
507 const struct tgsi_opcode_info *opcode_info =
508 tgsi_get_opcode_info(insn->opcode);
509
510 for (int i = 0; i < opcode_info->num_src; i++) {
511 if (insn->src[i].File == TGSI_FILE_TEMPORARY) {
512 ntt_ra_check(c, ra_map, released, ip, insn->src[i].Index);
513 insn->src[i].Index = ra_map[insn->src[i].Index];
514 }
515 }
516
517 if (insn->is_tex) {
518 for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
519 if (insn->tex_offset[i].File == TGSI_FILE_TEMPORARY) {
520 ntt_ra_check(c, ra_map, released, ip, insn->tex_offset[i].Index);
521 insn->tex_offset[i].Index = ra_map[insn->tex_offset[i].Index];
522 }
523 }
524 }
525
526 for (int i = 0; i < opcode_info->num_dst; i++) {
527 if (insn->dst[i].File == TGSI_FILE_TEMPORARY) {
528 ntt_ra_check(c, ra_map, released, ip, insn->dst[i].Index);
529 insn->dst[i].Index = ra_map[insn->dst[i].Index];
530 }
531 }
532 ip++;
533 }
534
535 for (int i = 0; i < c->num_temps; i++)
536 ntt_ra_check(c, ra_map, released, ip, i);
537 }
538 }
539
540 static void
541 ntt_allocate_regs_unoptimized(struct ntt_compile *c, nir_function_impl *impl)
542 {
543 for (int i = c->first_non_array_temp; i < c->num_temps; i++)
544 ureg_DECL_temporary(c->ureg);
545 }
546
547
548 /**
549 * Try to find an iadd of a constant value with a non-constant value in the
550 * nir_src's first component, returning the constant offset and replacing *src
551 * with the non-constant component.
552 */
553 static uint32_t
554 ntt_extract_const_src_offset(nir_src *src)
555 {
556 nir_scalar s = nir_get_scalar(src->ssa, 0);
557
558 while (nir_scalar_is_alu(s)) {
559 nir_alu_instr *alu = nir_instr_as_alu(s.def->parent_instr);
560
561 if (alu->op == nir_op_iadd) {
562 for (int i = 0; i < 2; i++) {
563 nir_const_value *v = nir_src_as_const_value(alu->src[i].src);
564 if (v != NULL) {
565 *src = alu->src[1 - i].src;
566 return v[alu->src[i].swizzle[s.comp]].u32;
567 }
568 }
569
570 return 0;
571 }
572
573 /* We'd like to reuse nir_scalar_chase_movs(), but it assumes SSA and that
574 * seems reasonable for something used in inner loops of the compiler.
575 */
576 if (alu->op == nir_op_mov) {
577 s.def = alu->src[0].src.ssa;
578 s.comp = alu->src[0].swizzle[s.comp];
579 } else if (nir_op_is_vec(alu->op)) {
580 s.def = alu->src[s.comp].src.ssa;
581 s.comp = alu->src[s.comp].swizzle[0];
582 } else {
583 return 0;
584 }
585 }
586
587 return 0;
588 }
589
590 static const struct glsl_type *
591 ntt_shader_input_type(struct ntt_compile *c,
592 struct nir_variable *var)
593 {
594 switch (c->s->info.stage) {
595 case MESA_SHADER_GEOMETRY:
596 case MESA_SHADER_TESS_EVAL:
597 case MESA_SHADER_TESS_CTRL:
598 if (glsl_type_is_array(var->type))
599 return glsl_get_array_element(var->type);
600 else
601 return var->type;
602 default:
603 return var->type;
604 }
605 }
606
607 static void
608 ntt_get_gl_varying_semantic(struct ntt_compile *c, unsigned location,
609 unsigned *semantic_name, unsigned *semantic_index)
610 {
611 /* We want to use most of tgsi_get_gl_varying_semantic(), but the
612 * !texcoord shifting has already been applied, so avoid that.
613 */
614 if (!c->needs_texcoord_semantic &&
615 (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_PATCH0)) {
616 *semantic_name = TGSI_SEMANTIC_GENERIC;
617 *semantic_index = location - VARYING_SLOT_VAR0;
618 return;
619 }
620
621 tgsi_get_gl_varying_semantic(location, true,
622 semantic_name, semantic_index);
623 }
624
625 /* TGSI varying declarations have a component usage mask associated (used by
626 * r600 and svga).
627 */
628 static uint32_t
629 ntt_tgsi_usage_mask(unsigned start_component, unsigned num_components,
630 bool is_64)
631 {
632 uint32_t usage_mask =
633 u_bit_consecutive(start_component, num_components);
634
635 if (is_64) {
636 if (start_component >= 2)
637 usage_mask >>= 2;
638
639 uint32_t tgsi_usage_mask = 0;
640
641 if (usage_mask & TGSI_WRITEMASK_X)
642 tgsi_usage_mask |= TGSI_WRITEMASK_XY;
643 if (usage_mask & TGSI_WRITEMASK_Y)
644 tgsi_usage_mask |= TGSI_WRITEMASK_ZW;
645
646 return tgsi_usage_mask;
647 } else {
648 return usage_mask;
649 }
650 }
651
652 /* TGSI varying declarations have a component usage mask associated (used by
653 * r600 and svga).
654 */
655 static uint32_t
656 ntt_tgsi_var_usage_mask(const struct nir_variable *var)
657 {
658 const struct glsl_type *type_without_array =
659 glsl_without_array(var->type);
660 unsigned num_components = glsl_get_vector_elements(type_without_array);
661 if (num_components == 0) /* structs */
662 num_components = 4;
663
664 return ntt_tgsi_usage_mask(var->data.location_frac, num_components,
665 glsl_type_is_64bit(type_without_array));
666 }
667
668 static struct ureg_dst
669 ntt_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
670 {
671 nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
672 int base = nir_intrinsic_base(instr);
673 *frac = nir_intrinsic_component(instr);
674 bool is_64 = nir_src_bit_size(instr->src[0]) == 64;
675
676 struct ureg_dst out;
677 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
678 unsigned semantic_name, semantic_index;
679 tgsi_get_gl_frag_result_semantic(semantics.location,
680 &semantic_name, &semantic_index);
681 semantic_index += semantics.dual_source_blend_index;
682
683 switch (semantics.location) {
684 case FRAG_RESULT_DEPTH:
685 *frac = 2; /* z write goes to the .z channel in TGSI */
686 break;
687 case FRAG_RESULT_STENCIL:
688 *frac = 1;
689 break;
690 default:
691 break;
692 }
693
694 out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
695 } else {
696 unsigned semantic_name, semantic_index;
697
698 ntt_get_gl_varying_semantic(c, semantics.location,
699 &semantic_name, &semantic_index);
700
701 uint32_t usage_mask = ntt_tgsi_usage_mask(*frac,
702 instr->num_components,
703 is_64);
704 uint32_t gs_streams = semantics.gs_streams;
705 for (int i = 0; i < 4; i++) {
706 if (!(usage_mask & (1 << i)))
707 gs_streams &= ~(0x3 << 2 * i);
708 }
709
710 /* No driver appears to use array_id of outputs. */
711 unsigned array_id = 0;
712
713 /* This bit is lost in the i/o semantics, but it's unused in in-tree
714 * drivers.
715 */
716 bool invariant = semantics.invariant;
717
718 unsigned num_slots = semantics.num_slots;
719 if (semantics.location == VARYING_SLOT_TESS_LEVEL_INNER ||
720 semantics.location == VARYING_SLOT_TESS_LEVEL_OUTER) {
721 /* Compact vars get a num_slots in NIR as number of components, but we
722 * want the number of vec4 slots here.
723 */
724 num_slots = 1;
725 }
726
727 out = ureg_DECL_output_layout(c->ureg,
728 semantic_name, semantic_index,
729 gs_streams,
730 base,
731 usage_mask,
732 array_id,
733 num_slots,
734 invariant);
735 }
736
737 unsigned write_mask;
738 if (nir_intrinsic_has_write_mask(instr))
739 write_mask = nir_intrinsic_write_mask(instr);
740 else
741 write_mask = ((1 << instr->num_components) - 1) << *frac;
742
743 if (is_64) {
744 write_mask = ntt_64bit_write_mask(write_mask);
745 if (*frac >= 2)
746 write_mask = write_mask << 2;
747 } else {
748 write_mask = write_mask << *frac;
749 }
750 return ureg_writemask(out, write_mask);
751 }
752
753 static bool
754 ntt_try_store_in_tgsi_output_with_use(struct ntt_compile *c,
755 struct ureg_dst *dst,
756 nir_src *src)
757 {
758 *dst = ureg_dst_undef();
759
760 switch (c->s->info.stage) {
761 case MESA_SHADER_FRAGMENT:
762 case MESA_SHADER_VERTEX:
763 break;
764 default:
765 /* tgsi_exec (at least) requires that output stores happen per vertex
766 * emitted, you don't get to reuse a previous output value for the next
767 * vertex.
768 */
769 return false;
770 }
771
772 if (nir_src_is_if(src))
773 return false;
774
775 if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
776 return false;
777
778 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(nir_src_parent_instr(src));
779 if (intr->intrinsic != nir_intrinsic_store_output ||
780 !nir_src_is_const(intr->src[1])) {
781 return false;
782 }
783
784 uint32_t frac;
785 *dst = ntt_output_decl(c, intr, &frac);
786 dst->Index += ntt_src_as_uint(c, intr->src[1]);
787
788 return frac == 0;
789 }
790
791 /* If this reg is used only for storing an output, then in the simple
792 * cases we can write directly to the TGSI output instead of having
793 * store_output emit its own MOV.
794 */
795 static bool
796 ntt_try_store_reg_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
797 nir_intrinsic_instr *reg_decl)
798 {
799 assert(reg_decl->intrinsic == nir_intrinsic_decl_reg);
800
801 *dst = ureg_dst_undef();
802
803 /* Look for a single use for try_store_in_tgsi_output */
804 nir_src *use = NULL;
805 nir_foreach_reg_load(src, reg_decl) {
806 nir_intrinsic_instr *load = nir_instr_as_intrinsic(nir_src_parent_instr(src));
807 nir_foreach_use_including_if(load_use, &load->def) {
808 /* We can only have one use */
809 if (use != NULL)
810 return false;
811
812 use = load_use;
813 }
814 }
815
816 if (use == NULL)
817 return false;
818
819 return ntt_try_store_in_tgsi_output_with_use(c, dst, use);
820 }
821
822 /* If this SSA def is used only for storing an output, then in the simple
823 * cases we can write directly to the TGSI output instead of having
824 * store_output emit its own MOV.
825 */
826 static bool
827 ntt_try_store_ssa_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
828 nir_def *def)
829 {
830 *dst = ureg_dst_undef();
831
832 if (!list_is_singular(&def->uses))
833 return false;
834
835 nir_foreach_use_including_if(use, def) {
836 return ntt_try_store_in_tgsi_output_with_use(c, dst, use);
837 }
838 unreachable("We have one use");
839 }
840
841 static void
842 ntt_setup_inputs(struct ntt_compile *c)
843 {
844 if (c->s->info.stage != MESA_SHADER_FRAGMENT)
845 return;
846
847 unsigned num_inputs = 0;
848 int num_input_arrays = 0;
849
850 nir_foreach_shader_in_variable(var, c->s) {
851 const struct glsl_type *type = ntt_shader_input_type(c, var);
852 unsigned array_len =
853 glsl_count_attribute_slots(type, false);
854
855 num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
856 }
857
858 c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);
859
860 nir_foreach_shader_in_variable(var, c->s) {
861 const struct glsl_type *type = ntt_shader_input_type(c, var);
862 unsigned array_len =
863 glsl_count_attribute_slots(type, false);
864
865 unsigned interpolation = TGSI_INTERPOLATE_CONSTANT;
866 unsigned sample_loc;
867 struct ureg_src decl;
868
869 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
870 interpolation =
871 tgsi_get_interp_mode(var->data.interpolation,
872 var->data.location == VARYING_SLOT_COL0 ||
873 var->data.location == VARYING_SLOT_COL1);
874
875 if (var->data.location == VARYING_SLOT_POS)
876 interpolation = TGSI_INTERPOLATE_LINEAR;
877 }
878
879 unsigned semantic_name, semantic_index;
880 ntt_get_gl_varying_semantic(c, var->data.location,
881 &semantic_name, &semantic_index);
882
883 if (var->data.sample) {
884 sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
885 } else if (var->data.centroid) {
886 sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
887 c->centroid_inputs |= (BITSET_MASK(array_len) <<
888 var->data.driver_location);
889 } else {
890 sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
891 }
892
893 unsigned array_id = 0;
894 if (glsl_type_is_array(type))
895 array_id = ++num_input_arrays;
896
897 uint32_t usage_mask = ntt_tgsi_var_usage_mask(var);
898
899 decl = ureg_DECL_fs_input_centroid_layout(c->ureg,
900 semantic_name,
901 semantic_index,
902 interpolation,
903 sample_loc,
904 var->data.driver_location,
905 usage_mask,
906 array_id, array_len);
907
908 if (semantic_name == TGSI_SEMANTIC_FACE) {
909 struct ureg_dst temp = ntt_temp(c);
910 if (c->native_integers) {
911 /* NIR is ~0 front and 0 back, while TGSI is +1 front */
912 ntt_SGE(c, temp, decl, ureg_imm1f(c->ureg, 0));
913 } else {
914 /* tgsi docs say that floating point FACE will be positive for
915 * frontface and negative for backface, but realistically
916 * GLSL-to-TGSI had been doing MOV_SAT to turn it into 0.0 vs 1.0.
917 * Copy that behavior, since some drivers (r300) have been doing a
918 * 0.0 vs 1.0 backface (and I don't think anybody has a non-1.0
919 * front face).
920 */
921 temp.Saturate = true;
922 ntt_MOV(c, temp, decl);
923
924 }
925 decl = ureg_src(temp);
926 }
927
928 for (unsigned i = 0; i < array_len; i++) {
929 c->input_index_map[var->data.driver_location + i] = decl;
930 c->input_index_map[var->data.driver_location + i].Index += i;
931 }
932 }
933 }
934
935 static int
936 ntt_sort_by_location(const nir_variable *a, const nir_variable *b)
937 {
938 return a->data.location - b->data.location;
939 }
940
941 /**
942 * Workaround for virglrenderer requiring that TGSI FS output color variables
943 * are declared in order. Besides, it's a lot nicer to read the TGSI this way.
944 */
945 static void
946 ntt_setup_outputs(struct ntt_compile *c)
947 {
948 if (c->s->info.stage != MESA_SHADER_FRAGMENT)
949 return;
950
951 nir_sort_variables_with_modes(c->s, ntt_sort_by_location, nir_var_shader_out);
952
953 nir_foreach_shader_out_variable(var, c->s) {
954 if (var->data.location == FRAG_RESULT_COLOR)
955 ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);
956
957 unsigned semantic_name, semantic_index;
958 tgsi_get_gl_frag_result_semantic(var->data.location,
959 &semantic_name, &semantic_index);
960
961 (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index);
962 }
963 }
964
965 static enum tgsi_texture_type
966 tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array, bool is_shadow)
967 {
968 switch (dim) {
969 case GLSL_SAMPLER_DIM_1D:
970 if (is_shadow)
971 return is_array ? TGSI_TEXTURE_SHADOW1D_ARRAY : TGSI_TEXTURE_SHADOW1D;
972 else
973 return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
974 case GLSL_SAMPLER_DIM_2D:
975 case GLSL_SAMPLER_DIM_EXTERNAL:
976 if (is_shadow)
977 return is_array ? TGSI_TEXTURE_SHADOW2D_ARRAY : TGSI_TEXTURE_SHADOW2D;
978 else
979 return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
980 case GLSL_SAMPLER_DIM_3D:
981 return TGSI_TEXTURE_3D;
982 case GLSL_SAMPLER_DIM_CUBE:
983 if (is_shadow)
984 return is_array ? TGSI_TEXTURE_SHADOWCUBE_ARRAY : TGSI_TEXTURE_SHADOWCUBE;
985 else
986 return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
987 case GLSL_SAMPLER_DIM_RECT:
988 if (is_shadow)
989 return TGSI_TEXTURE_SHADOWRECT;
990 else
991 return TGSI_TEXTURE_RECT;
992 case GLSL_SAMPLER_DIM_MS:
993 return is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
994 case GLSL_SAMPLER_DIM_BUF:
995 return TGSI_TEXTURE_BUFFER;
996 default:
997 unreachable("unknown sampler dim");
998 }
999 }
1000
1001 static enum tgsi_return_type
1002 tgsi_return_type_from_base_type(enum glsl_base_type type)
1003 {
1004 switch (type) {
1005 case GLSL_TYPE_INT:
1006 return TGSI_RETURN_TYPE_SINT;
1007 case GLSL_TYPE_UINT:
1008 return TGSI_RETURN_TYPE_UINT;
1009 case GLSL_TYPE_FLOAT:
1010 return TGSI_RETURN_TYPE_FLOAT;
1011 default:
1012 unreachable("unexpected texture type");
1013 }
1014 }
1015
1016 static void
1017 ntt_setup_uniforms(struct ntt_compile *c)
1018 {
1019 nir_foreach_uniform_variable(var, c->s) {
1020 if (glsl_type_is_sampler(glsl_without_array(var->type)) ||
1021 glsl_type_is_texture(glsl_without_array(var->type))) {
1022 /* Don't use this size for the check for samplers -- arrays of structs
1023 * containing samplers should be ignored, and just the separate lowered
1024 * sampler uniform decl used.
1025 */
1026 int size = glsl_type_get_sampler_count(var->type) +
1027 glsl_type_get_texture_count(var->type);
1028
1029 const struct glsl_type *stype = glsl_without_array(var->type);
1030 enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(stype),
1031 glsl_sampler_type_is_array(stype),
1032 glsl_sampler_type_is_shadow(stype));
1033 enum tgsi_return_type ret_type = tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype));
1034 for (int i = 0; i < size; i++) {
1035 ureg_DECL_sampler_view(c->ureg, var->data.binding + i,
1036 target, ret_type, ret_type, ret_type, ret_type);
1037 ureg_DECL_sampler(c->ureg, var->data.binding + i);
1038 }
1039 } else if (glsl_contains_atomic(var->type)) {
1040 uint32_t offset = var->data.offset / 4;
1041 uint32_t size = glsl_atomic_size(var->type) / 4;
1042 ureg_DECL_hw_atomic(c->ureg, offset, offset + size - 1, var->data.binding, 0);
1043 }
1044
1045 /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0
1046 * size declaration happens with other UBOs below.
1047 */
1048 }
1049
1050 nir_foreach_image_variable(var, c->s) {
1051 int image_count = glsl_type_get_image_count(var->type);
1052 const struct glsl_type *itype = glsl_without_array(var->type);
1053 enum tgsi_texture_type tex_type =
1054 tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(itype),
1055 glsl_sampler_type_is_array(itype), false);
1056
1057 for (int i = 0; i < image_count; i++) {
1058 c->images[var->data.binding] = ureg_DECL_image(c->ureg,
1059 var->data.binding + i,
1060 tex_type,
1061 var->data.image.format,
1062 !(var->data.access & ACCESS_NON_WRITEABLE),
1063 false);
1064 }
1065 }
1066
1067 c->first_ubo = ~0;
1068
1069 unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0};
1070 nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) {
1071 int ubo = var->data.driver_location;
1072 if (ubo == -1)
1073 continue;
1074
1075 if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo))
1076 c->first_ubo = MIN2(c->first_ubo, ubo);
1077
1078 unsigned size = glsl_get_explicit_size(var->interface_type, false);
1079
1080 int array_size = 1;
1081 if (glsl_type_is_interface(glsl_without_array(var->type)))
1082 array_size = MAX2(1, glsl_get_aoa_size(var->type));
1083
1084 for (int i = 0; i < array_size; i++) {
1085 /* Even if multiple NIR variables are in the same uniform block, their
1086 * explicit size is the size of the block.
1087 */
1088 if (ubo_sizes[ubo + i])
1089 assert(ubo_sizes[ubo + i] == size);
1090
1091 ubo_sizes[ubo + i] = size;
1092 }
1093 }
1094
1095 for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) {
1096 if (ubo_sizes[i])
1097 ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i);
1098 }
1099
1100 if (c->options->lower_ssbo_bindings) {
1101 c->first_ssbo = 255;
1102 nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ssbo) {
1103 if (c->first_ssbo > var->data.binding)
1104 c->first_ssbo = var->data.binding;
1105 }
1106 } else
1107 c->first_ssbo = 0;
1108
1109 /* XXX: nv50 uses the atomic flag to set caching for (lowered) atomic
1110 * counters
1111 */
1112 bool atomic = false;
1113 for (int i = 0; i < c->s->info.num_ssbos; ++i)
1114 ureg_DECL_buffer(c->ureg, c->first_ssbo + i, atomic);
1115
1116 }
1117
1118 static void
1119 ntt_setup_registers(struct ntt_compile *c)
1120 {
1121 assert(c->num_temps == 0);
1122
1123 nir_foreach_reg_decl_safe(nir_reg, nir_shader_get_entrypoint(c->s)) {
1124 /* Permanently allocate all the array regs at the start. */
1125 unsigned num_array_elems = nir_intrinsic_num_array_elems(nir_reg);
1126 unsigned index = nir_reg->def.index;
1127
1128 if (num_array_elems != 0) {
1129 struct ureg_dst decl = ureg_DECL_array_temporary(c->ureg, num_array_elems, true);
1130 c->reg_temp[index] = decl;
1131 assert(c->num_temps == decl.Index);
1132 c->num_temps += num_array_elems;
1133 }
1134 }
1135 c->first_non_array_temp = c->num_temps;
1136
1137 /* After that, allocate non-array regs in our virtual space that we'll
1138 * register-allocate before ureg emit.
1139 */
1140 nir_foreach_reg_decl_safe(nir_reg, nir_shader_get_entrypoint(c->s)) {
1141 unsigned num_array_elems = nir_intrinsic_num_array_elems(nir_reg);
1142 unsigned num_components = nir_intrinsic_num_components(nir_reg);
1143 unsigned bit_size = nir_intrinsic_bit_size(nir_reg);
1144 unsigned index = nir_reg->def.index;
1145
1146 /* We already handled arrays */
1147 if (num_array_elems == 0) {
1148 struct ureg_dst decl;
1149 uint32_t write_mask = BITFIELD_MASK(num_components);
1150
1151 if (!ntt_try_store_reg_in_tgsi_output(c, &decl, nir_reg)) {
1152 if (bit_size == 64) {
1153 if (num_components > 2) {
1154 fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n",
1155 num_components, index);
1156 }
1157
1158 write_mask = ntt_64bit_write_mask(write_mask);
1159 }
1160
1161 decl = ureg_writemask(ntt_temp(c), write_mask);
1162 }
1163 c->reg_temp[index] = decl;
1164 }
1165 }
1166 }
1167
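/* Turns a nir_load_const into a TGSI immediate: float-typed for
 * non-native-integer drivers, uint otherwise, with 64-bit values split into
 * lo/hi 32-bit pairs.
 */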
1168 static struct ureg_src
1169 ntt_get_load_const_src(struct ntt_compile *c, nir_load_const_instr *instr)
1170 {
1171 int num_components = instr->def.num_components;
1172
1173 if (!c->native_integers) {
1174 float values[4];
1175 assert(instr->def.bit_size == 32);
1176 for (int i = 0; i < num_components; i++)
1177 values[i] = uif(instr->value[i].u32);
1178
1179 return ureg_DECL_immediate(c->ureg, values, num_components);
1180 } else {
1181 uint32_t values[4];
1182
1183 if (instr->def.bit_size == 32) {
1184 for (int i = 0; i < num_components; i++)
1185 values[i] = instr->value[i].u32;
1186 } else {
1187 assert(num_components <= 2);
1188 for (int i = 0; i < num_components; i++) {
1189 values[i * 2 + 0] = instr->value[i].u64 & 0xffffffff;
1190 values[i * 2 + 1] = instr->value[i].u64 >> 32;
1191 }
1192 num_components *= 2;
1193 }
1194
1195 return ureg_DECL_immediate_uint(c->ureg, values, num_components);
1196 }
1197 }
1198
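/* Copies an index value into one of the TGSI address registers (declared on
 * first use), using UARL for native-integer drivers or ARL otherwise, and
 * returns its .x for use in indirect addressing.
 */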
1199 static struct ureg_src
1200 ntt_reladdr(struct ntt_compile *c, struct ureg_src addr, int addr_index)
1201 {
1202 assert(addr_index < ARRAY_SIZE(c->addr_reg));
1203
1204 for (int i = 0; i <= addr_index; i++) {
1205 if (!c->addr_declared[i]) {
1206 c->addr_reg[i] = ureg_writemask(ureg_DECL_address(c->ureg),
1207 TGSI_WRITEMASK_X);
1208 c->addr_declared[i] = true;
1209 }
1210 }
1211
1212 if (c->native_integers)
1213 ntt_UARL(c, c->addr_reg[addr_index], addr);
1214 else
1215 ntt_ARL(c, c->addr_reg[addr_index], addr);
1216 return ureg_scalar(ureg_src(c->addr_reg[addr_index]), 0);
1217 }
1218
1219 /* Forward declare for recursion with indirects */
1220 static struct ureg_src
1221 ntt_get_src(struct ntt_compile *c, nir_src src);
1222
1223 static struct ureg_src
1224 ntt_get_chased_src(struct ntt_compile *c, nir_legacy_src *src)
1225 {
1226 if (src->is_ssa) {
1227 if (src->ssa->parent_instr->type == nir_instr_type_load_const)
1228 return ntt_get_load_const_src(c, nir_instr_as_load_const(src->ssa->parent_instr));
1229
1230 return c->ssa_temp[src->ssa->index];
1231 } else {
1232 struct ureg_dst reg_temp = c->reg_temp[src->reg.handle->index];
1233 reg_temp.Index += src->reg.base_offset;
1234
1235 if (src->reg.indirect) {
1236 struct ureg_src offset = ntt_get_src(c, nir_src_for_ssa(src->reg.indirect));
1237 return ureg_src_indirect(ureg_src(reg_temp),
1238 ntt_reladdr(c, offset, 0));
1239 } else {
1240 return ureg_src(reg_temp);
1241 }
1242 }
1243 }
1244
1245 static struct ureg_src
1246 ntt_get_src(struct ntt_compile *c, nir_src src)
1247 {
1248 nir_legacy_src chased = nir_legacy_chase_src(&src);
1249 return ntt_get_chased_src(c, &chased);
1250 }
1251
1252 static struct ureg_src
1253 ntt_get_alu_src(struct ntt_compile *c, nir_alu_instr *instr, int i)
1254 {
1255 /* We only support 32-bit float modifiers. The only other modifier type
1256 * officially supported by TGSI is 32-bit integer negates, but even those are
1257 * broken on virglrenderer, so skip lowering all integer and f64 float mods.
1258 *
1259 * The options->lower_fabs requests that we not have native source modifiers
1260 * for fabs, and instead emit MAX(a,-a) for nir_op_fabs.
1261 */
1262 nir_legacy_alu_src src =
1263 nir_legacy_chase_alu_src(&instr->src[i], !c->options->lower_fabs);
1264 struct ureg_src usrc = ntt_get_chased_src(c, &src.src);
1265
1266 /* Expand double/dvec2 src references to TGSI swizzles using a pair of 32-bit
1267 * channels. We skip this for undefs, as those don't get split to vec2s (but
1268 * the specific swizzles from an undef don't matter)
1269 */
1270 if (nir_src_bit_size(instr->src[i].src) == 64 &&
1271 !(src.src.is_ssa && src.src.ssa->parent_instr->type == nir_instr_type_undef)) {
1272 int chan1 = 1;
1273 if (nir_op_infos[instr->op].input_sizes[i] == 0) {
1274 chan1 = instr->def.num_components > 1 ? 1 : 0;
1275 }
1276 usrc = ureg_swizzle(usrc,
1277 src.swizzle[0] * 2,
1278 src.swizzle[0] * 2 + 1,
1279 src.swizzle[chan1] * 2,
1280 src.swizzle[chan1] * 2 + 1);
1281 } else {
1282 usrc = ureg_swizzle(usrc,
1283 src.swizzle[0],
1284 src.swizzle[1],
1285 src.swizzle[2],
1286 src.swizzle[3]);
1287 }
1288
1289 if (src.fabs)
1290 usrc = ureg_abs(usrc);
1291 if (src.fneg)
1292 usrc = ureg_negate(usrc);
1293
1294 return usrc;
1295 }
1296
1297 /* Reswizzles a source so that the unset channels in the write mask still refer
1298 * to one of the channels present in the write mask.
1299 */
1300 static struct ureg_src
1301 ntt_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
1302 {
1303 assert(write_mask);
1304 int first_chan = ffs(write_mask) - 1;
1305 return ureg_swizzle(src,
1306 (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : first_chan,
1307 (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : first_chan,
1308 (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : first_chan,
1309 (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : first_chan);
1310 }
1311
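/* Allocates the TGSI destination for an SSA def -- a fresh temp unless the
 * value can be stored directly to a TGSI output -- and records a source for it
 * swizzled so that unwritten channels still read written ones.
 */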
1312 static struct ureg_dst
1313 ntt_get_ssa_def_decl(struct ntt_compile *c, nir_def *ssa)
1314 {
1315 uint32_t writemask = BITSET_MASK(ssa->num_components);
1316 if (ssa->bit_size == 64)
1317 writemask = ntt_64bit_write_mask(writemask);
1318
1319 struct ureg_dst dst;
1320 if (!ntt_try_store_ssa_in_tgsi_output(c, &dst, ssa))
1321 dst = ntt_temp(c);
1322
1323 c->ssa_temp[ssa->index] = ntt_swizzle_for_write_mask(ureg_src(dst), writemask);
1324
1325 return ureg_writemask(dst, writemask);
1326 }
1327
1328 static struct ureg_dst
1329 ntt_get_chased_dest_decl(struct ntt_compile *c, nir_legacy_dest *dest)
1330 {
1331 if (dest->is_ssa)
1332 return ntt_get_ssa_def_decl(c, dest->ssa);
1333 else
1334 return c->reg_temp[dest->reg.handle->index];
1335 }
1336
1337 static struct ureg_dst
1338 ntt_get_chased_dest(struct ntt_compile *c, nir_legacy_dest *dest)
1339 {
1340 struct ureg_dst dst = ntt_get_chased_dest_decl(c, dest);
1341
1342 if (!dest->is_ssa) {
1343 dst.Index += dest->reg.base_offset;
1344
1345 if (dest->reg.indirect) {
1346 struct ureg_src offset = ntt_get_src(c, nir_src_for_ssa(dest->reg.indirect));
1347 dst = ureg_dst_indirect(dst, ntt_reladdr(c, offset, 0));
1348 }
1349 }
1350
1351 return dst;
1352 }
1353
1354 static struct ureg_dst
1355 ntt_get_dest(struct ntt_compile *c, nir_def *def)
1356 {
1357 nir_legacy_dest chased = nir_legacy_chase_dest(def);
1358 return ntt_get_chased_dest(c, &chased);
1359 }
1360
1361 static struct ureg_dst
1362 ntt_get_alu_dest(struct ntt_compile *c, nir_def *def)
1363 {
1364 nir_legacy_alu_dest chased = nir_legacy_chase_alu_dest(def);
1365 struct ureg_dst dst = ntt_get_chased_dest(c, &chased.dest);
1366
1367 if (chased.fsat)
1368 dst.Saturate = true;
1369
1370 /* Only registers get write masks */
1371 if (chased.dest.is_ssa)
1372 return dst;
1373
1374 int dst_64 = def->bit_size == 64;
1375 unsigned write_mask = chased.write_mask;
1376
1377 if (dst_64)
1378 return ureg_writemask(dst, ntt_64bit_write_mask(write_mask));
1379 else
1380 return ureg_writemask(dst, write_mask);
1381 }
1382
1383 /* For an SSA dest being populated by a constant src, replace the storage with
1384 * a copy of the ureg_src.
1385 */
1386 static void
1387 ntt_store_def(struct ntt_compile *c, nir_def *def, struct ureg_src src)
1388 {
1389 if (!src.Indirect && !src.DimIndirect) {
1390 switch (src.File) {
1391 case TGSI_FILE_IMMEDIATE:
1392 case TGSI_FILE_INPUT:
1393 case TGSI_FILE_CONSTANT:
1394 case TGSI_FILE_SYSTEM_VALUE:
1395 c->ssa_temp[def->index] = src;
1396 return;
1397 }
1398 }
1399
1400 ntt_MOV(c, ntt_get_ssa_def_decl(c, def), src);
1401 }
1402
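/* Stores src to a NIR def: SSA defs may just alias the TGSI source
 * (ntt_store_def), while register destinations get a MOV.
 */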
1403 static void
1404 ntt_store(struct ntt_compile *c, nir_def *def, struct ureg_src src)
1405 {
1406 nir_legacy_dest chased = nir_legacy_chase_dest(def);
1407
1408 if (chased.is_ssa)
1409 ntt_store_def(c, chased.ssa, src);
1410 else {
1411 struct ureg_dst dst = ntt_get_chased_dest(c, &chased);
1412 ntt_MOV(c, dst, src);
1413 }
1414 }
1415
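/* TGSI scalar opcodes (RCP, RSQ, EX2, LG2, SIN, COS, POW) replicate a single
 * source channel, so emit one instruction per enabled destination channel,
 * feeding each one the matching scalar source channel(s).
 */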
1416 static void
1417 ntt_emit_scalar(struct ntt_compile *c, unsigned tgsi_op,
1418 struct ureg_dst dst,
1419 struct ureg_src src0,
1420 struct ureg_src src1)
1421 {
1422 unsigned i;
1423
1424 /* POW is the only 2-operand scalar op. */
1425 if (tgsi_op != TGSI_OPCODE_POW)
1426 src1 = src0;
1427
1428 for (i = 0; i < 4; i++) {
1429 if (dst.WriteMask & (1 << i)) {
1430 ntt_insn(c, tgsi_op,
1431 ureg_writemask(dst, 1 << i),
1432 ureg_scalar(src0, i),
1433 ureg_scalar(src1, i),
1434 ureg_src_undef(), ureg_src_undef());
1435 }
1436 }
1437 }
1438
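/* Translates a NIR ALU instruction.  Most opcodes go through op_map below,
 * indexed by [nir_op][src is 64-bit]; anything absent from the table (or
 * mapped to 0) is handled by the special-case switch at the end.
 */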
1439 static void
1440 ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
1441 {
1442 struct ureg_src src[4];
1443 struct ureg_dst dst;
1444 unsigned i;
1445 int dst_64 = instr->def.bit_size == 64;
1446 int src_64 = nir_src_bit_size(instr->src[0].src) == 64;
1447 int num_srcs = nir_op_infos[instr->op].num_inputs;
1448
1449 /* Don't try to translate folded fsat since their source won't be valid */
1450 if (instr->op == nir_op_fsat && nir_legacy_fsat_folds(instr))
1451 return;
1452
1453 c->precise = instr->exact;
1454
1455 assert(num_srcs <= ARRAY_SIZE(src));
1456 for (i = 0; i < num_srcs; i++)
1457 src[i] = ntt_get_alu_src(c, instr, i);
1458 for (; i < ARRAY_SIZE(src); i++)
1459 src[i] = ureg_src_undef();
1460
1461 dst = ntt_get_alu_dest(c, &instr->def);
1462
1463 static enum tgsi_opcode op_map[][2] = {
1464 [nir_op_mov] = { TGSI_OPCODE_MOV, TGSI_OPCODE_MOV },
1465
1466 /* fabs/fneg 32-bit are special-cased below. */
1467 [nir_op_fabs] = { 0, TGSI_OPCODE_DABS },
1468 [nir_op_fneg] = { 0, TGSI_OPCODE_DNEG },
1469
1470 [nir_op_fdot2] = { TGSI_OPCODE_DP2 },
1471 [nir_op_fdot3] = { TGSI_OPCODE_DP3 },
1472 [nir_op_fdot4] = { TGSI_OPCODE_DP4 },
1473 [nir_op_fdot2_replicated] = { TGSI_OPCODE_DP2 },
1474 [nir_op_fdot3_replicated] = { TGSI_OPCODE_DP3 },
1475 [nir_op_fdot4_replicated] = { TGSI_OPCODE_DP4 },
1476 [nir_op_ffloor] = { TGSI_OPCODE_FLR, TGSI_OPCODE_DFLR },
1477 [nir_op_ffract] = { TGSI_OPCODE_FRC, TGSI_OPCODE_DFRAC },
1478 [nir_op_fceil] = { TGSI_OPCODE_CEIL, TGSI_OPCODE_DCEIL },
1479 [nir_op_fround_even] = { TGSI_OPCODE_ROUND, TGSI_OPCODE_DROUND },
1480 [nir_op_fdiv] = { TGSI_OPCODE_DIV, TGSI_OPCODE_DDIV },
1481 [nir_op_idiv] = { TGSI_OPCODE_IDIV, TGSI_OPCODE_I64DIV },
1482 [nir_op_udiv] = { TGSI_OPCODE_UDIV, TGSI_OPCODE_U64DIV },
1483
1484 [nir_op_frcp] = { 0, TGSI_OPCODE_DRCP },
1485 [nir_op_frsq] = { 0, TGSI_OPCODE_DRSQ },
1486 [nir_op_fsqrt] = { 0, TGSI_OPCODE_DSQRT },
1487
1488 /* The conversions will have one combination of src and dst bitsize. */
1489 [nir_op_f2f32] = { 0, TGSI_OPCODE_D2F },
1490 [nir_op_f2f64] = { TGSI_OPCODE_F2D },
1491 [nir_op_i2i64] = { TGSI_OPCODE_I2I64 },
1492
1493 [nir_op_f2i32] = { TGSI_OPCODE_F2I, TGSI_OPCODE_D2I },
1494 [nir_op_f2i64] = { TGSI_OPCODE_F2I64, TGSI_OPCODE_D2I64 },
1495 [nir_op_f2u32] = { TGSI_OPCODE_F2U, TGSI_OPCODE_D2U },
1496 [nir_op_f2u64] = { TGSI_OPCODE_F2U64, TGSI_OPCODE_D2U64 },
1497 [nir_op_i2f32] = { TGSI_OPCODE_I2F, TGSI_OPCODE_I642F },
1498 [nir_op_i2f64] = { TGSI_OPCODE_I2D, TGSI_OPCODE_I642D },
1499 [nir_op_u2f32] = { TGSI_OPCODE_U2F, TGSI_OPCODE_U642F },
1500 [nir_op_u2f64] = { TGSI_OPCODE_U2D, TGSI_OPCODE_U642D },
1501
1502 [nir_op_slt] = { TGSI_OPCODE_SLT },
1503 [nir_op_sge] = { TGSI_OPCODE_SGE },
1504 [nir_op_seq] = { TGSI_OPCODE_SEQ },
1505 [nir_op_sne] = { TGSI_OPCODE_SNE },
1506
1507 [nir_op_flt32] = { TGSI_OPCODE_FSLT, TGSI_OPCODE_DSLT },
1508 [nir_op_fge32] = { TGSI_OPCODE_FSGE, TGSI_OPCODE_DSGE },
1509 [nir_op_feq32] = { TGSI_OPCODE_FSEQ, TGSI_OPCODE_DSEQ },
1510 [nir_op_fneu32] = { TGSI_OPCODE_FSNE, TGSI_OPCODE_DSNE },
1511
1512 [nir_op_ilt32] = { TGSI_OPCODE_ISLT, TGSI_OPCODE_I64SLT },
1513 [nir_op_ige32] = { TGSI_OPCODE_ISGE, TGSI_OPCODE_I64SGE },
1514 [nir_op_ieq32] = { TGSI_OPCODE_USEQ, TGSI_OPCODE_U64SEQ },
1515 [nir_op_ine32] = { TGSI_OPCODE_USNE, TGSI_OPCODE_U64SNE },
1516
1517 [nir_op_ult32] = { TGSI_OPCODE_USLT, TGSI_OPCODE_U64SLT },
1518 [nir_op_uge32] = { TGSI_OPCODE_USGE, TGSI_OPCODE_U64SGE },
1519
1520 [nir_op_iabs] = { TGSI_OPCODE_IABS, TGSI_OPCODE_I64ABS },
1521 [nir_op_ineg] = { TGSI_OPCODE_INEG, TGSI_OPCODE_I64NEG },
1522 [nir_op_fsign] = { TGSI_OPCODE_SSG, TGSI_OPCODE_DSSG },
1523 [nir_op_isign] = { TGSI_OPCODE_ISSG, TGSI_OPCODE_I64SSG },
1524 [nir_op_ftrunc] = { TGSI_OPCODE_TRUNC, TGSI_OPCODE_DTRUNC },
1525 [nir_op_fddx] = { TGSI_OPCODE_DDX },
1526 [nir_op_fddy] = { TGSI_OPCODE_DDY },
1527 [nir_op_fddx_coarse] = { TGSI_OPCODE_DDX },
1528 [nir_op_fddy_coarse] = { TGSI_OPCODE_DDY },
1529 [nir_op_fddx_fine] = { TGSI_OPCODE_DDX_FINE },
1530 [nir_op_fddy_fine] = { TGSI_OPCODE_DDY_FINE },
1531 [nir_op_pack_half_2x16] = { TGSI_OPCODE_PK2H },
1532 [nir_op_unpack_half_2x16] = { TGSI_OPCODE_UP2H },
1533 [nir_op_ibitfield_extract] = { TGSI_OPCODE_IBFE },
1534 [nir_op_ubitfield_extract] = { TGSI_OPCODE_UBFE },
1535 [nir_op_bitfield_insert] = { TGSI_OPCODE_BFI },
1536 [nir_op_bitfield_reverse] = { TGSI_OPCODE_BREV },
1537 [nir_op_bit_count] = { TGSI_OPCODE_POPC },
1538 [nir_op_ifind_msb] = { TGSI_OPCODE_IMSB },
1539 [nir_op_ufind_msb] = { TGSI_OPCODE_UMSB },
1540 [nir_op_find_lsb] = { TGSI_OPCODE_LSB },
1541 [nir_op_fadd] = { TGSI_OPCODE_ADD, TGSI_OPCODE_DADD },
1542 [nir_op_iadd] = { TGSI_OPCODE_UADD, TGSI_OPCODE_U64ADD },
1543 [nir_op_fmul] = { TGSI_OPCODE_MUL, TGSI_OPCODE_DMUL },
1544 [nir_op_imul] = { TGSI_OPCODE_UMUL, TGSI_OPCODE_U64MUL },
1545 [nir_op_imod] = { TGSI_OPCODE_MOD, TGSI_OPCODE_I64MOD },
1546 [nir_op_umod] = { TGSI_OPCODE_UMOD, TGSI_OPCODE_U64MOD },
1547 [nir_op_imul_high] = { TGSI_OPCODE_IMUL_HI },
1548 [nir_op_umul_high] = { TGSI_OPCODE_UMUL_HI },
1549 [nir_op_ishl] = { TGSI_OPCODE_SHL, TGSI_OPCODE_U64SHL },
1550 [nir_op_ishr] = { TGSI_OPCODE_ISHR, TGSI_OPCODE_I64SHR },
1551 [nir_op_ushr] = { TGSI_OPCODE_USHR, TGSI_OPCODE_U64SHR },
1552
1553 /* These bitwise ops don't care about 32 vs 64 types, so they have the
1554 * same TGSI op.
1555 */
1556 [nir_op_inot] = { TGSI_OPCODE_NOT, TGSI_OPCODE_NOT },
1557 [nir_op_iand] = { TGSI_OPCODE_AND, TGSI_OPCODE_AND },
1558 [nir_op_ior] = { TGSI_OPCODE_OR, TGSI_OPCODE_OR },
1559 [nir_op_ixor] = { TGSI_OPCODE_XOR, TGSI_OPCODE_XOR },
1560
1561 [nir_op_fmin] = { TGSI_OPCODE_MIN, TGSI_OPCODE_DMIN },
1562 [nir_op_imin] = { TGSI_OPCODE_IMIN, TGSI_OPCODE_I64MIN },
1563 [nir_op_umin] = { TGSI_OPCODE_UMIN, TGSI_OPCODE_U64MIN },
1564 [nir_op_fmax] = { TGSI_OPCODE_MAX, TGSI_OPCODE_DMAX },
1565 [nir_op_imax] = { TGSI_OPCODE_IMAX, TGSI_OPCODE_I64MAX },
1566 [nir_op_umax] = { TGSI_OPCODE_UMAX, TGSI_OPCODE_U64MAX },
1567 [nir_op_ffma] = { TGSI_OPCODE_MAD, TGSI_OPCODE_DMAD },
1568 [nir_op_ldexp] = { TGSI_OPCODE_LDEXP, 0 },
1569 };
1570
1571 if (src_64 && !dst_64) {
1572 if (num_srcs == 2 || nir_op_infos[instr->op].output_type == nir_type_bool32) {
1573 /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz instead
1574 * of .xy.
1575 */
1576 assert(!(dst.WriteMask & TGSI_WRITEMASK_YW));
1577 } else {
1578 /* TGSI 64bit-to-32-bit conversions only generate results in the .xy
1579 * channels and will need to get fixed up.
1580 */
1581 assert(!(dst.WriteMask & TGSI_WRITEMASK_ZW));
1582 }
1583 }
1584
1585 bool table_op64 = src_64;
1586 if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op][table_op64] != 0) {
1587 /* The normal path for NIR to TGSI ALU op translation */
1588 ntt_insn(c, op_map[instr->op][table_op64],
1589 dst, src[0], src[1], src[2], src[3]);
1590 } else {
1591 /* Special cases for NIR to TGSI ALU op translation. */
1592
1593 /* TODO: Use something like the ntt_store() path for the MOV calls so we
1594 * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
1595 */
1596
1597 switch (instr->op) {
1598 case nir_op_u2u64:
1599 ntt_AND(c, dst, ureg_swizzle(src[0],
1600 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1601 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1602 ureg_imm4u(c->ureg, ~0, 0, ~0, 0));
1603 break;
1604
1605 case nir_op_i2i32:
1606 case nir_op_u2u32:
1607 assert(src_64);
1608 ntt_MOV(c, dst, ureg_swizzle(src[0],
1609 TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
1610 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X));
1611 break;
1612
1613 case nir_op_fabs:
1614 /* Try to eliminate */
1615 if (!c->options->lower_fabs && nir_legacy_float_mod_folds(instr))
1616 break;
1617
1618 if (c->options->lower_fabs)
1619 ntt_MAX(c, dst, src[0], ureg_negate(src[0]));
1620 else
1621 ntt_MOV(c, dst, ureg_abs(src[0]));
1622 break;
1623
1624 case nir_op_fsat:
1625 if (dst_64) {
1626 ntt_MIN(c, dst, src[0], ntt_64bit_1f(c));
1627 ntt_MAX(c, dst, ureg_src(dst), ureg_imm1u(c->ureg, 0));
1628 } else {
1629 ntt_MOV(c, ureg_saturate(dst), src[0]);
1630 }
1631 break;
1632
1633 case nir_op_fneg:
1634 /* Try to eliminate */
1635 if (nir_legacy_float_mod_folds(instr))
1636 break;
1637
1638 ntt_MOV(c, dst, ureg_negate(src[0]));
1639 break;
1640
1641 /* NOTE: TGSI 32-bit math ops have the old "one source channel
1642 * replicated to all dst channels" behavior, while 64 is normal mapping
1643 * of src channels to dst.
1644 */
1645 case nir_op_frcp:
1646 assert(!dst_64);
1647 ntt_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], ureg_src_undef());
1648 break;
1649
1650 case nir_op_frsq:
1651 assert(!dst_64);
1652 ntt_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], ureg_src_undef());
1653 break;
1654
1655 case nir_op_fsqrt:
1656 assert(!dst_64);
1657 ntt_emit_scalar(c, TGSI_OPCODE_SQRT, dst, src[0], ureg_src_undef());
1658 break;
1659
1660 case nir_op_fexp2:
1661 assert(!dst_64);
1662 ntt_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], ureg_src_undef());
1663 break;
1664
1665 case nir_op_flog2:
1666 assert(!dst_64);
1667 ntt_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], ureg_src_undef());
1668 break;
1669
1670 case nir_op_b2f32:
1671 ntt_AND(c, dst, src[0], ureg_imm1f(c->ureg, 1.0));
1672 break;
1673
1674 case nir_op_b2f64:
1675 ntt_AND(c, dst,
1676 ureg_swizzle(src[0],
1677 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1678 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1679 ntt_64bit_1f(c));
1680 break;
1681
1682 case nir_op_b2i32:
1683 ntt_AND(c, dst, src[0], ureg_imm1u(c->ureg, 1));
1684 break;
1685
1686 case nir_op_b2i64:
1687 ntt_AND(c, dst,
1688 ureg_swizzle(src[0],
1689 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1690 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1691 ureg_imm4u(c->ureg, 1, 0, 1, 0));
1692 break;
1693
1694 case nir_op_fsin:
1695 ntt_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], ureg_src_undef());
1696 break;
1697
1698 case nir_op_fcos:
1699 ntt_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], ureg_src_undef());
1700 break;
1701
1702 case nir_op_fsub:
1703 assert(!dst_64);
1704 ntt_ADD(c, dst, src[0], ureg_negate(src[1]));
1705 break;
1706
1707 case nir_op_isub:
1708 assert(!dst_64);
1709 ntt_UADD(c, dst, src[0], ureg_negate(src[1]));
1710 break;
1711
1712 case nir_op_fmod:
1713 unreachable("should be handled by .lower_fmod = true");
1714 break;
1715
1716 case nir_op_fpow:
1717 ntt_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
1718 break;
1719
1720 case nir_op_flrp:
1721 ntt_LRP(c, dst, src[2], src[1], src[0]);
1722 break;
1723
1724 case nir_op_pack_64_2x32_split:
1725 ntt_MOV(c, ureg_writemask(dst, TGSI_WRITEMASK_XZ),
1726 ureg_swizzle(src[0],
1727 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1728 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1729 ntt_MOV(c, ureg_writemask(dst, TGSI_WRITEMASK_YW),
1730 ureg_swizzle(src[1],
1731 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1732 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1733 break;
1734
1735 case nir_op_unpack_64_2x32_split_x:
1736 ntt_MOV(c, dst, ureg_swizzle(src[0],
1737 TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
1738 TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z));
1739 break;
1740
1741 case nir_op_unpack_64_2x32_split_y:
1742 ntt_MOV(c, dst, ureg_swizzle(src[0],
1743 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W,
1744 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W));
1745 break;
1746
1747 case nir_op_b32csel:
1748 if (nir_src_bit_size(instr->src[1].src) == 64) {
1749 ntt_UCMP(c, dst, ureg_swizzle(src[0],
1750 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1751 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1752 src[1], src[2]);
1753 } else {
1754 ntt_UCMP(c, dst, src[0], src[1], src[2]);
1755 }
1756 break;
1757
1758 case nir_op_fcsel:
1759 /* If CMP isn't supported, then the flags that enable NIR to generate
1760 * this opcode should also not be set.
1761 */
1762 assert(!c->options->lower_cmp);
1763
1764 /* Implement this as CMP(-abs(src0), src1, src2). */
1765 ntt_CMP(c, dst, ureg_negate(ureg_abs(src[0])), src[1], src[2]);
1766 break;
1767
1768 case nir_op_fcsel_gt:
1769 /* If CMP isn't supported, then the flags that enable NIR to generate
1770 * these opcodes should also not be set.
1771 */
1772 assert(!c->options->lower_cmp);
1773
1774 ntt_CMP(c, dst, ureg_negate(src[0]), src[1], src[2]);
1775 break;
1776
1777 case nir_op_fcsel_ge:
1778 /* If CMP isn't supported, then the flags that enable NIR to generate
1779 * these opcodes should also not be set.
1780 */
1781 assert(!c->options->lower_cmp);
1782
1783 /* Implement this as if !(src0 < 0.0) was identical to src0 >= 0.0. */
1784 ntt_CMP(c, dst, src[0], src[2], src[1]);
1785 break;
1786
1787 case nir_op_frexp_sig:
1788 case nir_op_frexp_exp:
1789 unreachable("covered by nir_lower_frexp()");
1790 break;
1791
1792 case nir_op_ldexp:
1793 assert(dst_64); /* 32bit handled in table. */
1794 ntt_DLDEXP(c, dst, src[0],
1795 ureg_swizzle(src[1],
1796 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1797 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1798 break;
1799
1800 case nir_op_vec4:
1801 case nir_op_vec3:
1802 case nir_op_vec2:
1803 unreachable("covered by nir_lower_vec_to_movs()");
1804
1805 default:
1806 fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
1807 unreachable("Unknown NIR opcode");
1808 }
1809 }
1810
1811 c->precise = false;
1812 }
1813
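/* Applies a NIR source as an offset to a ureg src: constant sources are
 * folded directly into the register Index, while variable sources go through
 * the address register selected by addr_reg.
 */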
1814 static struct ureg_src
1815 ntt_ureg_src_indirect(struct ntt_compile *c, struct ureg_src usrc,
1816 nir_src src, int addr_reg)
1817 {
1818 if (nir_src_is_const(src)) {
1819 usrc.Index += ntt_src_as_uint(c, src);
1820 return usrc;
1821 } else {
1822 return ureg_src_indirect(usrc, ntt_reladdr(c, ntt_get_src(c, src), addr_reg));
1823 }
1824 }
1825
1826 static struct ureg_dst
1827 ntt_ureg_dst_indirect(struct ntt_compile *c, struct ureg_dst dst,
1828 nir_src src)
1829 {
1830 if (nir_src_is_const(src)) {
1831 dst.Index += ntt_src_as_uint(c, src);
1832 return dst;
1833 } else {
1834 return ureg_dst_indirect(dst, ntt_reladdr(c, ntt_get_src(c, src), 0));
1835 }
1836 }
1837
1838 static struct ureg_src
1839 ntt_ureg_src_dimension_indirect(struct ntt_compile *c, struct ureg_src usrc,
1840 nir_src src)
1841 {
1842 if (nir_src_is_const(src)) {
1843 return ureg_src_dimension(usrc, ntt_src_as_uint(c, src));
1844 }
1845 else
1846 {
1847 return ureg_src_dimension_indirect(usrc,
1848 ntt_reladdr(c, ntt_get_src(c, src), 1),
1849 0);
1850 }
1851 }
1852
1853 static struct ureg_dst
1854 ntt_ureg_dst_dimension_indirect(struct ntt_compile *c, struct ureg_dst udst,
1855 nir_src src)
1856 {
1857 if (nir_src_is_const(src)) {
1858 return ureg_dst_dimension(udst, ntt_src_as_uint(c, src));
1859 } else {
1860 return ureg_dst_dimension_indirect(udst,
1861 ntt_reladdr(c, ntt_get_src(c, src), 1),
1862 0);
1863 }
1864 }
1865 /* Some load operations in NIR will have a fractional offset that we need to
1866 * swizzle down before storing to the result register.
1867 */
1868 static struct ureg_src
1869 ntt_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
1870 {
1871 return ureg_swizzle(src,
1872 frac,
1873 frac + MIN2(num_components - 1, 1),
1874 frac + MIN2(num_components - 1, 2),
1875 frac + MIN2(num_components - 1, 3));
1876 }
1877
1878
1879 static void
1880 ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
1881 {
1882 int bit_size = instr->def.bit_size;
1883 assert(bit_size == 32 || instr->num_components <= 2);
1884
1885 struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);
1886
1887 struct ureg_dst addr_temp = ureg_dst_undef();
1888
1889 if (nir_src_is_const(instr->src[0])) {
1890 src = ureg_src_dimension(src, ntt_src_as_uint(c, instr->src[0]));
1891 } else {
1892 /* virglrenderer requires that indirect UBO references have the UBO
1893       * array's base index in the Index field, not added to the indirect
1894 * address.
1895 *
1896 * Many nir intrinsics have a base address const value for the start of
1897 * their array indirection, but load_ubo doesn't. We fake it by
1898 * subtracting it off here.
1899 */
1900 addr_temp = ntt_temp(c);
1901 ntt_UADD(c, addr_temp, ntt_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo));
1902 src = ureg_src_dimension_indirect(src,
1903 ntt_reladdr(c, ureg_src(addr_temp), 1),
1904 c->first_ubo);
1905 }
1906
1907 if (instr->intrinsic == nir_intrinsic_load_ubo_vec4) {
1908 /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const
1909 * file.
1910 */
1911 src.Index = nir_intrinsic_base(instr);
1912
1913 if (nir_src_is_const(instr->src[1])) {
1914 src.Index += ntt_src_as_uint(c, instr->src[1]);
1915 } else {
1916 src = ureg_src_indirect(src, ntt_reladdr(c, ntt_get_src(c, instr->src[1]), 0));
1917 }
1918
1919 int start_component = nir_intrinsic_component(instr);
1920 if (bit_size == 64)
1921 start_component *= 2;
1922
1923 src = ntt_shift_by_frac(src, start_component,
1924 instr->num_components * bit_size / 32);
1925
1926 ntt_store(c, &instr->def, src);
1927 } else {
1928 /* PIPE_CAP_LOAD_CONSTBUF: Not necessarily vec4 aligned, emit a
1929 * TGSI_OPCODE_LOAD instruction from the const file.
1930 */
1931 struct ntt_insn *insn =
1932 ntt_insn(c, TGSI_OPCODE_LOAD,
1933 ntt_get_dest(c, &instr->def),
1934 src, ntt_get_src(c, instr->src[1]),
1935 ureg_src_undef(), ureg_src_undef());
1936 insn->is_mem = true;
1937 insn->tex_target = 0;
1938 insn->mem_qualifier = 0;
1939 insn->mem_format = 0; /* unused */
1940 }
1941 }
1942
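/* Translates NIR's gl_access_qualifier bits to the TGSI_MEMORY_* qualifier
 * bits used on memory instructions.
 */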
1943 static unsigned
1944 ntt_get_access_qualifier(nir_intrinsic_instr *instr)
1945 {
1946 enum gl_access_qualifier access = nir_intrinsic_access(instr);
1947 unsigned qualifier = 0;
1948
1949 if (access & ACCESS_COHERENT)
1950 qualifier |= TGSI_MEMORY_COHERENT;
1951 if (access & ACCESS_VOLATILE)
1952 qualifier |= TGSI_MEMORY_VOLATILE;
1953 if (access & ACCESS_RESTRICT)
1954 qualifier |= TGSI_MEMORY_RESTRICT;
1955
1956 return qualifier;
1957 }
1958
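/* Maps a nir_atomic_op to the corresponding TGSI ATOM* opcode. */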
1959 static unsigned
1960 ntt_translate_atomic_op(nir_atomic_op op)
1961 {
1962 switch (op) {
1963 case nir_atomic_op_iadd: return TGSI_OPCODE_ATOMUADD;
1964 case nir_atomic_op_fadd: return TGSI_OPCODE_ATOMFADD;
1965 case nir_atomic_op_imin: return TGSI_OPCODE_ATOMIMIN;
1966 case nir_atomic_op_imax: return TGSI_OPCODE_ATOMIMAX;
1967 case nir_atomic_op_umin: return TGSI_OPCODE_ATOMUMIN;
1968 case nir_atomic_op_umax: return TGSI_OPCODE_ATOMUMAX;
1969 case nir_atomic_op_iand: return TGSI_OPCODE_ATOMAND;
1970 case nir_atomic_op_ixor: return TGSI_OPCODE_ATOMXOR;
1971 case nir_atomic_op_ior: return TGSI_OPCODE_ATOMOR;
1972 case nir_atomic_op_xchg: return TGSI_OPCODE_ATOMXCHG;
1973 default: unreachable("invalid atomic");
1974 }
1975 }
1976
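/* Emits a TGSI memory instruction (LOAD/STORE/RESQ/ATOM*) for SSBO, shared
 * memory, and HW atomic counter intrinsics: picks the resource register for
 * the given variable mode, then gathers the offset/data/compare sources in
 * TGSI operand order.
 */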
1977 static void
1978 ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
1979 nir_variable_mode mode)
1980 {
1981 bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
1982 instr->intrinsic == nir_intrinsic_store_shared);
1983 bool is_load = (instr->intrinsic == nir_intrinsic_atomic_counter_read ||
1984 instr->intrinsic == nir_intrinsic_load_ssbo ||
1985 instr->intrinsic == nir_intrinsic_load_shared);
1986 unsigned opcode;
1987 struct ureg_src src[4];
1988 int num_src = 0;
1989 int next_src;
1990 struct ureg_dst addr_temp = ureg_dst_undef();
1991
1992 struct ureg_src memory;
1993 switch (mode) {
1994 case nir_var_mem_ssbo:
1995 memory = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_BUFFER,
1996 c->first_ssbo),
1997 instr->src[is_store ? 1 : 0], 2);
1998 next_src = 1;
1999 break;
2000 case nir_var_mem_shared:
2001 memory = ureg_src_register(TGSI_FILE_MEMORY, 0);
2002 next_src = 0;
2003 break;
2004 case nir_var_uniform: { /* HW atomic buffers */
2005 nir_src src = instr->src[0];
2006 uint32_t offset = (ntt_extract_const_src_offset(&src) +
2007 nir_intrinsic_range_base(instr)) / 4;
2008
2009 memory = ureg_src_register(TGSI_FILE_HW_ATOMIC, offset);
2010 /* ntt_ureg_src_indirect, except dividing by 4 */
2011 if (nir_src_is_const(src)) {
2012 memory.Index += nir_src_as_uint(src) / 4;
2013 } else {
2014 addr_temp = ntt_temp(c);
2015 ntt_USHR(c, addr_temp, ntt_get_src(c, src), ureg_imm1i(c->ureg, 2));
2016 memory = ureg_src_indirect(memory, ntt_reladdr(c, ureg_src(addr_temp), 2));
2017 }
2018 memory = ureg_src_dimension(memory, nir_intrinsic_base(instr));
2019 next_src = 0;
2020 break;
2021 }
2022
2023 default:
2024 unreachable("unknown memory type");
2025 }
2026
2027 if (is_store) {
2028 src[num_src++] = ntt_get_src(c, instr->src[next_src + 1]); /* offset */
2029 src[num_src++] = ntt_get_src(c, instr->src[0]); /* value */
2030 } else {
2031 src[num_src++] = memory;
2032 if (instr->intrinsic != nir_intrinsic_get_ssbo_size) {
2033 src[num_src++] = ntt_get_src(c, instr->src[next_src++]); /* offset */
2034 switch (instr->intrinsic) {
2035 case nir_intrinsic_atomic_counter_inc:
2036 src[num_src++] = ureg_imm1i(c->ureg, 1);
2037 break;
2038 case nir_intrinsic_atomic_counter_post_dec:
2039 src[num_src++] = ureg_imm1i(c->ureg, -1);
2040 break;
2041 default:
2042 if (!is_load)
2043 src[num_src++] = ntt_get_src(c, instr->src[next_src++]); /* value */
2044 break;
2045 }
2046 }
2047 }
2048
2049
2050 switch (instr->intrinsic) {
2051 case nir_intrinsic_ssbo_atomic:
2052 case nir_intrinsic_shared_atomic:
2053 opcode = ntt_translate_atomic_op(nir_intrinsic_atomic_op(instr));
2054 break;
2055 case nir_intrinsic_atomic_counter_add:
2056 case nir_intrinsic_atomic_counter_inc:
2057 case nir_intrinsic_atomic_counter_post_dec:
2058 opcode = TGSI_OPCODE_ATOMUADD;
2059 break;
2060 case nir_intrinsic_atomic_counter_min:
2061 opcode = TGSI_OPCODE_ATOMIMIN;
2062 break;
2063 case nir_intrinsic_atomic_counter_max:
2064 opcode = TGSI_OPCODE_ATOMIMAX;
2065 break;
2066 case nir_intrinsic_atomic_counter_and:
2067 opcode = TGSI_OPCODE_ATOMAND;
2068 break;
2069 case nir_intrinsic_atomic_counter_or:
2070 opcode = TGSI_OPCODE_ATOMOR;
2071 break;
2072 case nir_intrinsic_atomic_counter_xor:
2073 opcode = TGSI_OPCODE_ATOMXOR;
2074 break;
2075 case nir_intrinsic_atomic_counter_exchange:
2076 opcode = TGSI_OPCODE_ATOMXCHG;
2077 break;
2078 case nir_intrinsic_atomic_counter_comp_swap:
2079 case nir_intrinsic_ssbo_atomic_swap:
2080 case nir_intrinsic_shared_atomic_swap:
2081 opcode = TGSI_OPCODE_ATOMCAS;
2082 src[num_src++] = ntt_get_src(c, instr->src[next_src++]);
2083 break;
2084 case nir_intrinsic_atomic_counter_read:
2085 case nir_intrinsic_load_ssbo:
2086 case nir_intrinsic_load_shared:
2087 opcode = TGSI_OPCODE_LOAD;
2088 break;
2089 case nir_intrinsic_store_ssbo:
2090 case nir_intrinsic_store_shared:
2091 opcode = TGSI_OPCODE_STORE;
2092 break;
2093 case nir_intrinsic_get_ssbo_size:
2094 opcode = TGSI_OPCODE_RESQ;
2095 break;
2096 default:
2097 unreachable("unknown memory op");
2098 }
2099
2100 unsigned qualifier = 0;
2101 if (mode == nir_var_mem_ssbo &&
2102 instr->intrinsic != nir_intrinsic_get_ssbo_size) {
2103 qualifier = ntt_get_access_qualifier(instr);
2104 }
2105
2106 struct ureg_dst dst;
2107 if (is_store) {
2108 dst = ureg_dst(memory);
2109
2110 unsigned write_mask = nir_intrinsic_write_mask(instr);
2111 if (nir_src_bit_size(instr->src[0]) == 64)
2112 write_mask = ntt_64bit_write_mask(write_mask);
2113 dst = ureg_writemask(dst, write_mask);
2114 } else {
2115 dst = ntt_get_dest(c, &instr->def);
2116 }
2117
2118 struct ntt_insn *insn = ntt_insn(c, opcode, dst, src[0], src[1], src[2], src[3]);
2119 insn->tex_target = TGSI_TEXTURE_BUFFER;
2120 insn->mem_qualifier = qualifier;
2121 insn->mem_format = 0; /* unused */
2122 insn->is_mem = true;
2123 }
2124
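/* Emits TGSI image load/store/size/samples/atomic instructions.  For MSAA
 * images the sample index is packed into the .w channel of the coordinate,
 * and image_samples is read back from the .w channel of RESQ's result.
 */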
2125 static void
2126 ntt_emit_image_load_store(struct ntt_compile *c, nir_intrinsic_instr *instr)
2127 {
2128 unsigned op;
2129 struct ureg_src srcs[4];
2130 int num_src = 0;
2131 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
2132 bool is_array = nir_intrinsic_image_array(instr);
2133
2134 struct ureg_dst temp = ureg_dst_undef();
2135
2136 enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(dim, is_array, false);
2137
2138 struct ureg_src resource;
2139 switch (instr->intrinsic) {
2140 case nir_intrinsic_bindless_image_load:
2141 case nir_intrinsic_bindless_image_store:
2142 case nir_intrinsic_bindless_image_size:
2143 case nir_intrinsic_bindless_image_samples:
2144 case nir_intrinsic_bindless_image_atomic:
2145 case nir_intrinsic_bindless_image_atomic_swap:
2146 resource = ntt_get_src(c, instr->src[0]);
2147 break;
2148 default:
2149 resource = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_IMAGE, 0),
2150 instr->src[0], 2);
2151 resource.Index += nir_intrinsic_range_base(instr);
2152 }
2153
2154 struct ureg_dst dst;
2155 if (instr->intrinsic == nir_intrinsic_image_store ||
2156 instr->intrinsic == nir_intrinsic_bindless_image_store) {
2157 dst = ureg_dst(resource);
2158 } else {
2159 srcs[num_src++] = resource;
2160 dst = ntt_get_dest(c, &instr->def);
2161 }
2162 struct ureg_dst opcode_dst = dst;
2163
2164 if (instr->intrinsic != nir_intrinsic_image_size &&
2165 instr->intrinsic != nir_intrinsic_image_samples &&
2166 instr->intrinsic != nir_intrinsic_bindless_image_size &&
2167 instr->intrinsic != nir_intrinsic_bindless_image_samples) {
2168 struct ureg_src coord = ntt_get_src(c, instr->src[1]);
2169
2170 if (dim == GLSL_SAMPLER_DIM_MS) {
2171 temp = ntt_temp(c);
2172 ntt_MOV(c, temp, coord);
2173 ntt_MOV(c, ureg_writemask(temp, TGSI_WRITEMASK_W),
2174 ureg_scalar(ntt_get_src(c, instr->src[2]), TGSI_SWIZZLE_X));
2175 coord = ureg_src(temp);
2176 }
2177 srcs[num_src++] = coord;
2178
2179 if (instr->intrinsic != nir_intrinsic_image_load &&
2180 instr->intrinsic != nir_intrinsic_bindless_image_load) {
2181 srcs[num_src++] = ntt_get_src(c, instr->src[3]); /* data */
2182 if (instr->intrinsic == nir_intrinsic_image_atomic_swap ||
2183 instr->intrinsic == nir_intrinsic_bindless_image_atomic_swap)
2184 srcs[num_src++] = ntt_get_src(c, instr->src[4]); /* data2 */
2185 }
2186 }
2187
2188 switch (instr->intrinsic) {
2189 case nir_intrinsic_image_load:
2190 case nir_intrinsic_bindless_image_load:
2191 op = TGSI_OPCODE_LOAD;
2192 break;
2193 case nir_intrinsic_image_store:
2194 case nir_intrinsic_bindless_image_store:
2195 op = TGSI_OPCODE_STORE;
2196 break;
2197 case nir_intrinsic_image_size:
2198 case nir_intrinsic_bindless_image_size:
2199 op = TGSI_OPCODE_RESQ;
2200 break;
2201 case nir_intrinsic_image_samples:
2202 case nir_intrinsic_bindless_image_samples:
2203 op = TGSI_OPCODE_RESQ;
2204 opcode_dst = ureg_writemask(ntt_temp(c), TGSI_WRITEMASK_W);
2205 break;
2206 case nir_intrinsic_image_atomic:
2207 case nir_intrinsic_bindless_image_atomic:
2208 op = ntt_translate_atomic_op(nir_intrinsic_atomic_op(instr));
2209 break;
2210 case nir_intrinsic_image_atomic_swap:
2211 case nir_intrinsic_bindless_image_atomic_swap:
2212 op = TGSI_OPCODE_ATOMCAS;
2213 break;
2214 default:
2215 unreachable("bad op");
2216 }
2217
2218 struct ntt_insn *insn = ntt_insn(c, op, opcode_dst, srcs[0], srcs[1], srcs[2], srcs[3]);
2219 insn->tex_target = target;
2220 insn->mem_qualifier = ntt_get_access_qualifier(instr);
2221 insn->mem_format = nir_intrinsic_format(instr);
2222 insn->is_mem = true;
2223
2224 if (instr->intrinsic == nir_intrinsic_image_samples ||
2225 instr->intrinsic == nir_intrinsic_bindless_image_samples)
2226 ntt_MOV(c, dst, ureg_scalar(ureg_src(opcode_dst), 3));
2227 }
2228
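/* Declares the TGSI input for a NIR input-load intrinsic (per shader stage)
 * and emits the load, using INTERP_CENTROID/SAMPLE/OFFSET when the
 * barycentric source needs an explicit interpolation instruction.
 */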
2229 static void
2230 ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr)
2231 {
2232 uint32_t frac = nir_intrinsic_component(instr);
2233 uint32_t num_components = instr->num_components;
2234 unsigned base = nir_intrinsic_base(instr);
2235 struct ureg_src input;
2236 nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
2237 bool is_64 = instr->def.bit_size == 64;
2238
2239 if (c->s->info.stage == MESA_SHADER_VERTEX) {
2240 input = ureg_DECL_vs_input(c->ureg, base);
2241 for (int i = 1; i < semantics.num_slots; i++)
2242 ureg_DECL_vs_input(c->ureg, base + i);
2243 } else if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
2244 unsigned semantic_name, semantic_index;
2245 ntt_get_gl_varying_semantic(c, semantics.location,
2246 &semantic_name, &semantic_index);
2247
2248 /* XXX: ArrayID is used in r600 gs inputs */
2249 uint32_t array_id = 0;
2250
2251 input = ureg_DECL_input_layout(c->ureg,
2252 semantic_name,
2253 semantic_index,
2254 base,
2255 ntt_tgsi_usage_mask(frac,
2256 instr->num_components,
2257 is_64),
2258 array_id,
2259 semantics.num_slots);
2260 } else {
2261 input = c->input_index_map[base];
2262 }
2263
2264 if (is_64)
2265 num_components *= 2;
2266
2267 input = ntt_shift_by_frac(input, frac, num_components);
2268
2269 switch (instr->intrinsic) {
2270 case nir_intrinsic_load_input:
2271 input = ntt_ureg_src_indirect(c, input, instr->src[0], 0);
2272 ntt_store(c, &instr->def, input);
2273 break;
2274
2275 case nir_intrinsic_load_per_vertex_input:
2276 input = ntt_ureg_src_indirect(c, input, instr->src[1], 0);
2277 input = ntt_ureg_src_dimension_indirect(c, input, instr->src[0]);
2278 ntt_store(c, &instr->def, input);
2279 break;
2280
2281 case nir_intrinsic_load_interpolated_input: {
2282 input = ntt_ureg_src_indirect(c, input, instr->src[1], 0);
2283
2284 nir_intrinsic_instr *bary_instr =
2285 nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
2286
2287 switch (bary_instr->intrinsic) {
2288 case nir_intrinsic_load_barycentric_pixel:
2289 case nir_intrinsic_load_barycentric_sample:
2290 /* For these, we know that the barycentric load matches the
2291 * interpolation on the input declaration, so we can use it directly.
2292 */
2293 ntt_store(c, &instr->def, input);
2294 break;
2295
2296 case nir_intrinsic_load_barycentric_centroid:
2297 /* If the input was declared centroid, then there's no need to
2298 * emit the extra TGSI interp instruction, we can just read the
2299 * input.
2300 */
2301 if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) {
2302 ntt_store(c, &instr->def, input);
2303 } else {
2304 ntt_INTERP_CENTROID(c, ntt_get_dest(c, &instr->def), input);
2305 }
2306 break;
2307
2308 case nir_intrinsic_load_barycentric_at_sample:
2309 /* We stored the sample in the fake "bary" dest. */
2310 ntt_INTERP_SAMPLE(c, ntt_get_dest(c, &instr->def), input,
2311 ntt_get_src(c, instr->src[0]));
2312 break;
2313
2314 case nir_intrinsic_load_barycentric_at_offset:
2315 /* We stored the offset in the fake "bary" dest. */
2316 ntt_INTERP_OFFSET(c, ntt_get_dest(c, &instr->def), input,
2317 ntt_get_src(c, instr->src[0]));
2318 break;
2319
2320 default:
2321 unreachable("bad barycentric interp intrinsic\n");
2322 }
2323 break;
2324 }
2325
2326 default:
2327 unreachable("bad load input intrinsic\n");
2328 }
2329 }
2330
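/* Stores a NIR value to a TGSI output, swizzling the source so that its
 * component 0 lands in the output's starting component (frac).
 */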
2331 static void
2332 ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
2333 {
2334 struct ureg_src src = ntt_get_src(c, instr->src[0]);
2335
2336 if (src.File == TGSI_FILE_OUTPUT) {
2337 /* If our src is the output file, that's an indication that we were able
2338 * to emit the output stores in the generating instructions and we have
2339 * nothing to do here.
2340 */
2341 return;
2342 }
2343
2344 uint32_t frac;
2345 struct ureg_dst out = ntt_output_decl(c, instr, &frac);
2346
2347 if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
2348 out = ntt_ureg_dst_indirect(c, out, instr->src[2]);
2349 out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[1]);
2350 } else {
2351 out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
2352 }
2353
2354 uint8_t swizzle[4] = { 0, 0, 0, 0 };
2355    for (int i = frac; i < 4; i++) {
2356 if (out.WriteMask & (1 << i))
2357 swizzle[i] = i - frac;
2358 }
2359
2360 src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2361
2362 ntt_MOV(c, out, src);
2363 }
2364
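/* Reads back a TGSI output register (framebuffer fetch, or stages where
 * outputs are re-read), as opposed to the ntt_try_store_in_tgsi_output()
 * fast path.
 */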
2365 static void
2366 ntt_emit_load_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
2367 {
2368 nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
2369
2370 /* ntt_try_store_in_tgsi_output() optimization is not valid if normal
2371 * load_output is present.
2372 */
2373 assert(c->s->info.stage != MESA_SHADER_VERTEX &&
2374 (c->s->info.stage != MESA_SHADER_FRAGMENT || semantics.fb_fetch_output));
2375
2376 uint32_t frac;
2377 struct ureg_dst out = ntt_output_decl(c, instr, &frac);
2378
2379 if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) {
2380 out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
2381 out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[0]);
2382 } else {
2383 out = ntt_ureg_dst_indirect(c, out, instr->src[0]);
2384 }
2385
2386 struct ureg_dst dst = ntt_get_dest(c, &instr->def);
2387 struct ureg_src out_src = ureg_src(out);
2388
2389    /* Don't swizzle in unavailable channels of the output in the writemasked-out
2390 * components. Avoids compile failures in virglrenderer with
2391 * TESS_LEVEL_INNER.
2392 */
2393 int fill_channel = ffs(dst.WriteMask) - 1;
2394 uint8_t swizzles[4] = { 0, 1, 2, 3 };
2395 for (int i = 0; i < 4; i++)
2396 if (!(dst.WriteMask & (1 << i)))
2397 swizzles[i] = fill_channel;
2398 out_src = ureg_swizzle(out_src, swizzles[0], swizzles[1], swizzles[2], swizzles[3]);
2399
2400 if (semantics.fb_fetch_output)
2401 ntt_FBFETCH(c, dst, out_src);
2402 else
2403 ntt_MOV(c, dst, out_src);
2404 }
2405
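/* Declares and loads a TGSI system value, converting the result to float on
 * the non-native-integers path where needed.
 */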
2406 static void
2407 ntt_emit_load_sysval(struct ntt_compile *c, nir_intrinsic_instr *instr)
2408 {
2409 gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
2410 enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
2411 struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0);
2412
2413 /* virglrenderer doesn't like references to channels of the sysval that
2414 * aren't defined, even if they aren't really read. (GLSL compile fails on
2415 * gl_NumWorkGroups.w, for example).
2416 */
2417 uint32_t write_mask = BITSET_MASK(instr->def.num_components);
2418 sv = ntt_swizzle_for_write_mask(sv, write_mask);
2419
2420 /* TGSI and NIR define these intrinsics as always loading ints, but they can
2421 * still appear on hardware with non-native-integers fragment shaders using
2422 * the draw path (i915g). In that case, having called nir_lower_int_to_float
2423 * means that we actually want floats instead.
2424 */
2425 if (!c->native_integers) {
2426 switch (instr->intrinsic) {
2427 case nir_intrinsic_load_vertex_id:
2428 case nir_intrinsic_load_instance_id:
2429 ntt_U2F(c, ntt_get_dest(c, &instr->def), sv);
2430 return;
2431
2432 default:
2433 break;
2434 }
2435 }
2436
2437 ntt_store(c, &instr->def, sv);
2438 }
2439
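/* Translates nir_intrinsic_barrier into a TGSI MEMBAR for the memory scope
 * and/or a BARRIER for the execution scope.
 */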
2440 static void
2441 ntt_emit_barrier(struct ntt_compile *c, nir_intrinsic_instr *intr)
2442 {
2443 bool compute = gl_shader_stage_is_compute(c->s->info.stage);
2444
2445 if (nir_intrinsic_memory_scope(intr) != SCOPE_NONE) {
2446 nir_variable_mode modes = nir_intrinsic_memory_modes(intr);
2447 unsigned membar = 0;
2448
2449 if (modes & nir_var_image)
2450 membar |= TGSI_MEMBAR_SHADER_IMAGE;
2451
2452 if (modes & nir_var_mem_shared)
2453 membar |= TGSI_MEMBAR_SHARED;
2454
2455       /* Atomic counters are lowered to SSBOs; there's no NIR mode corresponding
2456        * exactly to atomics, so take the closest match.
2457 */
2458 if (modes & nir_var_mem_ssbo)
2459 membar |= TGSI_MEMBAR_SHADER_BUFFER | TGSI_MEMBAR_ATOMIC_BUFFER;
2460
2461 if (modes & nir_var_mem_global)
2462 membar |= TGSI_MEMBAR_SHADER_BUFFER;
2463
2464 /* Hack for virglrenderer: the GLSL specific memory barrier functions,
2465 * memoryBarrier{Buffer,Image,Shared,AtomicCounter}(), are only
2466 * available in compute shaders prior to GLSL 4.30. In other stages,
2467 * it needs to use the full memoryBarrier(). It may be possible to
2468 * make them available via #extension directives in older versions,
2469 * but it's confusingly underspecified, and Mesa/virglrenderer don't
2470 * currently agree on how to do it. So, just promote partial memory
2471 * barriers back to full ones outside of compute shaders when asked.
2472 */
2473 if (membar && !compute &&
2474 c->options->non_compute_membar_needs_all_modes) {
2475 membar |= TGSI_MEMBAR_SHADER_BUFFER |
2476 TGSI_MEMBAR_ATOMIC_BUFFER |
2477 TGSI_MEMBAR_SHADER_IMAGE |
2478 TGSI_MEMBAR_SHARED;
2479 }
2480
2481 /* If we only need workgroup scope (not device-scope), we might be able to
2482 * optimize a bit.
2483 */
2484 if (membar && compute &&
2485 nir_intrinsic_memory_scope(intr) == SCOPE_WORKGROUP) {
2486
2487 membar |= TGSI_MEMBAR_THREAD_GROUP;
2488 }
2489
2490 /* Only emit a memory barrier if there are any relevant modes */
2491 if (membar)
2492 ntt_MEMBAR(c, ureg_imm1u(c->ureg, membar));
2493 }
2494
2495 if (nir_intrinsic_execution_scope(intr) != SCOPE_NONE) {
2496 assert(compute || c->s->info.stage == MESA_SHADER_TESS_CTRL);
2497 ntt_BARRIER(c);
2498 }
2499 }
2500
2501 static void
2502 ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
2503 {
2504 switch (instr->intrinsic) {
2505 case nir_intrinsic_load_ubo:
2506 case nir_intrinsic_load_ubo_vec4:
2507 ntt_emit_load_ubo(c, instr);
2508 break;
2509
2510 /* Vertex */
2511 case nir_intrinsic_load_vertex_id:
2512 case nir_intrinsic_load_vertex_id_zero_base:
2513 case nir_intrinsic_load_base_vertex:
2514 case nir_intrinsic_load_base_instance:
2515 case nir_intrinsic_load_instance_id:
2516 case nir_intrinsic_load_draw_id:
2517 case nir_intrinsic_load_invocation_id:
2518 case nir_intrinsic_load_frag_coord:
2519 case nir_intrinsic_load_point_coord:
2520 case nir_intrinsic_load_front_face:
2521 case nir_intrinsic_load_sample_id:
2522 case nir_intrinsic_load_sample_pos:
2523 case nir_intrinsic_load_sample_mask_in:
2524 case nir_intrinsic_load_helper_invocation:
2525 case nir_intrinsic_load_tess_coord:
2526 case nir_intrinsic_load_patch_vertices_in:
2527 case nir_intrinsic_load_primitive_id:
2528 case nir_intrinsic_load_tess_level_outer:
2529 case nir_intrinsic_load_tess_level_inner:
2530 case nir_intrinsic_load_local_invocation_id:
2531 case nir_intrinsic_load_workgroup_id:
2532 case nir_intrinsic_load_num_workgroups:
2533 case nir_intrinsic_load_workgroup_size:
2534 case nir_intrinsic_load_subgroup_size:
2535 case nir_intrinsic_load_subgroup_invocation:
2536 case nir_intrinsic_load_subgroup_eq_mask:
2537 case nir_intrinsic_load_subgroup_ge_mask:
2538 case nir_intrinsic_load_subgroup_gt_mask:
2539 case nir_intrinsic_load_subgroup_lt_mask:
2540 case nir_intrinsic_load_subgroup_le_mask:
2541 ntt_emit_load_sysval(c, instr);
2542 break;
2543
2544 case nir_intrinsic_load_input:
2545 case nir_intrinsic_load_per_vertex_input:
2546 case nir_intrinsic_load_interpolated_input:
2547 ntt_emit_load_input(c, instr);
2548 break;
2549
2550 case nir_intrinsic_store_output:
2551 case nir_intrinsic_store_per_vertex_output:
2552 ntt_emit_store_output(c, instr);
2553 break;
2554
2555 case nir_intrinsic_load_output:
2556 case nir_intrinsic_load_per_vertex_output:
2557 ntt_emit_load_output(c, instr);
2558 break;
2559
2560 case nir_intrinsic_demote:
2561 ntt_DEMOTE(c);
2562 break;
2563
2564 case nir_intrinsic_discard:
2565 ntt_KILL(c);
2566 break;
2567
2568 case nir_intrinsic_discard_if: {
2569 struct ureg_src cond = ureg_scalar(ntt_get_src(c, instr->src[0]), 0);
2570
2571 if (c->native_integers) {
2572 struct ureg_dst temp = ureg_writemask(ntt_temp(c), 1);
2573 ntt_AND(c, temp, cond, ureg_imm1f(c->ureg, 1.0));
2574 ntt_KILL_IF(c, ureg_scalar(ureg_negate(ureg_src(temp)), 0));
2575 } else {
2576 /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
2577 ntt_KILL_IF(c, ureg_negate(cond));
2578 }
2579 break;
2580 }
2581
2582 case nir_intrinsic_is_helper_invocation:
2583 ntt_READ_HELPER(c, ntt_get_dest(c, &instr->def));
2584 break;
2585
2586 case nir_intrinsic_vote_all:
2587 ntt_VOTE_ALL(c, ntt_get_dest(c, &instr->def), ntt_get_src(c,instr->src[0]));
2588 return;
2589 case nir_intrinsic_vote_any:
2590 ntt_VOTE_ANY(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2591 return;
2592 case nir_intrinsic_vote_ieq:
2593 ntt_VOTE_EQ(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2594 return;
2595 case nir_intrinsic_ballot:
2596 ntt_BALLOT(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2597 return;
2598 case nir_intrinsic_read_first_invocation:
2599 ntt_READ_FIRST(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2600 return;
2601 case nir_intrinsic_read_invocation:
2602 ntt_READ_INVOC(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]), ntt_get_src(c, instr->src[1]));
2603 return;
2604
2605 case nir_intrinsic_load_ssbo:
2606 case nir_intrinsic_store_ssbo:
2607 case nir_intrinsic_ssbo_atomic:
2608 case nir_intrinsic_ssbo_atomic_swap:
2609 case nir_intrinsic_get_ssbo_size:
2610 ntt_emit_mem(c, instr, nir_var_mem_ssbo);
2611 break;
2612
2613 case nir_intrinsic_load_shared:
2614 case nir_intrinsic_store_shared:
2615 case nir_intrinsic_shared_atomic:
2616 case nir_intrinsic_shared_atomic_swap:
2617 ntt_emit_mem(c, instr, nir_var_mem_shared);
2618 break;
2619
2620 case nir_intrinsic_atomic_counter_read:
2621 case nir_intrinsic_atomic_counter_add:
2622 case nir_intrinsic_atomic_counter_inc:
2623 case nir_intrinsic_atomic_counter_post_dec:
2624 case nir_intrinsic_atomic_counter_min:
2625 case nir_intrinsic_atomic_counter_max:
2626 case nir_intrinsic_atomic_counter_and:
2627 case nir_intrinsic_atomic_counter_or:
2628 case nir_intrinsic_atomic_counter_xor:
2629 case nir_intrinsic_atomic_counter_exchange:
2630 case nir_intrinsic_atomic_counter_comp_swap:
2631 ntt_emit_mem(c, instr, nir_var_uniform);
2632 break;
2633 case nir_intrinsic_atomic_counter_pre_dec:
2634 unreachable("Should be lowered by ntt_lower_atomic_pre_dec()");
2635 break;
2636
2637 case nir_intrinsic_image_load:
2638 case nir_intrinsic_image_store:
2639 case nir_intrinsic_image_size:
2640 case nir_intrinsic_image_samples:
2641 case nir_intrinsic_image_atomic:
2642 case nir_intrinsic_image_atomic_swap:
2643 case nir_intrinsic_bindless_image_load:
2644 case nir_intrinsic_bindless_image_store:
2645 case nir_intrinsic_bindless_image_size:
2646 case nir_intrinsic_bindless_image_samples:
2647 case nir_intrinsic_bindless_image_atomic:
2648 case nir_intrinsic_bindless_image_atomic_swap:
2649 ntt_emit_image_load_store(c, instr);
2650 break;
2651
2652 case nir_intrinsic_barrier:
2653 ntt_emit_barrier(c, instr);
2654 break;
2655
2656 case nir_intrinsic_end_primitive:
2657 ntt_ENDPRIM(c, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
2658 break;
2659
2660 case nir_intrinsic_emit_vertex:
2661 ntt_EMIT(c, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
2662 break;
2663
2664 /* In TGSI we don't actually generate the barycentric coords, and emit
2665 * interp intrinsics later. However, we do need to store the
2666 * load_barycentric_at_* argument so that we can use it at that point.
2667 */
2668 case nir_intrinsic_load_barycentric_pixel:
2669 case nir_intrinsic_load_barycentric_centroid:
2670 case nir_intrinsic_load_barycentric_sample:
2671 break;
2672 case nir_intrinsic_load_barycentric_at_sample:
2673 case nir_intrinsic_load_barycentric_at_offset:
2674 ntt_store(c, &instr->def, ntt_get_src(c, instr->src[0]));
2675 break;
2676
2677 case nir_intrinsic_shader_clock:
2678 ntt_CLOCK(c, ntt_get_dest(c, &instr->def));
2679 break;
2680
2681 case nir_intrinsic_decl_reg:
2682 case nir_intrinsic_load_reg:
2683 case nir_intrinsic_load_reg_indirect:
2684 case nir_intrinsic_store_reg:
2685 case nir_intrinsic_store_reg_indirect:
2686 /* fully consumed */
2687 break;
2688
2689 default:
2690 fprintf(stderr, "Unknown intrinsic: ");
2691 nir_print_instr(&instr->instr, stderr);
2692 fprintf(stderr, "\n");
2693 break;
2694 }
2695 }
2696
2697 struct ntt_tex_operand_state {
2698 struct ureg_src srcs[4];
2699 unsigned i;
2700 };
2701
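/* Appends the given texture source, if present on the instruction, to the
 * TGSI operand list being accumulated in ntt_tex_operand_state.
 */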
2702 static void
2703 ntt_push_tex_arg(struct ntt_compile *c,
2704 nir_tex_instr *instr,
2705 nir_tex_src_type tex_src_type,
2706 struct ntt_tex_operand_state *s)
2707 {
2708 int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
2709 if (tex_src < 0)
2710 return;
2711
2712 nir_src *src = &instr->src[tex_src].src;
2713
2714 /* virglrenderer workaround that's hard to do in tgsi_translate: Make sure
2715 * that TG4's immediate offset arg is float-typed.
2716 */
2717 if (instr->op == nir_texop_tg4 && tex_src_type == nir_tex_src_backend2 &&
2718 nir_src_is_const(*src)) {
2719 nir_const_value *consts = nir_src_as_const_value(*src);
2720 s->srcs[s->i++] = ureg_imm4f(c->ureg,
2721 consts[0].f32,
2722 consts[1].f32,
2723 consts[2].f32,
2724 consts[3].f32);
2725 return;
2726 }
2727
2728 s->srcs[s->i++] = ntt_get_src(c, *src);
2729 }
2730
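/* Translates a nir_tex_instr to the matching TGSI texture opcode, gathering
 * the coordinate/LOD/derivative/sampler operands and any texel offsets.
 */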
2731 static void
2732 ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
2733 {
2734 struct ureg_dst dst = ntt_get_dest(c, &instr->def);
2735 enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array, instr->is_shadow);
2736 unsigned tex_opcode;
2737
2738 int tex_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_texture_handle);
2739 int sampler_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_handle);
2740
2741 struct ureg_src sampler;
2742 if (tex_handle_src >= 0 && sampler_handle_src >= 0) {
2743 /* It seems we can't get separate tex/sampler on GL, just use one of the handles */
2744 sampler = ntt_get_src(c, instr->src[tex_handle_src].src);
2745 assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
2746 } else {
2747 assert(tex_handle_src == -1 && sampler_handle_src == -1);
2748 sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
2749 int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
2750 if (sampler_src >= 0) {
2751 struct ureg_src reladdr = ntt_get_src(c, instr->src[sampler_src].src);
2752 sampler = ureg_src_indirect(sampler, ntt_reladdr(c, reladdr, 2));
2753 }
2754 }
2755
2756 switch (instr->op) {
2757 case nir_texop_tex:
2758 if (nir_tex_instr_src_size(instr, nir_tex_instr_src_index(instr, nir_tex_src_backend1)) >
2759 MAX2(instr->coord_components, 2) + instr->is_shadow)
2760 tex_opcode = TGSI_OPCODE_TXP;
2761 else
2762 tex_opcode = TGSI_OPCODE_TEX;
2763 break;
2764 case nir_texop_txf:
2765 case nir_texop_txf_ms:
2766 tex_opcode = TGSI_OPCODE_TXF;
2767
2768 if (c->has_txf_lz) {
2769 int lod_src = nir_tex_instr_src_index(instr, nir_tex_src_lod);
2770 if (lod_src >= 0 &&
2771 nir_src_is_const(instr->src[lod_src].src) &&
2772 ntt_src_as_uint(c, instr->src[lod_src].src) == 0) {
2773 tex_opcode = TGSI_OPCODE_TXF_LZ;
2774 }
2775 }
2776 break;
2777 case nir_texop_txl:
2778 tex_opcode = TGSI_OPCODE_TXL;
2779 break;
2780 case nir_texop_txb:
2781 tex_opcode = TGSI_OPCODE_TXB;
2782 break;
2783 case nir_texop_txd:
2784 tex_opcode = TGSI_OPCODE_TXD;
2785 break;
2786 case nir_texop_txs:
2787 tex_opcode = TGSI_OPCODE_TXQ;
2788 break;
2789 case nir_texop_tg4:
2790 tex_opcode = TGSI_OPCODE_TG4;
2791 break;
2792 case nir_texop_query_levels:
2793 tex_opcode = TGSI_OPCODE_TXQ;
2794 break;
2795 case nir_texop_lod:
2796 tex_opcode = TGSI_OPCODE_LODQ;
2797 break;
2798 case nir_texop_texture_samples:
2799 tex_opcode = TGSI_OPCODE_TXQS;
2800 break;
2801 default:
2802 unreachable("unsupported tex op");
2803 }
2804
2805 struct ntt_tex_operand_state s = { .i = 0 };
2806 ntt_push_tex_arg(c, instr, nir_tex_src_backend1, &s);
2807 ntt_push_tex_arg(c, instr, nir_tex_src_backend2, &s);
2808
2809 /* non-coord arg for TXQ */
2810 if (tex_opcode == TGSI_OPCODE_TXQ) {
2811 ntt_push_tex_arg(c, instr, nir_tex_src_lod, &s);
2812 /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's
2813 * scalar
2814 */
2815 s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0);
2816 }
2817
2818 if (s.i > 1) {
2819 if (tex_opcode == TGSI_OPCODE_TEX)
2820 tex_opcode = TGSI_OPCODE_TEX2;
2821 if (tex_opcode == TGSI_OPCODE_TXB)
2822 tex_opcode = TGSI_OPCODE_TXB2;
2823 if (tex_opcode == TGSI_OPCODE_TXL)
2824 tex_opcode = TGSI_OPCODE_TXL2;
2825 }
2826
2827 if (instr->op == nir_texop_txd) {
2828 /* Derivs appear in their own src args */
2829 int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
2830 int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
2831 s.srcs[s.i++] = ntt_get_src(c, instr->src[ddx].src);
2832 s.srcs[s.i++] = ntt_get_src(c, instr->src[ddy].src);
2833 }
2834
2835 if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
2836 if (c->screen->get_param(c->screen,
2837 PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) {
2838 sampler = ureg_scalar(sampler, instr->component);
2839 s.srcs[s.i++] = ureg_src_undef();
2840 } else {
2841 s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
2842 }
2843 }
2844
2845 s.srcs[s.i++] = sampler;
2846
2847 enum tgsi_return_type tex_type;
2848 switch (instr->dest_type) {
2849 case nir_type_float32:
2850 tex_type = TGSI_RETURN_TYPE_FLOAT;
2851 break;
2852 case nir_type_int32:
2853 tex_type = TGSI_RETURN_TYPE_SINT;
2854 break;
2855 case nir_type_uint32:
2856 tex_type = TGSI_RETURN_TYPE_UINT;
2857 break;
2858 default:
2859 unreachable("unknown texture type");
2860 }
2861
2862 struct ureg_dst tex_dst;
2863 if (instr->op == nir_texop_query_levels)
2864 tex_dst = ureg_writemask(ntt_temp(c), TGSI_WRITEMASK_W);
2865 else
2866 tex_dst = dst;
2867
2868 while (s.i < 4)
2869 s.srcs[s.i++] = ureg_src_undef();
2870
2871 struct ntt_insn *insn = ntt_insn(c, tex_opcode, tex_dst, s.srcs[0], s.srcs[1], s.srcs[2], s.srcs[3]);
2872 insn->tex_target = target;
2873 insn->tex_return_type = tex_type;
2874 insn->is_tex = true;
2875
2876 int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
2877 if (tex_offset_src >= 0) {
2878 struct ureg_src offset = ntt_get_src(c, instr->src[tex_offset_src].src);
2879
2880 insn->tex_offset[0].File = offset.File;
2881 insn->tex_offset[0].Index = offset.Index;
2882 insn->tex_offset[0].SwizzleX = offset.SwizzleX;
2883 insn->tex_offset[0].SwizzleY = offset.SwizzleY;
2884 insn->tex_offset[0].SwizzleZ = offset.SwizzleZ;
2885 insn->tex_offset[0].Padding = 0;
2886 }
2887
2888 if (nir_tex_instr_has_explicit_tg4_offsets(instr)) {
2889 for (uint8_t i = 0; i < 4; ++i) {
2890 struct ureg_src imm = ureg_imm2i(c->ureg, instr->tg4_offsets[i][0], instr->tg4_offsets[i][1]);
2891 insn->tex_offset[i].File = imm.File;
2892 insn->tex_offset[i].Index = imm.Index;
2893 insn->tex_offset[i].SwizzleX = imm.SwizzleX;
2894 insn->tex_offset[i].SwizzleY = imm.SwizzleY;
2895 insn->tex_offset[i].SwizzleZ = imm.SwizzleZ;
2896 }
2897 }
2898
2899 if (instr->op == nir_texop_query_levels)
2900 ntt_MOV(c, dst, ureg_scalar(ureg_src(tex_dst), 3));
2901 }
2902
2903 static void
2904 ntt_emit_jump(struct ntt_compile *c, nir_jump_instr *jump)
2905 {
2906 switch (jump->type) {
2907 case nir_jump_break:
2908 ntt_BRK(c);
2909 break;
2910
2911 case nir_jump_continue:
2912 ntt_CONT(c);
2913 break;
2914
2915 default:
2916 fprintf(stderr, "Unknown jump instruction: ");
2917 nir_print_instr(&jump->instr, stderr);
2918 fprintf(stderr, "\n");
2919 abort();
2920 }
2921 }
2922
2923 static void
2924 ntt_emit_ssa_undef(struct ntt_compile *c, nir_undef_instr *instr)
2925 {
2926 /* Nothing to do but make sure that we have some storage to deref. */
2927 (void)ntt_get_ssa_def_decl(c, &instr->def);
2928 }
2929
2930 static void
2931 ntt_emit_instr(struct ntt_compile *c, nir_instr *instr)
2932 {
2933 switch (instr->type) {
2934 case nir_instr_type_deref:
2935 /* ignored, will be walked by nir_intrinsic_image_*_deref. */
2936 break;
2937
2938 case nir_instr_type_alu:
2939 ntt_emit_alu(c, nir_instr_as_alu(instr));
2940 break;
2941
2942 case nir_instr_type_intrinsic:
2943 ntt_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
2944 break;
2945
2946 case nir_instr_type_load_const:
2947 /* Nothing to do here, as load consts are done directly from
2948 * ntt_get_src() (since many constant NIR srcs will often get folded
2949 * directly into a register file index instead of as a TGSI src).
2950 */
2951 break;
2952
2953 case nir_instr_type_tex:
2954 ntt_emit_texture(c, nir_instr_as_tex(instr));
2955 break;
2956
2957 case nir_instr_type_jump:
2958 ntt_emit_jump(c, nir_instr_as_jump(instr));
2959 break;
2960
2961 case nir_instr_type_undef:
2962 ntt_emit_ssa_undef(c, nir_instr_as_undef(instr));
2963 break;
2964
2965 default:
2966 fprintf(stderr, "Unknown NIR instr type: ");
2967 nir_print_instr(instr, stderr);
2968 fprintf(stderr, "\n");
2969 abort();
2970 }
2971 }
2972
2973 static void
2974 ntt_emit_if(struct ntt_compile *c, nir_if *if_stmt)
2975 {
2976 if (c->native_integers)
2977 ntt_UIF(c, c->if_cond);
2978 else
2979 ntt_IF(c, c->if_cond);
2980
2981 ntt_emit_cf_list(c, &if_stmt->then_list);
2982
2983 if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
2984 ntt_ELSE(c);
2985 ntt_emit_cf_list(c, &if_stmt->else_list);
2986 }
2987
2988 ntt_ENDIF(c);
2989 }
2990
2991 static void
2992 ntt_emit_loop(struct ntt_compile *c, nir_loop *loop)
2993 {
2994 assert(!nir_loop_has_continue_construct(loop));
2995 ntt_BGNLOOP(c);
2996 ntt_emit_cf_list(c, &loop->body);
2997 ntt_ENDLOOP(c);
2998 }
2999
3000 static void
3001 ntt_emit_block(struct ntt_compile *c, nir_block *block)
3002 {
3003 struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
3004 c->cur_block = ntt_block;
3005
3006 nir_foreach_instr(instr, block) {
3007 ntt_emit_instr(c, instr);
3008
3009 /* Sanity check that we didn't accidentally ureg_OPCODE() instead of ntt_OPCODE(). */
3010 if (ureg_get_instruction_number(c->ureg) != 0) {
3011 fprintf(stderr, "Emitted ureg insn during: ");
3012 nir_print_instr(instr, stderr);
3013 fprintf(stderr, "\n");
3014 unreachable("emitted ureg insn");
3015 }
3016 }
3017
3018 /* Set up the if condition for ntt_emit_if(), which we have to do before
3019 * freeing up the temps (the "if" is treated as inside the block for liveness
3020 * purposes, despite not being an instruction)
3021 *
3022 * Note that, while IF and UIF are supposed to look at only .x, virglrenderer
3023 * looks at all of .xyzw. No harm in working around the bug.
3024 */
3025 nir_if *nif = nir_block_get_following_if(block);
3026 if (nif)
3027 c->if_cond = ureg_scalar(ntt_get_src(c, nif->condition), TGSI_SWIZZLE_X);
3028 }
3029
3030 static void
3031 ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list)
3032 {
3033 foreach_list_typed(nir_cf_node, node, node, list) {
3034 switch (node->type) {
3035 case nir_cf_node_block:
3036 ntt_emit_block(c, nir_cf_node_as_block(node));
3037 break;
3038
3039 case nir_cf_node_if:
3040 ntt_emit_if(c, nir_cf_node_as_if(node));
3041 break;
3042
3043 case nir_cf_node_loop:
3044 ntt_emit_loop(c, nir_cf_node_as_loop(node));
3045 break;
3046
3047 default:
3048 unreachable("unknown CF type");
3049 }
3050 }
3051 }
3052
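/* Second pass over a block: replays the buffered ntt insns into the ureg
 * program, fixing up IF/ELSE/ENDIF labels as they are emitted.
 */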
3053 static void
3054 ntt_emit_block_ureg(struct ntt_compile *c, struct nir_block *block)
3055 {
3056 struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
3057
3058 /* Emit the ntt insns to tgsi_ureg. */
3059 util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
3060 const struct tgsi_opcode_info *opcode_info =
3061 tgsi_get_opcode_info(insn->opcode);
3062
3063 switch (insn->opcode) {
3064 case TGSI_OPCODE_UIF:
3065 ureg_UIF(c->ureg, insn->src[0], &c->cf_label);
3066 break;
3067
3068 case TGSI_OPCODE_IF:
3069 ureg_IF(c->ureg, insn->src[0], &c->cf_label);
3070 break;
3071
3072 case TGSI_OPCODE_ELSE:
3073 ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
3074 ureg_ELSE(c->ureg, &c->cf_label);
3075 c->current_if_else = c->cf_label;
3076 break;
3077
3078 case TGSI_OPCODE_ENDIF:
3079 ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
3080 ureg_ENDIF(c->ureg);
3081 break;
3082
3083 case TGSI_OPCODE_BGNLOOP:
3084 /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
3085 * does reference BGNLOOP's. Follow the former behavior unless something comes up
3086 * with a need.
3087 */
3088 ureg_BGNLOOP(c->ureg, &c->cf_label);
3089 break;
3090
3091 case TGSI_OPCODE_ENDLOOP:
3092 ureg_ENDLOOP(c->ureg, &c->cf_label);
3093 break;
3094
3095 default:
3096 if (insn->is_tex) {
3097 int num_offsets = 0;
3098 for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
3099 if (insn->tex_offset[i].File != TGSI_FILE_NULL)
3100 num_offsets = i + 1;
3101 }
3102 ureg_tex_insn(c->ureg, insn->opcode,
3103 insn->dst, opcode_info->num_dst,
3104 insn->tex_target, insn->tex_return_type,
3105 insn->tex_offset,
3106 num_offsets,
3107 insn->src, opcode_info->num_src);
3108 } else if (insn->is_mem) {
3109 ureg_memory_insn(c->ureg, insn->opcode,
3110 insn->dst, opcode_info->num_dst,
3111 insn->src, opcode_info->num_src,
3112 insn->mem_qualifier,
3113 insn->tex_target,
3114 insn->mem_format);
3115 } else {
3116 ureg_insn(c->ureg, insn->opcode,
3117 insn->dst, opcode_info->num_dst,
3118 insn->src, opcode_info->num_src,
3119 insn->precise);
3120 }
3121 }
3122 }
3123 }
3124
3125 static void
3126 ntt_emit_if_ureg(struct ntt_compile *c, nir_if *if_stmt)
3127 {
3128 /* Note: the last block emitted our IF opcode. */
3129
3130 int if_stack = c->current_if_else;
3131 c->current_if_else = c->cf_label;
3132
3133 /* Either the then or else block includes the ENDIF, which will fix up the
3134 * IF(/ELSE)'s label for jumping
3135 */
3136 ntt_emit_cf_list_ureg(c, &if_stmt->then_list);
3137 ntt_emit_cf_list_ureg(c, &if_stmt->else_list);
3138
3139 c->current_if_else = if_stack;
3140 }
3141
3142 static void
3143 ntt_emit_cf_list_ureg(struct ntt_compile *c, struct exec_list *list)
3144 {
3145 foreach_list_typed(nir_cf_node, node, node, list) {
3146 switch (node->type) {
3147 case nir_cf_node_block:
3148 ntt_emit_block_ureg(c, nir_cf_node_as_block(node));
3149 break;
3150
3151 case nir_cf_node_if:
3152 ntt_emit_if_ureg(c, nir_cf_node_as_if(node));
3153 break;
3154
3155 case nir_cf_node_loop:
3156 /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
3157 * does reference BGNLOOP's. Follow the former behavior unless something comes up
3158 * with a need.
3159 */
3160 ntt_emit_cf_list_ureg(c, &nir_cf_node_as_loop(node)->body);
3161 break;
3162
3163 default:
3164 unreachable("unknown CF type");
3165 }
3166 }
3167 }
3168
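/* Emits a NIR function implementation: sets up per-block instruction arrays,
 * registers, inputs/outputs/uniforms, emits the ntt insns for the CF list,
 * allocates TGSI temporaries, and then lowers the ntt insns to ureg tokens.
 */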
3169 static void
3170 ntt_emit_impl(struct ntt_compile *c, nir_function_impl *impl)
3171 {
3172 c->impl = impl;
3173
3174 c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc);
3175 c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->ssa_alloc);
3176
3177 /* Set up the struct ntt_blocks to put insns in */
3178 c->blocks = _mesa_pointer_hash_table_create(c);
3179 nir_foreach_block(block, impl) {
3180 struct ntt_block *ntt_block = rzalloc(c->blocks, struct ntt_block);
3181 util_dynarray_init(&ntt_block->insns, ntt_block);
3182 _mesa_hash_table_insert(c->blocks, block, ntt_block);
3183 }
3184
3185
3186 ntt_setup_registers(c);
3187
3188 c->cur_block = ntt_block_from_nir(c, nir_start_block(impl));
3189 ntt_setup_inputs(c);
3190 ntt_setup_outputs(c);
3191 ntt_setup_uniforms(c);
3192
3193 /* Emit the ntt insns */
3194 ntt_emit_cf_list(c, &impl->body);
3195
3196 /* Don't do optimized RA if the driver requests it, unless the number of
3197 * temps is too large to be covered by the 16 bit signed int that TGSI
3198 * allocates for the register index */
3199 if (!c->options->unoptimized_ra || c->num_temps > 0x7fff)
3200 ntt_allocate_regs(c, impl);
3201 else
3202 ntt_allocate_regs_unoptimized(c, impl);
3203
3204 /* Turn the ntt insns into actual TGSI tokens */
3205 ntt_emit_cf_list_ureg(c, &impl->body);
3206
3207 ralloc_free(c->liveness);
3208 c->liveness = NULL;
3209
3210 }
3211
3212 static int
3213 type_size(const struct glsl_type *type, bool bindless)
3214 {
3215 return glsl_count_attribute_slots(type, false);
3216 }
3217
3218 /* Allow vectorizing of ALU instructions, but avoid vectorizing past what we
3219 * can handle for 64-bit values in TGSI.
3220 */
3221 static uint8_t
3222 ntt_should_vectorize_instr(const nir_instr *instr, const void *data)
3223 {
3224 if (instr->type != nir_instr_type_alu)
3225 return 0;
3226
3227 nir_alu_instr *alu = nir_instr_as_alu(instr);
3228
3229 switch (alu->op) {
3230 case nir_op_ibitfield_extract:
3231 case nir_op_ubitfield_extract:
3232 case nir_op_bitfield_insert:
3233 /* virglrenderer only looks at the .x channel of the offset/bits operands
3234 * when translating to GLSL. tgsi.rst doesn't seem to require scalar
3235 * offset/bits operands.
3236 *
3237 * https://gitlab.freedesktop.org/virgl/virglrenderer/-/issues/195
3238 */
3239 return 1;
3240
3241 default:
3242 break;
3243 }
3244
3245 int src_bit_size = nir_src_bit_size(alu->src[0].src);
3246 int dst_bit_size = alu->def.bit_size;
3247
3248 if (src_bit_size == 64 || dst_bit_size == 64) {
3249 /* Avoid vectorizing 64-bit instructions at all. Despite tgsi.rst
3250 * claiming support, virglrenderer generates bad shaders on the host when
3251 * presented with them. Maybe we can make virgl avoid tickling the
3252 * virglrenderer bugs, but given that glsl-to-TGSI didn't generate vector
3253 * 64-bit instrs in the first place, I don't see much reason to care about
3254 * this.
3255 */
3256 return 1;
3257 }
3258
3259 return 4;
3260 }
3261
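/* Callback for nir_opt_load_store_vectorize: only allow 32-bit accesses with
 * at least 4-byte alignment that can't run past the end of a TGSI vec4.
 */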
3262 static bool
3263 ntt_should_vectorize_io(unsigned align, unsigned bit_size,
3264 unsigned num_components, unsigned high_offset,
3265 nir_intrinsic_instr *low, nir_intrinsic_instr *high,
3266 void *data)
3267 {
3268 if (bit_size != 32)
3269 return false;
3270
3271    /* Our offset alignment should always be at least 4 bytes */
3272 if (align < 4)
3273 return false;
3274
3275 /* No wrapping off the end of a TGSI reg. We could do a bit better by
3276 * looking at low's actual offset. XXX: With LOAD_CONSTBUF maybe we don't
3277 * need this restriction.
3278 */
3279 unsigned worst_start_component = align == 4 ? 3 : align / 4;
3280 if (worst_start_component + num_components > 4)
3281 return false;
3282
3283 return true;
3284 }
3285
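/* Builds the mask of variable modes the driver can't do indirect addressing
 * on, based on its PIPE_SHADER_CAP_INDIRECT_* shader caps.
 */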
3286 static nir_variable_mode
3287 ntt_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
3288 {
3289 unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
3290 unsigned indirect_mask = 0;
3291
3292 if (!screen->get_shader_param(screen, pipe_stage,
3293 PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) {
3294 indirect_mask |= nir_var_shader_in;
3295 }
3296
3297 if (!screen->get_shader_param(screen, pipe_stage,
3298 PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) {
3299 indirect_mask |= nir_var_shader_out;
3300 }
3301
3302 if (!screen->get_shader_param(screen, pipe_stage,
3303 PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
3304 indirect_mask |= nir_var_function_temp;
3305 }
3306
3307 return indirect_mask;
3308 }
3309
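/* NIR optimization loop run before translation, repeated until no pass makes
 * further progress.
 */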
3310 static void
3311 ntt_optimize_nir(struct nir_shader *s, struct pipe_screen *screen,
3312 const struct nir_to_tgsi_options *options)
3313 {
3314 bool progress;
3315 unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
3316 unsigned control_flow_depth =
3317 screen->get_shader_param(screen, pipe_stage,
3318 PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
3319 do {
3320 progress = false;
3321
3322 NIR_PASS_V(s, nir_lower_vars_to_ssa);
3323 NIR_PASS_V(s, nir_split_64bit_vec3_and_vec4);
3324
3325 NIR_PASS(progress, s, nir_copy_prop);
3326 NIR_PASS(progress, s, nir_opt_algebraic);
3327 NIR_PASS(progress, s, nir_opt_constant_folding);
3328 NIR_PASS(progress, s, nir_opt_remove_phis);
3329 NIR_PASS(progress, s, nir_opt_conditional_discard);
3330 NIR_PASS(progress, s, nir_opt_dce);
3331 NIR_PASS(progress, s, nir_opt_dead_cf);
3332 NIR_PASS(progress, s, nir_opt_cse);
3333 NIR_PASS(progress, s, nir_opt_find_array_copies);
3334 NIR_PASS(progress, s, nir_opt_copy_prop_vars);
3335 NIR_PASS(progress, s, nir_opt_dead_write_vars);
3336
3337 NIR_PASS(progress, s, nir_opt_if, nir_opt_if_optimize_phi_true_false);
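      /* If the driver reports no control-flow support at all, flatten every
       * if/else; otherwise only flatten small ones.
       */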
3338 NIR_PASS(progress, s, nir_opt_peephole_select,
3339 control_flow_depth == 0 ? ~0 : 8, true, true);
3340 NIR_PASS(progress, s, nir_opt_algebraic);
3341 NIR_PASS(progress, s, nir_opt_constant_folding);
3342 nir_load_store_vectorize_options vectorize_opts = {
3343 .modes = nir_var_mem_ubo,
3344 .callback = ntt_should_vectorize_io,
3345 .robust_modes = 0,
3346 };
3347 NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
3348 NIR_PASS(progress, s, nir_opt_shrink_stores, true);
3349 NIR_PASS(progress, s, nir_opt_shrink_vectors);
3350 NIR_PASS(progress, s, nir_opt_loop);
3351 NIR_PASS(progress, s, nir_opt_vectorize, ntt_should_vectorize_instr, NULL);
3352 NIR_PASS(progress, s, nir_opt_undef);
3353 NIR_PASS(progress, s, nir_opt_loop_unroll);
3354
3355 /* Try to fold addressing math into ubo_vec4's base to avoid load_consts
3356 * and ALU ops for it.
3357 */
3358 nir_opt_offsets_options offset_options = {
3359 .ubo_vec4_max = ~0,
3360
3361 /* No const offset in TGSI for shared accesses. */
3362 .shared_max = 0,
3363
3364 /* unused intrinsics */
3365 .uniform_max = 0,
3366 .buffer_max = 0,
3367 };
3368
3369 if (options->ubo_vec4_max)
3370 offset_options.ubo_vec4_max = options->ubo_vec4_max;
3371
3372 NIR_PASS(progress, s, nir_opt_offsets, &offset_options);
3373 } while (progress);
3374
3375 NIR_PASS_V(s, nir_lower_var_copies);
3376 }
3377
3378 /* Scalarizes all 64-bit ALU ops. Note that we only actually need to
3379 * scalarize vec3/vec4s; we should probably fix that.
3380 */
3381 static bool
3382 scalarize_64bit(const nir_instr *instr, const void *data)
3383 {
3384 const nir_alu_instr *alu = nir_instr_as_alu(instr);
3385
3386 return (alu->def.bit_size == 64 ||
3387 nir_src_bit_size(alu->src[0].src) == 64);
3388 }
3389
3390 static bool
3391 nir_to_tgsi_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
3392 {
3393 b->cursor = nir_after_instr(&instr->instr);
3394
3395 switch (instr->intrinsic) {
3396 case nir_intrinsic_load_ubo:
3397 case nir_intrinsic_load_ubo_vec4:
3398 case nir_intrinsic_load_ssbo:
3399 case nir_intrinsic_load_input:
3400 case nir_intrinsic_load_interpolated_input:
3401 case nir_intrinsic_load_per_vertex_input:
3402 case nir_intrinsic_store_output:
3403 case nir_intrinsic_store_per_vertex_output:
3404 case nir_intrinsic_store_ssbo:
3405 break;
3406 default:
3407 return false;
3408 }
3409
3410 if (instr->num_components <= 2)
3411 return false;
3412
3413 bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
3414 if (has_dest) {
3415 if (instr->def.bit_size != 64)
3416 return false;
3417 } else {
3418 if (nir_src_bit_size(instr->src[0]) != 64)
3419 return false;
3420 }
3421
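   /* Clone the intrinsic twice: "first" will cover the low two 64-bit
    * channels and "second" the remaining one or two.
    */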
3422 nir_intrinsic_instr *first =
3423 nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
3424 nir_intrinsic_instr *second =
3425 nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
3426
3427 switch (instr->intrinsic) {
3428 case nir_intrinsic_load_ubo:
3429 case nir_intrinsic_load_ubo_vec4:
3430 case nir_intrinsic_load_ssbo:
3431 case nir_intrinsic_store_ssbo:
3432 break;
3433
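   /* Per-slot IO intrinsics: the high half of the 64-bit vec3/vec4 lands in
    * the next vec4 slot, so bump the location and base.
    */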
3434 default: {
3435 nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
3436 semantics.location++;
3437 semantics.num_slots--;
3438 nir_intrinsic_set_io_semantics(second, semantics);
3439
3440 nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
3441 break;
3442 }
3443 }
3444
3445 first->num_components = 2;
3446 second->num_components -= 2;
3447 if (has_dest) {
3448 first->def.num_components = 2;
3449 second->def.num_components -= 2;
3450 }
3451
3452 nir_builder_instr_insert(b, &first->instr);
3453 nir_builder_instr_insert(b, &second->instr);
3454
3455 if (has_dest) {
3456 /* Merge the two loads' results back into a vector. */
3457 nir_scalar channels[4] = {
3458 nir_get_scalar(&first->def, 0),
3459 nir_get_scalar(&first->def, 1),
3460 nir_get_scalar(&second->def, 0),
3461 nir_get_scalar(&second->def, second->num_components > 1 ? 1 : 0),
3462 };
3463 nir_def *new = nir_vec_scalars(b, channels, instr->num_components);
3464 nir_def_rewrite_uses(&instr->def, new);
3465 } else {
3466 /* Split the src value across the two stores. */
3467 b->cursor = nir_before_instr(&instr->instr);
3468
3469 nir_def *src0 = instr->src[0].ssa;
3470 nir_scalar channels[4] = { 0 };
3471 for (int i = 0; i < instr->num_components; i++)
3472 channels[i] = nir_get_scalar(src0, i);
3473
3474 nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
3475 nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);
3476
3477 nir_src_rewrite(&first->src[0], nir_vec_scalars(b, channels, 2));
3478 nir_src_rewrite(&second->src[0],
3479 nir_vec_scalars(b, &channels[2], second->num_components));
3480 }
3481
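   /* Buffer-addressed intrinsics also need the second half's offset bumped:
    * by 16 bytes, or by one whole vec4 for load_ubo_vec4.
    */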
3482 int offset_src = -1;
3483 uint32_t offset_amount = 16;
3484
3485 switch (instr->intrinsic) {
3486 case nir_intrinsic_load_ssbo:
3487 case nir_intrinsic_load_ubo:
3488 offset_src = 1;
3489 break;
3490 case nir_intrinsic_load_ubo_vec4:
3491 offset_src = 1;
3492 offset_amount = 1;
3493 break;
3494 case nir_intrinsic_store_ssbo:
3495 offset_src = 2;
3496 break;
3497 default:
3498 break;
3499 }
3500 if (offset_src != -1) {
3501 b->cursor = nir_before_instr(&second->instr);
3502 nir_def *second_offset =
3503 nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
3504 nir_src_rewrite(&second->src[offset_src], second_offset);
3505 }
3506
3507 /* DCE stores we generated with no writemask (nothing else does this
3508 * currently).
3509 */
3510 if (!has_dest) {
3511 if (nir_intrinsic_write_mask(first) == 0)
3512 nir_instr_remove(&first->instr);
3513 if (nir_intrinsic_write_mask(second) == 0)
3514 nir_instr_remove(&second->instr);
3515 }
3516
3517 nir_instr_remove(&instr->instr);
3518
3519 return true;
3520 }
3521
3522 static bool
3523 nir_to_tgsi_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
3524 {
3525 int num_components = instr->def.num_components;
3526
3527 if (instr->def.bit_size != 64 || num_components <= 2)
3528 return false;
3529
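   /* Split the 64-bit vec3/vec4 constant into a dvec2 plus a one- or
    * two-component remainder, then reassemble the original channels below.
    */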
3530 b->cursor = nir_before_instr(&instr->instr);
3531
3532 nir_load_const_instr *first =
3533 nir_load_const_instr_create(b->shader, 2, 64);
3534 nir_load_const_instr *second =
3535 nir_load_const_instr_create(b->shader, num_components - 2, 64);
3536
3537 first->value[0] = instr->value[0];
3538 first->value[1] = instr->value[1];
3539 second->value[0] = instr->value[2];
3540 if (num_components == 4)
3541 second->value[1] = instr->value[3];
3542
3543 nir_builder_instr_insert(b, &first->instr);
3544 nir_builder_instr_insert(b, &second->instr);
3545
3546 nir_def *channels[4] = {
3547 nir_channel(b, &first->def, 0),
3548 nir_channel(b, &first->def, 1),
3549 nir_channel(b, &second->def, 0),
3550 num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
3551 };
3552 nir_def *new = nir_vec(b, channels, num_components);
3553 nir_def_rewrite_uses(&instr->def, new);
3554 nir_instr_remove(&instr->instr);
3555
3556 return true;
3557 }
3558
3559 static bool
3560 nir_to_tgsi_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr,
3561 void *data)
3562 {
3563 switch (instr->type) {
3564 case nir_instr_type_load_const:
3565 return nir_to_tgsi_lower_64bit_load_const(b, nir_instr_as_load_const(instr));
3566
3567 case nir_instr_type_intrinsic:
3568 return nir_to_tgsi_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
3569 default:
3570 return false;
3571 }
3572 }
3573
3574 static bool
3575 nir_to_tgsi_lower_64bit_to_vec2(nir_shader *s)
3576 {
3577 return nir_shader_instructions_pass(s,
3578 nir_to_tgsi_lower_64bit_to_vec2_instr,
3579 nir_metadata_block_index |
3580 nir_metadata_dominance,
3581 NULL);
3582 }
3583
3584 struct ntt_lower_tex_state {
3585 nir_scalar channels[8];
3586 unsigned i;
3587 };
3588
3589 static void
3590 nir_to_tgsi_lower_tex_instr_arg(nir_builder *b,
3591 nir_tex_instr *instr,
3592 nir_tex_src_type tex_src_type,
3593 struct ntt_lower_tex_state *s)
3594 {
3595 int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
3596 if (tex_src < 0)
3597 return;
3598
3599 nir_def *def = instr->src[tex_src].src.ssa;
3600 for (int i = 0; i < def->num_components; i++) {
3601 s->channels[s->i++] = nir_get_scalar(def, i);
3602 }
3603
3604 nir_tex_instr_remove_src(instr, tex_src);
3605 }
3606
3607 /**
3608 * Merges together a vec4 of tex coordinate/compare/bias/lod into a backend tex
3609 * src. This lets NIR handle the coalescing of the vec4 rather than trying to
3610 * manage it on our own, and may lead to more vectorization.
3611 */
3612 static bool
3613 nir_to_tgsi_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data)
3614 {
3615 if (instr->type != nir_instr_type_tex)
3616 return false;
3617
3618 nir_tex_instr *tex = nir_instr_as_tex(instr);
3619
3620 if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0)
3621 return false;
3622
3623 b->cursor = nir_before_instr(instr);
3624
3625 struct ntt_lower_tex_state s = {0};
3626
3627 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_coord, &s);
3628 /* We always have at least two slots for the coordinate, even on 1D. */
3629 s.i = MAX2(s.i, 2);
3630
3631 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_comparator, &s);
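   /* Pad to three channels so that a following bias or LOD source lands in
    * the .w channel expected by TGSI.
    */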
3632 s.i = MAX2(s.i, 3);
3633
3634 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_bias, &s);
3635
3636 /* XXX: LZ */
3637 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s);
3638 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s);
3639 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s);
3640
3641 /* No need to pack undefs in unused channels of the tex instr */
3642 while (!s.channels[s.i - 1].def)
3643 s.i--;
3644
3645 /* Instead of putting undefs in the unused slots of the vecs, just put in
3646 * another used channel. Otherwise, we'll get unnecessary moves into
3647 * registers.
3648 */
3649 assert(s.channels[0].def != NULL);
3650 for (int i = 1; i < s.i; i++) {
3651 if (!s.channels[i].def)
3652 s.channels[i] = s.channels[0];
3653 }
3654
3655 nir_tex_instr_add_src(tex, nir_tex_src_backend1,
3656 nir_vec_scalars(b, s.channels, MIN2(s.i, 4)));
3657 if (s.i > 4)
3658 nir_tex_instr_add_src(tex, nir_tex_src_backend2,
3659 nir_vec_scalars(b, &s.channels[4], s.i - 4));
3660
3661 return true;
3662 }
3663
3664 static bool
3665 nir_to_tgsi_lower_tex(nir_shader *s)
3666 {
3667 return nir_shader_instructions_pass(s,
3668 nir_to_tgsi_lower_tex_instr,
3669 nir_metadata_block_index |
3670 nir_metadata_dominance,
3671 NULL);
3672 }
3673
3674 static void
3675 ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s,
3676 const struct nir_to_tgsi_options *ntt_options)
3677 {
3678 const struct nir_shader_compiler_options *options = s->options;
3679 bool lower_fsqrt =
3680 !screen->get_shader_param(screen, pipe_shader_type_from_mesa(s->info.stage),
3681 PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED);
3682
3683 bool force_indirect_unrolling_sampler =
3684 screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL) < 400;
3685
3686 nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
3687
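   /* If the shader's compiler options don't already request the lowerings
    * that TGSI translation relies on, swap in an adjusted copy below.
    */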
3688 if (!options->lower_extract_byte ||
3689 !options->lower_extract_word ||
3690 !options->lower_insert_byte ||
3691 !options->lower_insert_word ||
3692 !options->lower_fdph ||
3693 !options->lower_flrp64 ||
3694 !options->lower_fmod ||
3695 !options->lower_uadd_carry ||
3696 !options->lower_usub_borrow ||
3697 !options->lower_uadd_sat ||
3698 !options->lower_usub_sat ||
3699 !options->lower_uniforms_to_ubo ||
3700 !options->lower_vector_cmp ||
3701 options->has_rotate8 ||
3702 options->has_rotate16 ||
3703 options->has_rotate32 ||
3704 options->lower_fsqrt != lower_fsqrt ||
3705 options->force_indirect_unrolling != no_indirects_mask ||
3706 force_indirect_unrolling_sampler) {
3707 nir_shader_compiler_options *new_options = ralloc(s, nir_shader_compiler_options);
3708 *new_options = *s->options;
3709
3710 new_options->lower_extract_byte = true;
3711 new_options->lower_extract_word = true;
3712 new_options->lower_insert_byte = true;
3713 new_options->lower_insert_word = true;
3714 new_options->lower_fdph = true;
3715 new_options->lower_flrp64 = true;
3716 new_options->lower_fmod = true;
3717 new_options->lower_uadd_carry = true;
3718 new_options->lower_usub_borrow = true;
3719 new_options->lower_uadd_sat = true;
3720 new_options->lower_usub_sat = true;
3721 new_options->lower_uniforms_to_ubo = true;
3722 new_options->lower_vector_cmp = true;
3723 new_options->lower_fsqrt = lower_fsqrt;
3724 new_options->has_rotate8 = false;
3725 new_options->has_rotate16 = false;
3726 new_options->has_rotate32 = false;
3727 new_options->force_indirect_unrolling = no_indirects_mask;
3728 new_options->force_indirect_unrolling_sampler = force_indirect_unrolling_sampler;
3729
3730 s->options = new_options;
3731 }
3732 }
3733
3734 static bool
3735 ntt_lower_atomic_pre_dec_filter(const nir_instr *instr, const void *_data)
3736 {
3737 return (instr->type == nir_instr_type_intrinsic &&
3738 nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_atomic_counter_pre_dec);
3739 }
3740
3741 static nir_def *
3742 ntt_lower_atomic_pre_dec_lower(nir_builder *b, nir_instr *instr, void *_data)
3743 {
3744 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3745
3746 nir_def *old_result = &intr->def;
3747 intr->intrinsic = nir_intrinsic_atomic_counter_post_dec;
3748
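   /* The pre-decrement result is one less than the post-decrement result. */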
3749 return nir_iadd_imm(b, old_result, -1);
3750 }
3751
3752 static bool
3753 ntt_lower_atomic_pre_dec(nir_shader *s)
3754 {
3755 return nir_shader_lower_instructions(s,
3756 ntt_lower_atomic_pre_dec_filter,
3757 ntt_lower_atomic_pre_dec_lower, NULL);
3758 }
3759
3760 /* Lowers texture projectors if we can't do them as TGSI_OPCODE_TXP. */
3761 static void
3762 nir_to_tgsi_lower_txp(nir_shader *s)
3763 {
3764 nir_lower_tex_options lower_tex_options = {
3765 .lower_txp = 0,
3766 };
3767
3768 nir_foreach_block(block, nir_shader_get_entrypoint(s)) {
3769 nir_foreach_instr(instr, block) {
3770 if (instr->type != nir_instr_type_tex)
3771 continue;
3772 nir_tex_instr *tex = nir_instr_as_tex(instr);
3773
3774 if (nir_tex_instr_src_index(tex, nir_tex_src_projector) < 0)
3775 continue;
3776
3777 bool has_compare = nir_tex_instr_src_index(tex, nir_tex_src_comparator) >= 0;
3778 bool has_lod = nir_tex_instr_src_index(tex, nir_tex_src_lod) >= 0 || s->info.stage != MESA_SHADER_FRAGMENT;
3779 bool has_offset = nir_tex_instr_src_index(tex, nir_tex_src_offset) >= 0;
3780
3781 /* We can do TXP for any tex (not txg) where we can fit all the
3782 * coordinates and comparator and projector in one vec4 without any
3783 * other modifiers to add on.
3784 *
3785 * nir_lower_tex() only handles the lowering on a sampler-dim basis, so
3786 * if we get any funny projectors then we just blow them all away.
3787 */
3788 if (tex->op != nir_texop_tex || has_lod || has_offset || (tex->coord_components >= 3 && has_compare))
3789 lower_tex_options.lower_txp |= 1 << tex->sampler_dim;
3790 }
3791 }
3792
3793 /* nir_lower_tex must be run even if no options are set, because we need the
3794 * LOD to be set for query_levels and for non-fragment shaders.
3795 */
3796 NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
3797 }
3798
3799 static bool
3800 nir_lower_primid_sysval_to_input_filter(const nir_instr *instr, const void *_data)
3801 {
3802 return (instr->type == nir_instr_type_intrinsic &&
3803 nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_primitive_id);
3804 }
3805
3806 static nir_def *
3807 nir_lower_primid_sysval_to_input_lower(nir_builder *b, nir_instr *instr, void *data)
3808 {
3809 nir_variable *var = nir_get_variable_with_location(b->shader, nir_var_shader_in,
3810 VARYING_SLOT_PRIMITIVE_ID, glsl_uint_type());
3811
3812 nir_io_semantics semantics = {
3813 .location = var->data.location,
3814 .num_slots = 1
3815 };
3816 return nir_load_input(b, 1, 32, nir_imm_int(b, 0),
3817 .base = var->data.driver_location,
3818 .io_semantics = semantics);
3819 }
3820
3821 static bool
3822 nir_lower_primid_sysval_to_input(nir_shader *s)
3823 {
3824 return nir_shader_lower_instructions(s,
3825 nir_lower_primid_sysval_to_input_filter,
3826 nir_lower_primid_sysval_to_input_lower, NULL);
3827 }
3828
3829 const void *
3830 nir_to_tgsi(struct nir_shader *s,
3831 struct pipe_screen *screen)
3832 {
3833 static const struct nir_to_tgsi_options default_ntt_options = {0};
3834 return nir_to_tgsi_options(s, screen, &default_ntt_options);
3835 }
3836
3837 /* Prevent nir_lower_vec_to_regs from coalescing 64-to-32 conversions and
3838 * comparisons into unsupported channels of registers.
3839 */
3840 static bool
3841 ntt_vec_to_mov_writemask_cb(const nir_instr *instr, unsigned writemask, UNUSED const void *_data)
3842 {
3843 if (instr->type != nir_instr_type_alu)
3844 return false;
3845
3846 nir_alu_instr *alu = nir_instr_as_alu(instr);
3847 int dst_32 = alu->def.bit_size == 32;
3848 int src_64 = nir_src_bit_size(alu->src[0].src) == 64;
3849
3850 if (src_64 && dst_32) {
3851 int num_srcs = nir_op_infos[alu->op].num_inputs;
3852
3853 if (num_srcs == 2 || nir_op_infos[alu->op].output_type == nir_type_bool32) {
3854 /* TGSI's 64-bit compares storing to 32-bit are weird and write .xz
3855 * instead of .xy. Just support scalar compares storing to .x;
3856 * GLSL-to-TGSI only ever emitted scalar ops anyway.
3857 */
3858 if (writemask != TGSI_WRITEMASK_X)
3859 return false;
3860 } else {
3861 /* TGSI's 64-to-32-bit conversions can only store to .xy (since a TGSI
3862 * register can only store a dvec2). Don't try to coalesce to write to
3863 * .zw.
3864 */
3865 if (writemask & ~(TGSI_WRITEMASK_XY))
3866 return false;
3867 }
3868 }
3869
3870 return true;
3871 }
3872
3873 /**
3874 * Translates the NIR shader to TGSI.
3875 *
3876 * This requires some lowering of the NIR shader to prepare it for translation.
3877 * We take ownership of the NIR shader passed, returning a reference to the new
3878 * TGSI tokens instead. If you need to keep the NIR, then pass us a clone.
3879 */
3880 const void *nir_to_tgsi_options(struct nir_shader *s,
3881 struct pipe_screen *screen,
3882 const struct nir_to_tgsi_options *options)
3883 {
3884 struct ntt_compile *c;
3885 const void *tgsi_tokens;
3886 nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
3887 bool native_integers = screen->get_shader_param(screen,
3888 pipe_shader_type_from_mesa(s->info.stage),
3889 PIPE_SHADER_CAP_INTEGERS);
3890 const struct nir_shader_compiler_options *original_options = s->options;
3891
3892 ntt_fix_nir_options(screen, s, options);
3893
3894 /* Lower array indexing on FS inputs. Since we don't set
3895 * ureg->supports_any_inout_decl_range, the TGSI input decls will be split to
3896 * elements by ureg, and so dynamically indexing them would be invalid.
3897 * Ideally we would set that ureg flag based on
3898 * PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE, but can't due to mesa/st
3899 * splitting NIR VS outputs to elements even if the FS doesn't get the
3900 * corresponding splitting, and virgl depends on TGSI across link boundaries
3901 * having matching declarations.
3902 */
3903 if (s->info.stage == MESA_SHADER_FRAGMENT) {
3904 NIR_PASS_V(s, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX);
3905 NIR_PASS_V(s, nir_remove_dead_variables, nir_var_shader_in, NULL);
3906 }
3907
3908 /* Lower tesslevel indirect derefs for tessellation shaders.
3909 * Tesslevels are now a compact array variable, and NIR expects a constant
3910 * array index into the compact array variable.
3911 */
3912 if (s->info.stage == MESA_SHADER_TESS_CTRL ||
3913 s->info.stage == MESA_SHADER_TESS_EVAL) {
3914 NIR_PASS_V(s, nir_lower_indirect_derefs, 0, UINT32_MAX);
3915 }
3916
3917 NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
3918 type_size, (nir_lower_io_options)0);
3919
3920 nir_to_tgsi_lower_txp(s);
3921 NIR_PASS_V(s, nir_to_tgsi_lower_tex);
3922
3923 /* While TGSI can represent PRIMID as either an input or a system value,
3924 * glsl-to-tgsi had the GS (not TCS or TES) primid as an input, and drivers
3925 * depend on that.
3926 */
3927 if (s->info.stage == MESA_SHADER_GEOMETRY)
3928 NIR_PASS_V(s, nir_lower_primid_sysval_to_input);
3929
3930 if (s->info.num_abos)
3931 NIR_PASS_V(s, ntt_lower_atomic_pre_dec);
3932
3933 if (!original_options->lower_uniforms_to_ubo) {
3934 NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
3935 screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
3936 !native_integers);
3937 }
3938
3939 /* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --
3940 * TGSI stores up to a vec2 in each slot, so to avoid a whole bunch of op
3941 * duplication logic we just make it so that we only see vec2s.
3942 */
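   /* For example, a dvec4 fadd becomes four scalar fadds, and a 64-bit vec4
    * UBO load becomes two vec2 loads.
    */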
3943 NIR_PASS_V(s, nir_lower_alu_to_scalar, scalarize_64bit, NULL);
3944 NIR_PASS_V(s, nir_to_tgsi_lower_64bit_to_vec2);
3945
3946 if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
3947 NIR_PASS_V(s, nir_lower_ubo_vec4);
3948
3949 ntt_optimize_nir(s, screen, options);
3950
3951 NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);
3952
3953 /* Lower demote_if to if (cond) { demote } because TGSI doesn't have a DEMOTE_IF. */
3954 NIR_PASS_V(s, nir_lower_discard_if, nir_lower_demote_if_to_cf);
3955
3956 NIR_PASS_V(s, nir_lower_frexp);
3957
3958 bool progress;
3959 do {
3960 progress = false;
3961 NIR_PASS(progress, s, nir_opt_algebraic_late);
3962 if (progress) {
3963 NIR_PASS_V(s, nir_copy_prop);
3964 NIR_PASS_V(s, nir_opt_dce);
3965 NIR_PASS_V(s, nir_opt_cse);
3966 }
3967 } while (progress);
3968
3969 NIR_PASS_V(s, nir_opt_combine_barriers, NULL, NULL);
3970
3971 if (screen->get_shader_param(screen,
3972 pipe_shader_type_from_mesa(s->info.stage),
3973 PIPE_SHADER_CAP_INTEGERS)) {
3974 NIR_PASS_V(s, nir_lower_bool_to_int32);
3975 } else {
3976 NIR_PASS_V(s, nir_lower_int_to_float);
3977 NIR_PASS_V(s, nir_lower_bool_to_float,
3978 !options->lower_cmp && !options->lower_fabs);
3979 /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
3980 NIR_PASS_V(s, nir_copy_prop);
3981 NIR_PASS_V(s, nir_opt_dce);
3982 }
3983
3984 nir_move_options move_all =
3985 nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
3986 nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
3987
3988 NIR_PASS_V(s, nir_opt_move, move_all);
3989
3990 NIR_PASS_V(s, nir_convert_from_ssa, true);
3991 NIR_PASS_V(s, nir_lower_vec_to_regs, ntt_vec_to_mov_writemask_cb, NULL);
3992
3993 /* locals_to_reg_intrinsics will leave dead derefs that are good to clean up.
3994 */
3995 NIR_PASS_V(s, nir_lower_locals_to_regs, 32);
3996 NIR_PASS_V(s, nir_opt_dce);
3997
3998 /* See comment in ntt_get_alu_src for supported modifiers */
3999 NIR_PASS_V(s, nir_legacy_trivialize, !options->lower_fabs);
4000
4001 if (NIR_DEBUG(TGSI)) {
4002 fprintf(stderr, "NIR before translation to TGSI:\n");
4003 nir_print_shader(s, stderr);
4004 }
4005
4006 c = rzalloc(NULL, struct ntt_compile);
4007 c->screen = screen;
4008 c->options = options;
4009
4010 c->needs_texcoord_semantic =
4011 screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD);
4012 c->has_txf_lz =
4013 screen->get_param(screen, PIPE_CAP_TGSI_TEX_TXF_LZ);
4014
4015 c->s = s;
4016 c->native_integers = native_integers;
4017 c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
4018 ureg_setup_shader_info(c->ureg, &s->info);
4019 if (s->info.use_legacy_math_rules && screen->get_param(screen, PIPE_CAP_LEGACY_MATH_RULES))
4020 ureg_property(c->ureg, TGSI_PROPERTY_LEGACY_MATH_RULES, 1);
4021
4022 if (s->info.stage == MESA_SHADER_FRAGMENT) {
4023 /* The draw module's polygon stipple layer doesn't respect the chosen
4024 * coordinate mode, so leave it as unspecified unless we're actually
4025 * reading the position in the shader already. See
4026 * gl-2.1-polygon-stipple-fs on softpipe.
4027 */
4028 if ((s->info.inputs_read & VARYING_BIT_POS) ||
4029 BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
4030 ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
4031 s->info.fs.origin_upper_left ?
4032 TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
4033 TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
4034
4035 ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
4036 s->info.fs.pixel_center_integer ?
4037 TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
4038 TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
4039 }
4040 }
4041 /* Emit the main function */
4042 nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
4043 ntt_emit_impl(c, impl);
4044 ureg_END(c->ureg);
4045
4046 tgsi_tokens = ureg_get_tokens(c->ureg, NULL);
4047
4048 if (NIR_DEBUG(TGSI)) {
4049 fprintf(stderr, "TGSI after translation from NIR:\n");
4050 tgsi_dump(tgsi_tokens, 0);
4051 }
4052
4053 ureg_destroy(c->ureg);
4054
4055 ralloc_free(c);
4056 ralloc_free(s);
4057
4058 return tgsi_tokens;
4059 }
4060
4061 static const nir_shader_compiler_options nir_to_tgsi_compiler_options = {
4062 .fdot_replicates = true,
4063 .fuse_ffma32 = true,
4064 .fuse_ffma64 = true,
4065 .lower_extract_byte = true,
4066 .lower_extract_word = true,
4067 .lower_insert_byte = true,
4068 .lower_insert_word = true,
4069 .lower_fdph = true,
4070 .lower_flrp64 = true,
4071 .lower_fmod = true,
4072 .lower_uniforms_to_ubo = true,
4073 .lower_uadd_carry = true,
4074 .lower_usub_borrow = true,
4075 .lower_uadd_sat = true,
4076 .lower_usub_sat = true,
4077 .lower_vector_cmp = true,
4078 .lower_int64_options = nir_lower_imul_2x32_64,
4079 .use_interpolated_input_intrinsics = true,
4080
4081 /* TGSI doesn't have a semantic for local or global index, just local and
4082 * workgroup id.
4083 */
4084 .lower_cs_local_index_to_id = true,
4085 };
4086
4087 /* Returns default compiler options for drivers with only nir-to-tgsi-based
4088 * NIR support.
4089 */
4090 const void *
4091 nir_to_tgsi_get_compiler_options(struct pipe_screen *pscreen,
4092 enum pipe_shader_ir ir,
4093 unsigned shader)
4094 {
4095 assert(ir == PIPE_SHADER_IR_NIR);
4096 return &nir_to_tgsi_compiler_options;
4097 }
4098
4099 /** Helper for getting TGSI tokens to store for a pipe_shader_state CSO. */
4100 const void *
4101 pipe_shader_state_to_tgsi_tokens(struct pipe_screen *screen,
4102 const struct pipe_shader_state *cso)
4103 {
4104 if (cso->type == PIPE_SHADER_IR_NIR) {
4105 return nir_to_tgsi((nir_shader *)cso->ir.nir, screen);
4106 } else {
4107 assert(cso->type == PIPE_SHADER_IR_TGSI);
4108 /* we need to keep a local copy of the tokens */
4109 return tgsi_dup_tokens(cso->tokens);
4110 }
4111 }
4112