1 /*
2 * Copyright © 2014-2015 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "compiler/nir/nir.h"
25 #include "compiler/nir/nir_deref.h"
26 #include "compiler/nir/nir_worklist.h"
27 #include "nir/nir_to_tgsi.h"
28 #include "pipe/p_screen.h"
29 #include "pipe/p_state.h"
30 #include "tgsi/tgsi_dump.h"
31 #include "tgsi/tgsi_from_mesa.h"
32 #include "tgsi/tgsi_info.h"
33 #include "tgsi/tgsi_parse.h"
34 #include "tgsi/tgsi_ureg.h"
35 #include "tgsi/tgsi_util.h"
36 #include "util/debug.h"
37 #include "util/u_math.h"
38 #include "util/u_memory.h"
39 #include "util/u_dynarray.h"
40
41 struct ntt_insn {
42 enum tgsi_opcode opcode;
43 struct ureg_dst dst[2];
44 struct ureg_src src[4];
45 enum tgsi_texture_type tex_target;
46 enum tgsi_return_type tex_return_type;
47 struct tgsi_texture_offset tex_offset;
48
49 unsigned mem_qualifier;
50 enum pipe_format mem_format;
51
52 bool is_tex : 1;
53 bool is_mem : 1;
54 bool precise : 1;
55 };
56
57 struct ntt_block {
58 /* Array of struct ntt_insn */
59 struct util_dynarray insns;
60 int start_ip;
61 int end_ip;
62 };
63
64 struct ntt_reg_interval {
65 uint32_t start, end;
66 };
67
68 struct ntt_compile {
69 nir_shader *s;
70 nir_function_impl *impl;
71 const struct nir_to_tgsi_options *options;
72 struct pipe_screen *screen;
73 struct ureg_program *ureg;
74
75 bool needs_texcoord_semantic;
76 bool native_integers;
77 bool has_txf_lz;
78
79 bool addr_declared[3];
80 struct ureg_dst addr_reg[3];
81
82 /* if condition set up at the end of a block, for ntt_emit_if(). */
83 struct ureg_src if_cond;
84
85 /* TGSI temps for our NIR SSA and register values. */
86 struct ureg_dst *reg_temp;
87 struct ureg_src *ssa_temp;
88
89 struct ntt_reg_interval *liveness;
90
91 /* Map from nir_block to ntt_block */
92 struct hash_table *blocks;
93 struct ntt_block *cur_block;
94 unsigned current_if_else;
95 unsigned cf_label;
96
   /* Whether we're currently emitting instructions for a precise NIR instruction. */
98 bool precise;
99
100 unsigned num_temps;
101 unsigned first_non_array_temp;
102
103 /* Mappings from driver_location to TGSI input/output number.
104 *
105 * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
106 * their numbers assigned incrementally, unlike inputs or constants.
107 */
108 struct ureg_src *input_index_map;
109 uint64_t centroid_inputs;
110
111 uint32_t first_ubo;
112
113 struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
114 };
115
116 static struct ureg_dst
ntt_temp(struct ntt_compile *c)
118 {
119 return ureg_dst_register(TGSI_FILE_TEMPORARY, c->num_temps++);
120 }
121
122 static struct ntt_block *
ntt_block_from_nir(struct ntt_compile *c, struct nir_block *block)
124 {
125 struct hash_entry *entry = _mesa_hash_table_search(c->blocks, block);
126 return entry->data;
127 }
128
129 static void ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list);
130 static void ntt_emit_cf_list_ureg(struct ntt_compile *c, struct exec_list *list);
131
132 static struct ntt_insn *
ntt_insn(struct ntt_compile *c, enum tgsi_opcode opcode,
         struct ureg_dst dst,
         struct ureg_src src0, struct ureg_src src1,
         struct ureg_src src2, struct ureg_src src3)
137 {
138 struct ntt_insn insn = {
139 .opcode = opcode,
140 .dst = { dst, ureg_dst_undef() },
141 .src = { src0, src1, src2, src3 },
142 .precise = c->precise,
143 };
144 util_dynarray_append(&c->cur_block->insns, struct ntt_insn, insn);
145 return util_dynarray_top_ptr(&c->cur_block->insns, struct ntt_insn);
146 }
147
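/* OP<d><s> macros: each one generates an ntt_<OPCODE>() helper taking <d>
 * ureg_dst and <s> ureg_src arguments and appending the instruction to the
 * current block via ntt_insn().
 */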
148 #define OP00( op ) \
149 static inline void ntt_##op(struct ntt_compile *c) \
150 { \
151 ntt_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
152 }
153
154 #define OP01( op ) \
155 static inline void ntt_##op(struct ntt_compile *c, \
156 struct ureg_src src0) \
157 { \
158 ntt_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
159 }
160
161
162 #define OP10( op ) \
163 static inline void ntt_##op(struct ntt_compile *c, \
164 struct ureg_dst dst) \
165 { \
166 ntt_insn(c, TGSI_OPCODE_##op, dst, ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
167 }
168
169 #define OP11( op ) \
170 static inline void ntt_##op(struct ntt_compile *c, \
171 struct ureg_dst dst, \
172 struct ureg_src src0) \
173 { \
174 ntt_insn(c, TGSI_OPCODE_##op, dst, src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
175 }
176
177 #define OP12( op ) \
178 static inline void ntt_##op(struct ntt_compile *c, \
179 struct ureg_dst dst, \
180 struct ureg_src src0, \
181 struct ureg_src src1) \
182 { \
183 ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, ureg_src_undef(), ureg_src_undef()); \
184 }
185
186 #define OP13( op ) \
187 static inline void ntt_##op(struct ntt_compile *c, \
188 struct ureg_dst dst, \
189 struct ureg_src src0, \
190 struct ureg_src src1, \
191 struct ureg_src src2) \
192 { \
193 ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, ureg_src_undef()); \
194 }
195
196 #define OP14( op ) \
197 static inline void ntt_##op(struct ntt_compile *c, \
198 struct ureg_dst dst, \
199 struct ureg_src src0, \
200 struct ureg_src src1, \
201 struct ureg_src src2, \
202 struct ureg_src src3) \
203 { \
204 ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, src3); \
205 }
206
207 /* We hand-craft our tex instructions */
208 #define OP12_TEX(op)
209 #define OP14_TEX(op)
210
211 /* Use a template include to generate a correctly-typed ntt_OP()
212 * function for each TGSI opcode:
213 */
214 #include "gallium/auxiliary/tgsi/tgsi_opcode_tmp.h"
215
216 /**
217 * Interprets a nir_load_const used as a NIR src as a uint.
218 *
219 * For non-native-integers drivers, nir_load_const_instrs used by an integer ALU
220 * instruction (or in a phi-web used by an integer ALU instruction) were
221 * converted to floats and the ALU instruction swapped to the float equivalent.
222 * However, this means that integer load_consts used by intrinsics (which don't
223 * normally get that conversion) may have been reformatted to be floats. Given
224 * that all of our intrinsic nir_src_as_uint() calls are expected to be small,
225 * we can just look and see if they look like floats and convert them back to
226 * ints.
227 */
228 static uint32_t
ntt_src_as_uint(struct ntt_compile *c, nir_src src)
230 {
231 uint32_t val = nir_src_as_uint(src);
232 if (!c->native_integers && val >= fui(1.0))
233 val = (uint32_t)uif(val);
234 return val;
235 }
236
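/* Expands a write mask for a 64-bit value to the 32-bit TGSI channels it
 * occupies: each 64-bit component takes a channel pair, so .x -> .xy and
 * .y -> .zw.
 */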
237 static unsigned
ntt_64bit_write_mask(unsigned write_mask)
239 {
240 return ((write_mask & 1) ? 0x3 : 0) | ((write_mask & 2) ? 0xc : 0);
241 }
242
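/* Immediate holding the bit pattern of the double 1.0 (0x3ff0000000000000) in
 * both 64-bit channels.
 */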
243 static struct ureg_src
ntt_64bit_1f(struct ntt_compile *c)
245 {
246 return ureg_imm4u(c->ureg,
247 0x00000000, 0x3ff00000,
248 0x00000000, 0x3ff00000);
249 }
250
251 /* Per-channel masks of def/use within the block, and the per-channel
252 * livein/liveout for the block as a whole.
253 */
254 struct ntt_live_reg_block_state {
255 uint8_t *def, *use, *livein, *liveout, *defin, *defout;
256 };
257
258 struct ntt_live_reg_state {
259 unsigned bitset_words;
260
261 struct ntt_reg_interval *regs;
262
263 /* Used in propagate_across_edge() */
264 BITSET_WORD *tmp_live;
265
266 struct ntt_live_reg_block_state *blocks;
267
268 nir_block_worklist worklist;
269 };
270
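/* Records a read of a temp's channels in this block: channels without a
 * preceding def become upward-exposed uses, and the temp's live interval is
 * extended to cover instruction ip.
 */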
271 static void
ntt_live_reg_mark_use(struct ntt_compile *c, struct ntt_live_reg_block_state *bs,
                      int ip, unsigned index, unsigned used_mask)
274 {
275 bs->use[index] |= used_mask & ~bs->def[index];
276
277 c->liveness[index].start = MIN2(c->liveness[index].start, ip);
278 c->liveness[index].end = MAX2(c->liveness[index].end, ip);
279
280 }
281 static void
ntt_live_reg_setup_def_use(struct ntt_compile *c, nir_function_impl *impl, struct ntt_live_reg_state *state)
283 {
284 for (int i = 0; i < impl->num_blocks; i++) {
285 state->blocks[i].def = rzalloc_array(state->blocks, uint8_t, c->num_temps);
286 state->blocks[i].defin = rzalloc_array(state->blocks, uint8_t, c->num_temps);
287 state->blocks[i].defout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
288 state->blocks[i].use = rzalloc_array(state->blocks, uint8_t, c->num_temps);
289 state->blocks[i].livein = rzalloc_array(state->blocks, uint8_t, c->num_temps);
290 state->blocks[i].liveout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
291 }
292
293 int ip = 0;
294 nir_foreach_block(block, impl) {
295 struct ntt_live_reg_block_state *bs = &state->blocks[block->index];
296 struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
297
298 ntt_block->start_ip = ip;
299
300 util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
301 const struct tgsi_opcode_info *opcode_info =
302 tgsi_get_opcode_info(insn->opcode);
303
304 /* Set up use[] for the srcs.
305 *
306 * Uses are the channels of the reg read in the block that don't have a
307 * preceding def to screen them off. Note that we don't do per-element
308 * tracking of array regs, so they're never screened off.
309 */
310 for (int i = 0; i < opcode_info->num_src; i++) {
311 if (insn->src[i].File != TGSI_FILE_TEMPORARY)
312 continue;
313 int index = insn->src[i].Index;
314
315 uint32_t used_mask = tgsi_util_get_src_usage_mask(insn->opcode, i,
316 insn->dst->WriteMask,
317 insn->src[i].SwizzleX,
318 insn->src[i].SwizzleY,
319 insn->src[i].SwizzleZ,
320 insn->src[i].SwizzleW,
321 insn->tex_target,
322 insn->tex_target);
323
324 assert(!insn->src[i].Indirect || index < c->first_non_array_temp);
325 ntt_live_reg_mark_use(c, bs, ip, index, used_mask);
326 }
327
328 if (insn->is_tex && insn->tex_offset.File == TGSI_FILE_TEMPORARY)
329 ntt_live_reg_mark_use(c, bs, ip, insn->tex_offset.Index, 0xf);
330
331 /* Set up def[] for the srcs.
332 *
333 * Defs are the unconditionally-written (not R/M/W) channels of the reg in
334 * the block that don't have a preceding use.
335 */
336 for (int i = 0; i < opcode_info->num_dst; i++) {
337 if (insn->dst[i].File != TGSI_FILE_TEMPORARY)
338 continue;
339 int index = insn->dst[i].Index;
340 uint32_t writemask = insn->dst[i].WriteMask;
341
342 bs->def[index] |= writemask & ~bs->use[index];
343 bs->defout[index] |= writemask;
344
345 assert(!insn->dst[i].Indirect || index < c->first_non_array_temp);
346 c->liveness[index].start = MIN2(c->liveness[index].start, ip);
347 c->liveness[index].end = MAX2(c->liveness[index].end, ip);
348 }
349 ip++;
350 }
351
352 ntt_block->end_ip = ip;
353 }
354 }
355
356 static void
ntt_live_regs(struct ntt_compile *c, nir_function_impl *impl)
358 {
359 nir_metadata_require(impl, nir_metadata_block_index);
360
361 c->liveness = rzalloc_array(c, struct ntt_reg_interval, c->num_temps);
362
363 struct ntt_live_reg_state state = {
364 .blocks = rzalloc_array(impl, struct ntt_live_reg_block_state, impl->num_blocks),
365 };
366
367 /* The intervals start out with start > end (indicating unused) */
368 for (int i = 0; i < c->num_temps; i++)
369 c->liveness[i].start = ~0;
370
371 ntt_live_reg_setup_def_use(c, impl, &state);
372
373 /* Make a forward-order worklist of all the blocks. */
374 nir_block_worklist_init(&state.worklist, impl->num_blocks, NULL);
375 nir_foreach_block(block, impl) {
376 nir_block_worklist_push_tail(&state.worklist, block);
377 }
378
379 /* Propagate defin/defout down the CFG to calculate the live variables
380 * potentially defined along any possible control flow path. We'll use this
381 * to keep things like conditional defs of the reg (or array regs where we
382 * don't track defs!) from making the reg's live range extend back to the
383 * start of the program.
384 */
385 while (!nir_block_worklist_is_empty(&state.worklist)) {
386 nir_block *block = nir_block_worklist_pop_head(&state.worklist);
387 for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
388 nir_block *succ = block->successors[j];
389 if (!succ || succ->index == impl->num_blocks)
390 continue;
391
392 for (int i = 0; i < c->num_temps; i++) {
393 uint8_t new_def = state.blocks[block->index].defout[i] & ~state.blocks[succ->index].defin[i];
394
395 if (new_def) {
396 state.blocks[succ->index].defin[i] |= new_def;
397 state.blocks[succ->index].defout[i] |= new_def;
398 nir_block_worklist_push_tail(&state.worklist, succ);
399 }
400 }
401 }
402 }
403
404 /* Make a reverse-order worklist of all the blocks. */
405 nir_foreach_block(block, impl) {
406 nir_block_worklist_push_head(&state.worklist, block);
407 }
408
409 /* We're now ready to work through the worklist and update the liveness sets
410 * of each of the blocks. As long as we keep the worklist up-to-date as we
411 * go, everything will get covered.
412 */
413 while (!nir_block_worklist_is_empty(&state.worklist)) {
414 /* We pop them off in the reverse order we pushed them on. This way
415 * the first walk of the instructions is backwards so we only walk
416 * once in the case of no control flow.
417 */
418 nir_block *block = nir_block_worklist_pop_head(&state.worklist);
419 struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
420 struct ntt_live_reg_block_state *bs = &state.blocks[block->index];
421
422 for (int i = 0; i < c->num_temps; i++) {
423 /* Collect livein from our successors to include in our liveout. */
424 for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
425 nir_block *succ = block->successors[j];
426 if (!succ || succ->index == impl->num_blocks)
427 continue;
428 struct ntt_live_reg_block_state *sbs = &state.blocks[succ->index];
429
430 uint8_t new_liveout = sbs->livein[i] & ~bs->liveout[i];
431 if (new_liveout) {
432 if (state.blocks[block->index].defout[i])
433 c->liveness[i].end = MAX2(c->liveness[i].end, ntt_block->end_ip);
434 bs->liveout[i] |= sbs->livein[i];
435 }
436 }
437
438 /* Propagate use requests from either our block's uses or our
439 * non-screened-off liveout up to our predecessors.
440 */
441 uint8_t new_livein = ((bs->use[i] | (bs->liveout[i] & ~bs->def[i])) &
442 ~bs->livein[i]);
443 if (new_livein) {
444 bs->livein[i] |= new_livein;
445 set_foreach(block->predecessors, entry) {
446 nir_block *pred = (void *)entry->key;
447 nir_block_worklist_push_tail(&state.worklist, pred);
448 }
449
450 if (new_livein & state.blocks[block->index].defin[i])
451 c->liveness[i].start = MIN2(c->liveness[i].start, ntt_block->start_ip);
452 }
453 }
454 }
455
456 ralloc_free(state.blocks);
457 nir_block_worklist_fini(&state.worklist);
458 }
459
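/* At instruction ip, assigns a ureg temporary to a virtual temp when its live
 * interval starts and releases that temporary when the interval ends, letting
 * TGSI temps be reused. Array temps (indices below first_non_array_temp) were
 * allocated up front and are skipped here.
 */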
460 static void
ntt_ra_check(struct ntt_compile *c, unsigned *ra_map, BITSET_WORD *released, int ip, unsigned index)
462 {
463 if (index < c->first_non_array_temp)
464 return;
465
466 if (c->liveness[index].start == ip && ra_map[index] == ~0)
467 ra_map[index] = ureg_DECL_temporary(c->ureg).Index;
468
469 if (c->liveness[index].end == ip && !BITSET_TEST(released, index)) {
470 ureg_release_temporary(c->ureg, ureg_dst_register(TGSI_FILE_TEMPORARY, ra_map[index]));
471 BITSET_SET(released, index);
472 }
473 }
474
475 static void
ntt_allocate_regs(struct ntt_compile *c, nir_function_impl *impl)
477 {
478 ntt_live_regs(c, impl);
479
480 unsigned *ra_map = ralloc_array(c, unsigned, c->num_temps);
481 unsigned *released = rzalloc_array(c, BITSET_WORD, BITSET_WORDS(c->num_temps));
482
483 /* No RA on NIR array regs */
484 for (int i = 0; i < c->first_non_array_temp; i++)
485 ra_map[i] = i;
486
487 for (int i = c->first_non_array_temp; i < c->num_temps; i++)
488 ra_map[i] = ~0;
489
490 int ip = 0;
491 nir_foreach_block(block, impl) {
492 struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
493
494 for (int i = 0; i < c->num_temps; i++)
495 ntt_ra_check(c, ra_map, released, ip, i);
496
497 util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
498 const struct tgsi_opcode_info *opcode_info =
499 tgsi_get_opcode_info(insn->opcode);
500
501 for (int i = 0; i < opcode_info->num_src; i++) {
502 if (insn->src[i].File == TGSI_FILE_TEMPORARY) {
503 ntt_ra_check(c, ra_map, released, ip, insn->src[i].Index);
504 insn->src[i].Index = ra_map[insn->src[i].Index];
505 }
506 }
507
508 if (insn->is_tex && insn->tex_offset.File == TGSI_FILE_TEMPORARY) {
509 ntt_ra_check(c, ra_map, released, ip, insn->tex_offset.Index);
510 insn->tex_offset.Index = ra_map[insn->tex_offset.Index];
511 }
512
513 for (int i = 0; i < opcode_info->num_dst; i++) {
514 if (insn->dst[i].File == TGSI_FILE_TEMPORARY) {
515 ntt_ra_check(c, ra_map, released, ip, insn->dst[i].Index);
516 insn->dst[i].Index = ra_map[insn->dst[i].Index];
517 }
518 }
519 ip++;
520 }
521
522 for (int i = 0; i < c->num_temps; i++)
523 ntt_ra_check(c, ra_map, released, ip, i);
524 }
525 }
526
527 static void
ntt_allocate_regs_unoptimized(struct ntt_compile *c, nir_function_impl *impl)
529 {
530 for (int i = c->first_non_array_temp; i < c->num_temps; i++)
531 ureg_DECL_temporary(c->ureg);
532 }
533
534
535 /**
536 * Try to find an iadd of a constant value with a non-constant value in the
537 * nir_src's first component, returning the constant offset and replacing *src
538 * with the non-constant component.
539 */
540 static const uint32_t
ntt_extract_const_src_offset(nir_src *src)
542 {
543 if (!src->is_ssa)
544 return 0;
545
546 nir_ssa_scalar s = nir_get_ssa_scalar(src->ssa, 0);
547
548 while (nir_ssa_scalar_is_alu(s)) {
549 nir_alu_instr *alu = nir_instr_as_alu(s.def->parent_instr);
550
551 for (int i = 0; i < nir_op_infos[alu->op].num_inputs; i++) {
552 if (!alu->src[i].src.is_ssa)
553 return 0;
554 }
555
556 if (alu->op == nir_op_iadd) {
557 for (int i = 0; i < 2; i++) {
558 nir_const_value *v = nir_src_as_const_value(alu->src[i].src);
559 if (v && !alu->src[i].negate && !alu->src[i].abs) {
560 *src = alu->src[1 - i].src;
561 return v[alu->src[i].swizzle[s.comp]].u32;
562 }
563 }
564
565 return 0;
566 }
567
568 /* We'd like to reuse nir_ssa_scalar_chase_movs(), but it assumes SSA and that
569 * seems reasonable for something used in inner loops of the compiler.
570 */
571 if (!nir_alu_instr_is_copy(alu))
572 return 0;
573
574 if (alu->op == nir_op_mov) {
575 s.def = alu->src[0].src.ssa;
576 s.comp = alu->src[0].swizzle[s.comp];
577 } else if (nir_op_is_vec(alu->op)) {
578 s.def = alu->src[s.comp].src.ssa;
579 s.comp = alu->src[s.comp].swizzle[0];
580 } else {
581 return 0;
582 }
583 }
584
585 return 0;
586 }
587
588 static const struct glsl_type *
ntt_shader_input_type(struct ntt_compile *c,
                      struct nir_variable *var)
591 {
592 switch (c->s->info.stage) {
593 case MESA_SHADER_GEOMETRY:
594 case MESA_SHADER_TESS_EVAL:
595 case MESA_SHADER_TESS_CTRL:
596 if (glsl_type_is_array(var->type))
597 return glsl_get_array_element(var->type);
598 else
599 return var->type;
600 default:
601 return var->type;
602 }
603 }
604
605 static void
ntt_get_gl_varying_semantic(struct ntt_compile *c, unsigned location,
                            unsigned *semantic_name, unsigned *semantic_index)
608 {
609 /* We want to use most of tgsi_get_gl_varying_semantic(), but the
610 * !texcoord shifting has already been applied, so avoid that.
611 */
612 if (!c->needs_texcoord_semantic &&
613 (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_PATCH0)) {
614 *semantic_name = TGSI_SEMANTIC_GENERIC;
615 *semantic_index = location - VARYING_SLOT_VAR0;
616 return;
617 }
618
619 tgsi_get_gl_varying_semantic(location, true,
620 semantic_name, semantic_index);
621 }
622
623 /* TGSI varying declarations have a component usage mask associated (used by
624 * r600 and svga).
625 */
626 static uint32_t
ntt_tgsi_usage_mask(unsigned start_component, unsigned num_components,
                    bool is_64)
629 {
630 uint32_t usage_mask =
631 u_bit_consecutive(start_component, num_components);
632
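   /* Each 64-bit component occupies a pair of 32-bit channels, so expand the
    * mask: X -> XY and Y -> ZW (components 2/3 are first shifted down to 0/1).
    */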
633 if (is_64) {
634 if (start_component >= 2)
635 usage_mask >>= 2;
636
637 uint32_t tgsi_usage_mask = 0;
638
639 if (usage_mask & TGSI_WRITEMASK_X)
640 tgsi_usage_mask |= TGSI_WRITEMASK_XY;
641 if (usage_mask & TGSI_WRITEMASK_Y)
642 tgsi_usage_mask |= TGSI_WRITEMASK_ZW;
643
644 return tgsi_usage_mask;
645 } else {
646 return usage_mask;
647 }
648 }
649
650 /* TGSI varying declarations have a component usage mask associated (used by
651 * r600 and svga).
652 */
653 static uint32_t
ntt_tgsi_var_usage_mask(const struct nir_variable *var)
655 {
656 const struct glsl_type *type_without_array =
657 glsl_without_array(var->type);
658 unsigned num_components = glsl_get_vector_elements(type_without_array);
659 if (num_components == 0) /* structs */
660 num_components = 4;
661
662 return ntt_tgsi_usage_mask(var->data.location_frac, num_components,
663 glsl_type_is_64bit(type_without_array));
664 }
665
666 static struct ureg_dst
ntt_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
668 {
669 nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
670 int base = nir_intrinsic_base(instr);
671 *frac = nir_intrinsic_component(instr);
672 bool is_64 = nir_src_bit_size(instr->src[0]) == 64;
673
674 struct ureg_dst out;
675 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
676 unsigned semantic_name, semantic_index;
677 tgsi_get_gl_frag_result_semantic(semantics.location,
678 &semantic_name, &semantic_index);
679 semantic_index += semantics.dual_source_blend_index;
680
681 switch (semantics.location) {
682 case FRAG_RESULT_DEPTH:
         *frac = 2; /* the z write goes to the .z channel in TGSI */
684 break;
685 case FRAG_RESULT_STENCIL:
686 *frac = 1;
687 break;
688 default:
689 break;
690 }
691
692 out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
693 } else {
694 unsigned semantic_name, semantic_index;
695
696 ntt_get_gl_varying_semantic(c, semantics.location,
697 &semantic_name, &semantic_index);
698
699 uint32_t usage_mask = ntt_tgsi_usage_mask(*frac,
700 instr->num_components,
701 is_64);
702 uint32_t gs_streams = semantics.gs_streams;
703 for (int i = 0; i < 4; i++) {
704 if (!(usage_mask & (1 << i)))
705 gs_streams &= ~(0x3 << 2 * i);
706 }
707
708 /* No driver appears to use array_id of outputs. */
709 unsigned array_id = 0;
710
711 /* This bit is lost in the i/o semantics, but it's unused in in-tree
712 * drivers.
713 */
714 bool invariant = semantics.invariant;
715
716 out = ureg_DECL_output_layout(c->ureg,
717 semantic_name, semantic_index,
718 gs_streams,
719 base,
720 usage_mask,
721 array_id,
722 semantics.num_slots,
723 invariant);
724 }
725
726 unsigned write_mask;
727 if (nir_intrinsic_has_write_mask(instr))
728 write_mask = nir_intrinsic_write_mask(instr);
729 else
730 write_mask = ((1 << instr->num_components) - 1) << *frac;
731
732 if (is_64) {
733 write_mask = ntt_64bit_write_mask(write_mask);
734 if (*frac >= 2)
735 write_mask = write_mask << 2;
736 } else {
737 write_mask = write_mask << *frac;
738 }
739 return ureg_writemask(out, write_mask);
740 }
741
742 /* If this reg or SSA def is used only for storing an output, then in the simple
743 * cases we can write directly to the TGSI output instead of having store_output
744 * emit its own MOV.
745 */
746 static bool
ntt_try_store_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
                             struct list_head *uses, struct list_head *if_uses)
749 {
750 *dst = ureg_dst_undef();
751
752 switch (c->s->info.stage) {
753 case MESA_SHADER_FRAGMENT:
754 case MESA_SHADER_VERTEX:
755 break;
756 default:
757 /* tgsi_exec (at least) requires that output stores happen per vertex
758 * emitted, you don't get to reuse a previous output value for the next
759 * vertex.
760 */
761 return false;
762 }
763
764 if (!list_is_empty(if_uses) || !list_is_singular(uses))
765 return false;
766
767 nir_src *src = list_first_entry(uses, nir_src, use_link);
768
769 if (src->parent_instr->type != nir_instr_type_intrinsic)
770 return false;
771
772 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(src->parent_instr);
773 if (intr->intrinsic != nir_intrinsic_store_output ||
774 !nir_src_is_const(intr->src[1])) {
775 return false;
776 }
777
778 uint32_t frac;
779 *dst = ntt_output_decl(c, intr, &frac);
780 dst->Index += ntt_src_as_uint(c, intr->src[1]);
781
782 return frac == 0;
783 }
784
785 static void
ntt_setup_inputs(struct ntt_compile *c)
787 {
788 if (c->s->info.stage != MESA_SHADER_FRAGMENT)
789 return;
790
791 unsigned num_inputs = 0;
792 int num_input_arrays = 0;
793
794 nir_foreach_shader_in_variable(var, c->s) {
795 const struct glsl_type *type = ntt_shader_input_type(c, var);
796 unsigned array_len =
797 glsl_count_attribute_slots(type, false);
798
799 num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
800 }
801
802 c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);
803
804 nir_foreach_shader_in_variable(var, c->s) {
805 const struct glsl_type *type = ntt_shader_input_type(c, var);
806 unsigned array_len =
807 glsl_count_attribute_slots(type, false);
808
809 unsigned interpolation = TGSI_INTERPOLATE_CONSTANT;
810 unsigned sample_loc;
811 struct ureg_src decl;
812
813 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
814 interpolation =
815 tgsi_get_interp_mode(var->data.interpolation,
816 var->data.location == VARYING_SLOT_COL0 ||
817 var->data.location == VARYING_SLOT_COL1);
818
819 if (var->data.location == VARYING_SLOT_POS)
820 interpolation = TGSI_INTERPOLATE_LINEAR;
821 }
822
823 unsigned semantic_name, semantic_index;
824 ntt_get_gl_varying_semantic(c, var->data.location,
825 &semantic_name, &semantic_index);
826
827 if (var->data.sample) {
828 sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
829 } else if (var->data.centroid) {
830 sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
831 c->centroid_inputs |= (BITSET_MASK(array_len) <<
832 var->data.driver_location);
833 } else {
834 sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
835 }
836
837 unsigned array_id = 0;
838 if (glsl_type_is_array(type))
839 array_id = ++num_input_arrays;
840
841 uint32_t usage_mask = ntt_tgsi_var_usage_mask(var);
842
843 decl = ureg_DECL_fs_input_centroid_layout(c->ureg,
844 semantic_name,
845 semantic_index,
846 interpolation,
847 sample_loc,
848 var->data.driver_location,
849 usage_mask,
850 array_id, array_len);
851
852 if (semantic_name == TGSI_SEMANTIC_FACE) {
853 struct ureg_dst temp = ntt_temp(c);
854 if (c->native_integers) {
855 /* NIR is ~0 front and 0 back, while TGSI is +1 front */
856 ntt_SGE(c, temp, decl, ureg_imm1f(c->ureg, 0));
857 } else {
858 /* tgsi docs say that floating point FACE will be positive for
859 * frontface and negative for backface, but realistically
860 * GLSL-to-TGSI had been doing MOV_SAT to turn it into 0.0 vs 1.0.
861 * Copy that behavior, since some drivers (r300) have been doing a
862 * 0.0 vs 1.0 backface (and I don't think anybody has a non-1.0
863 * front face).
864 */
865 temp.Saturate = true;
866 ntt_MOV(c, temp, decl);
867
868 }
869 decl = ureg_src(temp);
870 }
871
872 for (unsigned i = 0; i < array_len; i++) {
873 c->input_index_map[var->data.driver_location + i] = decl;
874 c->input_index_map[var->data.driver_location + i].Index += i;
875 }
876 }
877 }
878
879 static int
ntt_sort_by_location(const nir_variable *a, const nir_variable *b)
881 {
882 return a->data.location - b->data.location;
883 }
884
885 /**
886 * Workaround for virglrenderer requiring that TGSI FS output color variables
887 * are declared in order. Besides, it's a lot nicer to read the TGSI this way.
888 */
889 static void
ntt_setup_outputs(struct ntt_compile *c)
891 {
892 if (c->s->info.stage != MESA_SHADER_FRAGMENT)
893 return;
894
895 nir_sort_variables_with_modes(c->s, ntt_sort_by_location, nir_var_shader_out);
896
897 nir_foreach_shader_out_variable(var, c->s) {
898 if (var->data.location == FRAG_RESULT_COLOR)
899 ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);
900
901 unsigned semantic_name, semantic_index;
902 tgsi_get_gl_frag_result_semantic(var->data.location,
903 &semantic_name, &semantic_index);
904
905 (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index);
906 }
907 }
908
909 static enum tgsi_texture_type
tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array, bool is_shadow)
911 {
912 switch (dim) {
913 case GLSL_SAMPLER_DIM_1D:
914 if (is_shadow)
915 return is_array ? TGSI_TEXTURE_SHADOW1D_ARRAY : TGSI_TEXTURE_SHADOW1D;
916 else
917 return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
918 case GLSL_SAMPLER_DIM_2D:
919 case GLSL_SAMPLER_DIM_EXTERNAL:
920 if (is_shadow)
921 return is_array ? TGSI_TEXTURE_SHADOW2D_ARRAY : TGSI_TEXTURE_SHADOW2D;
922 else
923 return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
924 case GLSL_SAMPLER_DIM_3D:
925 return TGSI_TEXTURE_3D;
926 case GLSL_SAMPLER_DIM_CUBE:
927 if (is_shadow)
928 return is_array ? TGSI_TEXTURE_SHADOWCUBE_ARRAY : TGSI_TEXTURE_SHADOWCUBE;
929 else
930 return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
931 case GLSL_SAMPLER_DIM_RECT:
932 if (is_shadow)
933 return TGSI_TEXTURE_SHADOWRECT;
934 else
935 return TGSI_TEXTURE_RECT;
936 case GLSL_SAMPLER_DIM_MS:
937 return is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
938 case GLSL_SAMPLER_DIM_BUF:
939 return TGSI_TEXTURE_BUFFER;
940 default:
941 unreachable("unknown sampler dim");
942 }
943 }
944
945 static enum tgsi_return_type
tgsi_return_type_from_base_type(enum glsl_base_type type)
947 {
948 switch (type) {
949 case GLSL_TYPE_INT:
950 return TGSI_RETURN_TYPE_SINT;
951 case GLSL_TYPE_UINT:
952 return TGSI_RETURN_TYPE_UINT;
953 case GLSL_TYPE_FLOAT:
954 return TGSI_RETURN_TYPE_FLOAT;
955 default:
956 unreachable("unexpected texture type");
957 }
958 }
959
960 static void
ntt_setup_uniforms(struct ntt_compile *c)
962 {
963 nir_foreach_uniform_variable(var, c->s) {
964 if (glsl_type_is_sampler(glsl_without_array(var->type)) ||
965 glsl_type_is_texture(glsl_without_array(var->type))) {
966 /* Don't use this size for the check for samplers -- arrays of structs
967 * containing samplers should be ignored, and just the separate lowered
968 * sampler uniform decl used.
969 */
970 int size = glsl_type_get_sampler_count(var->type) +
971 glsl_type_get_texture_count(var->type);
972
973 const struct glsl_type *stype = glsl_without_array(var->type);
974 enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(stype),
975 glsl_sampler_type_is_array(stype),
976 glsl_sampler_type_is_shadow(stype));
977 enum tgsi_return_type ret_type = tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype));
978 for (int i = 0; i < size; i++) {
979 ureg_DECL_sampler_view(c->ureg, var->data.binding + i,
980 target, ret_type, ret_type, ret_type, ret_type);
981 ureg_DECL_sampler(c->ureg, var->data.binding + i);
982 }
983 } else if (glsl_contains_atomic(var->type)) {
984 uint32_t offset = var->data.offset / 4;
985 uint32_t size = glsl_atomic_size(var->type) / 4;
986 ureg_DECL_hw_atomic(c->ureg, offset, offset + size - 1, var->data.binding, 0);
987 }
988
989 /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0
990 * size declaration happens with other UBOs below.
991 */
992 }
993
994 nir_foreach_image_variable(var, c->s) {
995 int image_count = glsl_type_get_image_count(var->type);
996 const struct glsl_type *itype = glsl_without_array(var->type);
997 enum tgsi_texture_type tex_type =
998 tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(itype),
999 glsl_sampler_type_is_array(itype), false);
1000
1001 for (int i = 0; i < image_count; i++) {
1002 c->images[var->data.binding] = ureg_DECL_image(c->ureg,
1003 var->data.binding + i,
1004 tex_type,
1005 var->data.image.format,
1006 !(var->data.access & ACCESS_NON_WRITEABLE),
1007 false);
1008 }
1009 }
1010
1011 c->first_ubo = ~0;
1012
1013 unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0};
1014 nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) {
1015 int ubo = var->data.driver_location;
1016 if (ubo == -1)
1017 continue;
1018
1019 if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo))
1020 c->first_ubo = MIN2(c->first_ubo, ubo);
1021
1022 unsigned size = glsl_get_explicit_size(var->interface_type, false);
1023
1024 int array_size = 1;
1025 if (glsl_type_is_interface(glsl_without_array(var->type)))
1026 array_size = MAX2(1, glsl_get_aoa_size(var->type));
1027
1028 for (int i = 0; i < array_size; i++) {
1029 /* Even if multiple NIR variables are in the same uniform block, their
1030 * explicit size is the size of the block.
1031 */
1032 if (ubo_sizes[ubo + i])
1033 assert(ubo_sizes[ubo + i] == size);
1034
1035 ubo_sizes[ubo + i] = size;
1036 }
1037 }
1038
1039 for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) {
1040 if (ubo_sizes[i])
1041 ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i);
1042 }
1043
1044 for (int i = 0; i < c->s->info.num_ssbos; i++) {
1045 /* XXX: nv50 uses the atomic flag to set caching for (lowered) atomic
1046 * counters
1047 */
1048 bool atomic = false;
1049 ureg_DECL_buffer(c->ureg, i, atomic);
1050 }
1051 }
1052
1053 static void
ntt_setup_registers(struct ntt_compile *c, struct exec_list *list)
1055 {
1056 assert(c->num_temps == 0);
1057 /* Permanently allocate all the array regs at the start. */
1058 foreach_list_typed(nir_register, nir_reg, node, list) {
1059 if (nir_reg->num_array_elems != 0) {
1060 struct ureg_dst decl = ureg_DECL_array_temporary(c->ureg, nir_reg->num_array_elems, true);
1061 c->reg_temp[nir_reg->index] = decl;
1062 assert(c->num_temps == decl.Index);
1063 c->num_temps += nir_reg->num_array_elems;
1064 }
1065 }
1066 c->first_non_array_temp = c->num_temps;
1067
1068 /* After that, allocate non-array regs in our virtual space that we'll
1069 * register-allocate before ureg emit.
1070 */
1071 foreach_list_typed(nir_register, nir_reg, node, list) {
1072 if (nir_reg->num_array_elems == 0) {
1073 struct ureg_dst decl;
1074 uint32_t write_mask = BITFIELD_MASK(nir_reg->num_components);
1075 if (!ntt_try_store_in_tgsi_output(c, &decl, &nir_reg->uses, &nir_reg->if_uses)) {
1076 if (nir_reg->bit_size == 64) {
1077 if (nir_reg->num_components > 2) {
1078 fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n",
1079 nir_reg->num_components, nir_reg->index);
1080 }
1081
1082 write_mask = ntt_64bit_write_mask(write_mask);
1083 }
1084
1085 decl = ureg_writemask(ntt_temp(c), write_mask);
1086 }
1087 c->reg_temp[nir_reg->index] = decl;
1088 }
1089 }
1090 }
1091
1092 static struct ureg_src
ntt_get_load_const_src(struct ntt_compile *c, nir_load_const_instr *instr)
1094 {
1095 int num_components = instr->def.num_components;
1096
1097 if (!c->native_integers) {
1098 float values[4];
1099 assert(instr->def.bit_size == 32);
1100 for (int i = 0; i < num_components; i++)
1101 values[i] = uif(instr->value[i].u32);
1102
1103 return ureg_DECL_immediate(c->ureg, values, num_components);
1104 } else {
1105 uint32_t values[4];
1106
1107 if (instr->def.bit_size == 32) {
1108 for (int i = 0; i < num_components; i++)
1109 values[i] = instr->value[i].u32;
1110 } else {
1111 assert(num_components <= 2);
1112 for (int i = 0; i < num_components; i++) {
1113 values[i * 2 + 0] = instr->value[i].u64 & 0xffffffff;
1114 values[i * 2 + 1] = instr->value[i].u64 >> 32;
1115 }
1116 num_components *= 2;
1117 }
1118
1119 return ureg_DECL_immediate_uint(c->ureg, values, num_components);
1120 }
1121 }
1122
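/* Loads an index into a TGSI ADDR register for indirect addressing, declaring
 * ADDR registers up to addr_index on first use. UARL is used on
 * native-integer drivers, ARL (float-to-int) otherwise.
 */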
1123 static struct ureg_src
ntt_reladdr(struct ntt_compile *c, struct ureg_src addr, int addr_index)
1125 {
1126 assert(addr_index < ARRAY_SIZE(c->addr_reg));
1127
1128 for (int i = 0; i <= addr_index; i++) {
1129 if (!c->addr_declared[i]) {
1130 c->addr_reg[i] = ureg_writemask(ureg_DECL_address(c->ureg),
1131 TGSI_WRITEMASK_X);
1132 c->addr_declared[i] = true;
1133 }
1134 }
1135
1136 if (c->native_integers)
1137 ntt_UARL(c, c->addr_reg[addr_index], addr);
1138 else
1139 ntt_ARL(c, c->addr_reg[addr_index], addr);
1140 return ureg_scalar(ureg_src(c->addr_reg[addr_index]), 0);
1141 }
1142
1143 static struct ureg_src
ntt_get_src(struct ntt_compile *c, nir_src src)
1145 {
1146 if (src.is_ssa) {
1147 if (src.ssa->parent_instr->type == nir_instr_type_load_const)
1148 return ntt_get_load_const_src(c, nir_instr_as_load_const(src.ssa->parent_instr));
1149
1150 return c->ssa_temp[src.ssa->index];
1151 } else {
1152 nir_register *reg = src.reg.reg;
1153 struct ureg_dst reg_temp = c->reg_temp[reg->index];
1154 reg_temp.Index += src.reg.base_offset;
1155
1156 if (src.reg.indirect) {
1157 struct ureg_src offset = ntt_get_src(c, *src.reg.indirect);
1158 return ureg_src_indirect(ureg_src(reg_temp),
1159 ntt_reladdr(c, offset, 0));
1160 } else {
1161 return ureg_src(reg_temp);
1162 }
1163 }
1164 }
1165
1166 static struct ureg_src
ntt_get_alu_src(struct ntt_compile *c, nir_alu_instr *instr, int i)
1168 {
1169 nir_alu_src src = instr->src[i];
1170 struct ureg_src usrc = ntt_get_src(c, src.src);
1171
1172 /* Expand double/dvec2 src references to TGSI swizzles using a pair of 32-bit
1173 * channels. We skip this for undefs, as those don't get split to vec2s (but
1174 * the specific swizzles from an undef don't matter)
1175 */
1176 if (nir_src_bit_size(src.src) == 64 &&
1177 !(src.src.is_ssa && src.src.ssa->parent_instr->type == nir_instr_type_ssa_undef)) {
1178 int chan0 = 0, chan1 = 1;
1179 if (nir_op_infos[instr->op].input_sizes[i] == 0) {
1180 chan0 = ffs(instr->dest.write_mask) - 1;
1181 chan1 = ffs(instr->dest.write_mask & ~(1 << chan0)) - 1;
1182 if (chan1 == -1)
1183 chan1 = chan0;
1184 }
1185 usrc = ureg_swizzle(usrc,
1186 src.swizzle[chan0] * 2,
1187 src.swizzle[chan0] * 2 + 1,
1188 src.swizzle[chan1] * 2,
1189 src.swizzle[chan1] * 2 + 1);
1190 } else {
1191 usrc = ureg_swizzle(usrc,
1192 src.swizzle[0],
1193 src.swizzle[1],
1194 src.swizzle[2],
1195 src.swizzle[3]);
1196 }
1197
1198 if (src.abs)
1199 usrc = ureg_abs(usrc);
1200 if (src.negate)
1201 usrc = ureg_negate(usrc);
1202
1203 return usrc;
1204 }
1205
1206 /* Reswizzles a source so that the unset channels in the write mask still refer
1207 * to one of the channels present in the write mask.
1208 */
1209 static struct ureg_src
ntt_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
1211 {
1212 assert(write_mask);
1213 int first_chan = ffs(write_mask) - 1;
1214 return ureg_swizzle(src,
1215 (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : first_chan,
1216 (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : first_chan,
1217 (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : first_chan,
1218 (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : first_chan);
1219 }
1220
1221 static struct ureg_dst
ntt_get_ssa_def_decl(struct ntt_compile *c, nir_ssa_def *ssa)
1223 {
1224 uint32_t writemask = BITSET_MASK(ssa->num_components);
1225 if (ssa->bit_size == 64)
1226 writemask = ntt_64bit_write_mask(writemask);
1227
1228 struct ureg_dst dst;
1229 if (!ntt_try_store_in_tgsi_output(c, &dst, &ssa->uses, &ssa->if_uses))
1230 dst = ntt_temp(c);
1231
1232 c->ssa_temp[ssa->index] = ntt_swizzle_for_write_mask(ureg_src(dst), writemask);
1233
1234 return ureg_writemask(dst, writemask);
1235 }
1236
1237 static struct ureg_dst
ntt_get_dest_decl(struct ntt_compile *c, nir_dest *dest)
1239 {
1240 if (dest->is_ssa)
1241 return ntt_get_ssa_def_decl(c, &dest->ssa);
1242 else
1243 return c->reg_temp[dest->reg.reg->index];
1244 }
1245
1246 static struct ureg_dst
ntt_get_dest(struct ntt_compile *c, nir_dest *dest)
1248 {
1249 struct ureg_dst dst = ntt_get_dest_decl(c, dest);
1250
1251 if (!dest->is_ssa) {
1252 dst.Index += dest->reg.base_offset;
1253
1254 if (dest->reg.indirect) {
1255 struct ureg_src offset = ntt_get_src(c, *dest->reg.indirect);
1256 dst = ureg_dst_indirect(dst, ntt_reladdr(c, offset, 0));
1257 }
1258 }
1259
1260 return dst;
1261 }
1262
1263 /* For an SSA dest being populated by a constant src, replace the storage with
1264 * a copy of the ureg_src.
1265 */
1266 static void
ntt_store_def(struct ntt_compile *c, nir_ssa_def *def, struct ureg_src src)
1268 {
1269 if (!src.Indirect && !src.DimIndirect) {
1270 switch (src.File) {
1271 case TGSI_FILE_IMMEDIATE:
1272 case TGSI_FILE_INPUT:
1273 case TGSI_FILE_CONSTANT:
1274 case TGSI_FILE_SYSTEM_VALUE:
1275 c->ssa_temp[def->index] = src;
1276 return;
1277 }
1278 }
1279
1280 ntt_MOV(c, ntt_get_ssa_def_decl(c, def), src);
1281 }
1282
1283 static void
ntt_store(struct ntt_compile *c, nir_dest *dest, struct ureg_src src)
1285 {
1286 if (dest->is_ssa)
1287 ntt_store_def(c, &dest->ssa, src);
1288 else {
1289 struct ureg_dst dst = ntt_get_dest(c, dest);
1290 ntt_MOV(c, dst, src);
1291 }
1292 }
1293
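/* Emits a scalar TGSI opcode once per enabled channel of dst, sourcing the
 * matching channel of src0 (and of src1 for the two-operand POW).
 */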
1294 static void
ntt_emit_scalar(struct ntt_compile *c, unsigned tgsi_op,
                struct ureg_dst dst,
                struct ureg_src src0,
                struct ureg_src src1)
1299 {
1300 unsigned i;
1301
1302 /* POW is the only 2-operand scalar op. */
1303 if (tgsi_op != TGSI_OPCODE_POW)
1304 src1 = src0;
1305
1306 for (i = 0; i < 4; i++) {
1307 if (dst.WriteMask & (1 << i)) {
1308 ntt_insn(c, tgsi_op,
1309 ureg_writemask(dst, 1 << i),
1310 ureg_scalar(src0, i),
1311 ureg_scalar(src1, i),
1312 ureg_src_undef(), ureg_src_undef());
1313 }
1314 }
1315 }
1316
1317 static void
ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
1319 {
1320 struct ureg_src src[4];
1321 struct ureg_dst dst;
1322 unsigned i;
1323 int dst_64 = nir_dest_bit_size(instr->dest.dest) == 64;
1324 int src_64 = nir_src_bit_size(instr->src[0].src) == 64;
1325 int num_srcs = nir_op_infos[instr->op].num_inputs;
1326
1327 c->precise = instr->exact;
1328
1329 assert(num_srcs <= ARRAY_SIZE(src));
1330 for (i = 0; i < num_srcs; i++)
1331 src[i] = ntt_get_alu_src(c, instr, i);
1332 for (; i < ARRAY_SIZE(src); i++)
1333 src[i] = ureg_src_undef();
1334
1335 dst = ntt_get_dest(c, &instr->dest.dest);
1336
1337 if (instr->dest.saturate)
1338 dst.Saturate = true;
1339
1340 if (dst_64)
1341 dst = ureg_writemask(dst, ntt_64bit_write_mask(instr->dest.write_mask));
1342 else
1343 dst = ureg_writemask(dst, instr->dest.write_mask);
1344
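   /* Direct NIR-to-TGSI opcode mappings, indexed by [nir op][src is 64-bit].
    * A zero entry means there's no direct mapping and the op is handled in the
    * switch statement below.
    */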
1345 static enum tgsi_opcode op_map[][2] = {
1346 [nir_op_mov] = { TGSI_OPCODE_MOV, TGSI_OPCODE_MOV },
1347
1348 /* fabs/fneg 32-bit are special-cased below. */
1349 [nir_op_fabs] = { 0, TGSI_OPCODE_DABS },
1350 [nir_op_fneg] = { 0, TGSI_OPCODE_DNEG },
1351
1352 [nir_op_fdot2] = { TGSI_OPCODE_DP2 },
1353 [nir_op_fdot3] = { TGSI_OPCODE_DP3 },
1354 [nir_op_fdot4] = { TGSI_OPCODE_DP4 },
1355 [nir_op_fdot2_replicated] = { TGSI_OPCODE_DP2 },
1356 [nir_op_fdot3_replicated] = { TGSI_OPCODE_DP3 },
1357 [nir_op_fdot4_replicated] = { TGSI_OPCODE_DP4 },
1358 [nir_op_ffloor] = { TGSI_OPCODE_FLR, TGSI_OPCODE_DFLR },
1359 [nir_op_ffract] = { TGSI_OPCODE_FRC, TGSI_OPCODE_DFRAC },
1360 [nir_op_fceil] = { TGSI_OPCODE_CEIL, TGSI_OPCODE_DCEIL },
1361 [nir_op_fround_even] = { TGSI_OPCODE_ROUND, TGSI_OPCODE_DROUND },
1362 [nir_op_fdiv] = { TGSI_OPCODE_DIV, TGSI_OPCODE_DDIV },
1363 [nir_op_idiv] = { TGSI_OPCODE_IDIV, TGSI_OPCODE_I64DIV },
1364 [nir_op_udiv] = { TGSI_OPCODE_UDIV, TGSI_OPCODE_U64DIV },
1365
1366 [nir_op_frcp] = { 0, TGSI_OPCODE_DRCP },
1367 [nir_op_frsq] = { 0, TGSI_OPCODE_DRSQ },
1368 [nir_op_fsqrt] = { 0, TGSI_OPCODE_DSQRT },
1369
1370 /* The conversions will have one combination of src and dst bitsize. */
1371 [nir_op_f2f32] = { 0, TGSI_OPCODE_D2F },
1372 [nir_op_f2f64] = { TGSI_OPCODE_F2D },
1373 [nir_op_i2i64] = { TGSI_OPCODE_I2I64 },
1374
1375 [nir_op_f2i32] = { TGSI_OPCODE_F2I, TGSI_OPCODE_D2I },
1376 [nir_op_f2i64] = { TGSI_OPCODE_F2I64, TGSI_OPCODE_D2I64 },
1377 [nir_op_f2u32] = { TGSI_OPCODE_F2U, TGSI_OPCODE_D2U },
1378 [nir_op_f2u64] = { TGSI_OPCODE_F2U64, TGSI_OPCODE_D2U64 },
1379 [nir_op_i2f32] = { TGSI_OPCODE_I2F, TGSI_OPCODE_I642F },
1380 [nir_op_i2f64] = { TGSI_OPCODE_I2D, TGSI_OPCODE_I642D },
1381 [nir_op_u2f32] = { TGSI_OPCODE_U2F, TGSI_OPCODE_U642F },
1382 [nir_op_u2f64] = { TGSI_OPCODE_U2D, TGSI_OPCODE_U642D },
1383
1384 [nir_op_slt] = { TGSI_OPCODE_SLT },
1385 [nir_op_sge] = { TGSI_OPCODE_SGE },
1386 [nir_op_seq] = { TGSI_OPCODE_SEQ },
1387 [nir_op_sne] = { TGSI_OPCODE_SNE },
1388
1389 [nir_op_flt32] = { TGSI_OPCODE_FSLT, TGSI_OPCODE_DSLT },
1390 [nir_op_fge32] = { TGSI_OPCODE_FSGE, TGSI_OPCODE_DSGE },
1391 [nir_op_feq32] = { TGSI_OPCODE_FSEQ, TGSI_OPCODE_DSEQ },
1392 [nir_op_fneu32] = { TGSI_OPCODE_FSNE, TGSI_OPCODE_DSNE },
1393
1394 [nir_op_ilt32] = { TGSI_OPCODE_ISLT, TGSI_OPCODE_I64SLT },
1395 [nir_op_ige32] = { TGSI_OPCODE_ISGE, TGSI_OPCODE_I64SGE },
1396 [nir_op_ieq32] = { TGSI_OPCODE_USEQ, TGSI_OPCODE_U64SEQ },
1397 [nir_op_ine32] = { TGSI_OPCODE_USNE, TGSI_OPCODE_U64SNE },
1398
1399 [nir_op_ult32] = { TGSI_OPCODE_USLT, TGSI_OPCODE_U64SLT },
1400 [nir_op_uge32] = { TGSI_OPCODE_USGE, TGSI_OPCODE_U64SGE },
1401
1402 [nir_op_iabs] = { TGSI_OPCODE_IABS, TGSI_OPCODE_I64ABS },
1403 [nir_op_ineg] = { TGSI_OPCODE_INEG, TGSI_OPCODE_I64NEG },
1404 [nir_op_fsign] = { TGSI_OPCODE_SSG },
1405 [nir_op_isign] = { TGSI_OPCODE_ISSG },
1406 [nir_op_ftrunc] = { TGSI_OPCODE_TRUNC, TGSI_OPCODE_DTRUNC },
1407 [nir_op_fddx] = { TGSI_OPCODE_DDX },
1408 [nir_op_fddy] = { TGSI_OPCODE_DDY },
1409 [nir_op_fddx_coarse] = { TGSI_OPCODE_DDX },
1410 [nir_op_fddy_coarse] = { TGSI_OPCODE_DDY },
1411 [nir_op_fddx_fine] = { TGSI_OPCODE_DDX_FINE },
1412 [nir_op_fddy_fine] = { TGSI_OPCODE_DDY_FINE },
1413 [nir_op_pack_half_2x16] = { TGSI_OPCODE_PK2H },
1414 [nir_op_unpack_half_2x16] = { TGSI_OPCODE_UP2H },
1415 [nir_op_ibitfield_extract] = { TGSI_OPCODE_IBFE },
1416 [nir_op_ubitfield_extract] = { TGSI_OPCODE_UBFE },
1417 [nir_op_bitfield_insert] = { TGSI_OPCODE_BFI },
1418 [nir_op_bitfield_reverse] = { TGSI_OPCODE_BREV },
1419 [nir_op_bit_count] = { TGSI_OPCODE_POPC },
1420 [nir_op_ifind_msb] = { TGSI_OPCODE_IMSB },
1421 [nir_op_ufind_msb] = { TGSI_OPCODE_UMSB },
1422 [nir_op_find_lsb] = { TGSI_OPCODE_LSB },
1423 [nir_op_fadd] = { TGSI_OPCODE_ADD, TGSI_OPCODE_DADD },
1424 [nir_op_iadd] = { TGSI_OPCODE_UADD, TGSI_OPCODE_U64ADD },
1425 [nir_op_fmul] = { TGSI_OPCODE_MUL, TGSI_OPCODE_DMUL },
1426 [nir_op_imul] = { TGSI_OPCODE_UMUL, TGSI_OPCODE_U64MUL },
1427 [nir_op_imod] = { TGSI_OPCODE_MOD, TGSI_OPCODE_I64MOD },
1428 [nir_op_umod] = { TGSI_OPCODE_UMOD, TGSI_OPCODE_U64MOD },
1429 [nir_op_imul_high] = { TGSI_OPCODE_IMUL_HI },
1430 [nir_op_umul_high] = { TGSI_OPCODE_UMUL_HI },
1431 [nir_op_ishl] = { TGSI_OPCODE_SHL, TGSI_OPCODE_U64SHL },
1432 [nir_op_ishr] = { TGSI_OPCODE_ISHR, TGSI_OPCODE_I64SHR },
1433 [nir_op_ushr] = { TGSI_OPCODE_USHR, TGSI_OPCODE_U64SHR },
1434
1435 /* These bitwise ops don't care about 32 vs 64 types, so they have the
1436 * same TGSI op.
1437 */
1438 [nir_op_inot] = { TGSI_OPCODE_NOT, TGSI_OPCODE_NOT },
1439 [nir_op_iand] = { TGSI_OPCODE_AND, TGSI_OPCODE_AND },
1440 [nir_op_ior] = { TGSI_OPCODE_OR, TGSI_OPCODE_OR },
1441 [nir_op_ixor] = { TGSI_OPCODE_XOR, TGSI_OPCODE_XOR },
1442
1443 [nir_op_fmin] = { TGSI_OPCODE_MIN, TGSI_OPCODE_DMIN },
1444 [nir_op_imin] = { TGSI_OPCODE_IMIN, TGSI_OPCODE_I64MIN },
1445 [nir_op_umin] = { TGSI_OPCODE_UMIN, TGSI_OPCODE_U64MIN },
1446 [nir_op_fmax] = { TGSI_OPCODE_MAX, TGSI_OPCODE_DMAX },
1447 [nir_op_imax] = { TGSI_OPCODE_IMAX, TGSI_OPCODE_I64MAX },
1448 [nir_op_umax] = { TGSI_OPCODE_UMAX, TGSI_OPCODE_U64MAX },
1449 [nir_op_ffma] = { TGSI_OPCODE_MAD, TGSI_OPCODE_DMAD },
1450 [nir_op_ldexp] = { TGSI_OPCODE_LDEXP, 0 },
1451 };
1452
1453 if (src_64 && !dst_64) {
1454 if (num_srcs == 2 || nir_op_infos[instr->op].output_type == nir_type_bool32) {
1455 /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz instead
1456 * of .xy.
1457 */
1458 assert(!(dst.WriteMask & TGSI_WRITEMASK_YW));
1459 } else {
1460 /* TGSI 64bit-to-32-bit conversions only generate results in the .xy
1461 * channels and will need to get fixed up.
1462 */
1463 assert(!(dst.WriteMask & TGSI_WRITEMASK_ZW));
1464 }
1465 }
1466
1467 bool table_op64 = src_64;
1468 if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op][table_op64] != 0) {
1469 /* The normal path for NIR to TGSI ALU op translation */
1470 ntt_insn(c, op_map[instr->op][table_op64],
1471 dst, src[0], src[1], src[2], src[3]);
1472 } else {
1473 /* Special cases for NIR to TGSI ALU op translation. */
1474
1475 /* TODO: Use something like the ntt_store() path for the MOV calls so we
1476 * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
1477 */
1478
1479 switch (instr->op) {
1480 case nir_op_u2u64:
1481 ntt_AND(c, dst, ureg_swizzle(src[0],
1482 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1483 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1484 ureg_imm4u(c->ureg, ~0, 0, ~0, 0));
1485 break;
1486
1487 case nir_op_i2i32:
1488 case nir_op_u2u32:
1489 assert(src_64);
1490 ntt_MOV(c, dst, ureg_swizzle(src[0],
1491 TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
1492 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X));
1493 break;
1494
1495 case nir_op_fabs:
1496 if (c->options->lower_fabs)
1497 ntt_MAX(c, dst, src[0], ureg_negate(src[0]));
1498 else
1499 ntt_MOV(c, dst, ureg_abs(src[0]));
1500 break;
1501
1502 case nir_op_fsat:
1503 if (dst_64) {
1504 ntt_MIN(c, dst, src[0], ntt_64bit_1f(c));
1505 ntt_MAX(c, dst, ureg_src(dst), ureg_imm1u(c->ureg, 0));
1506 } else {
1507 ntt_MOV(c, ureg_saturate(dst), src[0]);
1508 }
1509 break;
1510
1511 case nir_op_fneg:
1512 ntt_MOV(c, dst, ureg_negate(src[0]));
1513 break;
1514
1515 /* NOTE: TGSI 32-bit math ops have the old "one source channel
1516 * replicated to all dst channels" behavior, while 64 is normal mapping
1517 * of src channels to dst.
1518 */
1519 case nir_op_frcp:
1520 assert(!dst_64);
1521 ntt_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], ureg_src_undef());
1522 break;
1523
1524 case nir_op_frsq:
1525 assert(!dst_64);
1526 ntt_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], ureg_src_undef());
1527 break;
1528
1529 case nir_op_fsqrt:
1530 assert(!dst_64);
1531 ntt_emit_scalar(c, TGSI_OPCODE_SQRT, dst, src[0], ureg_src_undef());
1532 break;
1533
1534 case nir_op_fexp2:
1535 assert(!dst_64);
1536 ntt_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], ureg_src_undef());
1537 break;
1538
1539 case nir_op_flog2:
1540 assert(!dst_64);
1541 ntt_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], ureg_src_undef());
1542 break;
1543
1544 case nir_op_b2f32:
1545 ntt_AND(c, dst, src[0], ureg_imm1f(c->ureg, 1.0));
1546 break;
1547
1548 case nir_op_b2f64:
1549 ntt_AND(c, dst,
1550 ureg_swizzle(src[0],
1551 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1552 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1553 ntt_64bit_1f(c));
1554 break;
1555
1556 case nir_op_f2b32:
1557 if (src_64)
1558 ntt_DSNE(c, dst, src[0], ureg_imm1f(c->ureg, 0));
1559 else
1560 ntt_FSNE(c, dst, src[0], ureg_imm1f(c->ureg, 0));
1561 break;
1562
1563 case nir_op_i2b32:
1564 if (src_64) {
1565 ntt_U64SNE(c, dst, src[0], ureg_imm1u(c->ureg, 0));
1566 } else
1567 ntt_USNE(c, dst, src[0], ureg_imm1u(c->ureg, 0));
1568 break;
1569
1570 case nir_op_b2i32:
1571 ntt_AND(c, dst, src[0], ureg_imm1u(c->ureg, 1));
1572 break;
1573
1574 case nir_op_b2i64:
1575 ntt_AND(c, dst,
1576 ureg_swizzle(src[0],
1577 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1578 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1579 ureg_imm4u(c->ureg, 1, 0, 1, 0));
1580 break;
1581
1582 case nir_op_fsin:
1583 ntt_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], ureg_src_undef());
1584 break;
1585
1586 case nir_op_fcos:
1587 ntt_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], ureg_src_undef());
1588 break;
1589
1590 case nir_op_fsub:
1591 assert(!dst_64);
1592 ntt_ADD(c, dst, src[0], ureg_negate(src[1]));
1593 break;
1594
1595 case nir_op_isub:
1596 assert(!dst_64);
1597 ntt_UADD(c, dst, src[0], ureg_negate(src[1]));
1598 break;
1599
1600 case nir_op_fmod:
1601 unreachable("should be handled by .lower_fmod = true");
1602 break;
1603
1604 case nir_op_fpow:
1605 ntt_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
1606 break;
1607
1608 case nir_op_flrp:
1609 ntt_LRP(c, dst, src[2], src[1], src[0]);
1610 break;
1611
1612 case nir_op_pack_64_2x32_split:
1613 ntt_MOV(c, ureg_writemask(dst, TGSI_WRITEMASK_XZ),
1614 ureg_swizzle(src[0],
1615 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1616 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1617 ntt_MOV(c, ureg_writemask(dst, TGSI_WRITEMASK_YW),
1618 ureg_swizzle(src[1],
1619 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1620 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1621 break;
1622
1623 case nir_op_unpack_64_2x32_split_x:
1624 ntt_MOV(c, dst, ureg_swizzle(src[0],
1625 TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
1626 TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z));
1627 break;
1628
1629 case nir_op_unpack_64_2x32_split_y:
1630 ntt_MOV(c, dst, ureg_swizzle(src[0],
1631 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W,
1632 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W));
1633 break;
1634
1635 case nir_op_b32csel:
1636 if (nir_src_bit_size(instr->src[1].src) == 64) {
1637 ntt_UCMP(c, dst, ureg_swizzle(src[0],
1638 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1639 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1640 src[1], src[2]);
1641 } else {
1642 ntt_UCMP(c, dst, src[0], src[1], src[2]);
1643 }
1644 break;
1645
1646 case nir_op_fcsel:
1647 /* NIR fcsel is src0 != 0 ? src1 : src2.
1648 * TGSI CMP is src0 < 0 ? src1 : src2.
1649 *
1650 * However, fcsel so far as I can find only appears on bools-as-floats
1651 * (1.0 or 0.0), so we can just negate it for the TGSI op. It's
1652 * important to not have an abs here, as i915g has to make extra
1653 * instructions to do the abs.
1654 */
1655 if (c->options->lower_cmp) {
1656 /* If the HW doesn't support TGSI CMP (r300 VS), then lower it to a
1657 * LRP on the boolean 1.0/0.0 value, instead of requiring the
1658 * backend to turn the src0 into 1.0/0.0 first.
1659 *
          * We don't use this in general because on some hardware (i915 FS) the
          * LRP gets expanded to MUL/MAD.
1662 */
1663 ntt_LRP(c, dst, src[0], src[1], src[2]);
1664 } else {
1665 ntt_CMP(c, dst, ureg_negate(src[0]), src[1], src[2]);
1666 }
1667 break;
1668
1669 /* It would be nice if we could get this left as scalar in NIR, since
1670 * the TGSI op is scalar.
1671 */
1672 case nir_op_frexp_sig:
1673 case nir_op_frexp_exp: {
1674 assert(src_64);
1675 struct ureg_dst temp = ntt_temp(c);
1676
1677 for (int chan = 0; chan < 2; chan++) {
1678 int wm = 1 << chan;
1679
1680 if (!(instr->dest.write_mask & wm))
1681 continue;
1682
1683 struct ureg_dst dsts[2] = { temp, temp };
1684 if (instr->op == nir_op_frexp_sig) {
1685 dsts[0] = ureg_writemask(dst, ntt_64bit_write_mask(wm));
1686 } else {
1687 dsts[1] = ureg_writemask(dst, wm);
1688 }
1689
1690 struct ureg_src chan_src = ureg_swizzle(src[0],
1691 chan * 2, chan * 2 + 1,
1692 chan * 2, chan * 2 + 1);
1693
1694 struct ntt_insn *insn = ntt_insn(c, TGSI_OPCODE_DFRACEXP,
1695 dsts[0], chan_src,
1696 ureg_src_undef(),
1697 ureg_src_undef(),
1698 ureg_src_undef());
1699 insn->dst[1] = dsts[1];
1700 }
1701 break;
1702 }
1703
1704 case nir_op_ldexp:
1705 assert(dst_64); /* 32bit handled in table. */
1706 ntt_DLDEXP(c, dst, src[0],
1707 ureg_swizzle(src[1],
1708 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1709 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1710 break;
1711
1712 case nir_op_vec4:
1713 case nir_op_vec3:
1714 case nir_op_vec2:
1715 unreachable("covered by nir_lower_vec_to_movs()");
1716
1717 default:
1718 fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
1719 unreachable("Unknown NIR opcode");
1720 }
1721 }
1722
1723 c->precise = false;
1724 }
1725
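/* Illustrative note (not verbatim compiler output): when the NIR src is
 * constant, the offset below is simply folded into usrc.Index; otherwise the
 * result is an ADDR-relative reference, e.g. roughly CONST[ADDR[0].x+Index]
 * when usrc lives in the constant file.
 */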
1726 static struct ureg_src
1727 ntt_ureg_src_indirect(struct ntt_compile *c, struct ureg_src usrc,
1728 nir_src src, int addr_reg)
1729 {
1730 if (nir_src_is_const(src)) {
1731 usrc.Index += ntt_src_as_uint(c, src);
1732 return usrc;
1733 } else {
1734 return ureg_src_indirect(usrc, ntt_reladdr(c, ntt_get_src(c, src), addr_reg));
1735 }
1736 }
1737
1738 static struct ureg_dst
1739 ntt_ureg_dst_indirect(struct ntt_compile *c, struct ureg_dst dst,
1740 nir_src src)
1741 {
1742 if (nir_src_is_const(src)) {
1743 dst.Index += ntt_src_as_uint(c, src);
1744 return dst;
1745 } else {
1746 return ureg_dst_indirect(dst, ntt_reladdr(c, ntt_get_src(c, src), 0));
1747 }
1748 }
1749
1750 static struct ureg_src
1751 ntt_ureg_src_dimension_indirect(struct ntt_compile *c, struct ureg_src usrc,
1752 nir_src src)
1753 {
1754 if (nir_src_is_const(src)) {
1755 return ureg_src_dimension(usrc, ntt_src_as_uint(c, src));
1756 }
1757 else
1758 {
1759 return ureg_src_dimension_indirect(usrc,
1760 ntt_reladdr(c, ntt_get_src(c, src), 1),
1761 0);
1762 }
1763 }
1764
1765 static struct ureg_dst
1766 ntt_ureg_dst_dimension_indirect(struct ntt_compile *c, struct ureg_dst udst,
1767 nir_src src)
1768 {
1769 if (nir_src_is_const(src)) {
1770 return ureg_dst_dimension(udst, ntt_src_as_uint(c, src));
1771 } else {
1772 return ureg_dst_dimension_indirect(udst,
1773 ntt_reladdr(c, ntt_get_src(c, src), 1),
1774 0);
1775 }
1776 }
1777 /* Some load operations in NIR will have a fractional offset that we need to
1778 * swizzle down before storing to the result register.
1779 */
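/* Illustrative example: with frac == 2 and num_components == 2, the swizzle
 * computed below is (Z, W, W, W), so a value loaded from the .zw channels of
 * the source ends up in .xy of the destination.
 */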
1780 static struct ureg_src
1781 ntt_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
1782 {
1783 return ureg_swizzle(src,
1784 frac,
1785 frac + MIN2(num_components - 1, 1),
1786 frac + MIN2(num_components - 1, 2),
1787 frac + MIN2(num_components - 1, 3));
1788 }
1789
1790
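/* Illustrative sketch of the two paths below (not verbatim output): a
 * load_ubo_vec4 with a constant block and offset turns into a plain constant
 * file reference like CONST[1][5], while the PIPE_CAP_LOAD_CONSTBUF path emits
 * an explicit TGSI_OPCODE_LOAD with the byte offset as its second operand.
 */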
1791 static void
1792 ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
1793 {
1794 int bit_size = nir_dest_bit_size(instr->dest);
1795 assert(bit_size == 32 || instr->num_components <= 2);
1796
1797 struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);
1798
1799 struct ureg_dst addr_temp = ureg_dst_undef();
1800
1801 if (nir_src_is_const(instr->src[0])) {
1802 src = ureg_src_dimension(src, ntt_src_as_uint(c, instr->src[0]));
1803 } else {
1804 /* virglrenderer requires that indirect UBO references have the UBO
1805        * array's base index in the Index field, not added to the indirect
1806 * address.
1807 *
1808 * Many nir intrinsics have a base address const value for the start of
1809 * their array indirection, but load_ubo doesn't. We fake it by
1810 * subtracting it off here.
1811 */
1812 addr_temp = ntt_temp(c);
1813 ntt_UADD(c, addr_temp, ntt_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo));
1814 src = ureg_src_dimension_indirect(src,
1815 ntt_reladdr(c, ureg_src(addr_temp), 1),
1816 c->first_ubo);
1817 }
1818
1819 if (instr->intrinsic == nir_intrinsic_load_ubo_vec4) {
1820 /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const
1821 * file.
1822 */
1823 src.Index = nir_intrinsic_base(instr);
1824
1825 if (nir_src_is_const(instr->src[1])) {
1826 src.Index += ntt_src_as_uint(c, instr->src[1]);
1827 } else {
1828 src = ureg_src_indirect(src, ntt_reladdr(c, ntt_get_src(c, instr->src[1]), 0));
1829 }
1830
1831 int start_component = nir_intrinsic_component(instr);
1832 if (bit_size == 64)
1833 start_component *= 2;
1834
1835 src = ntt_shift_by_frac(src, start_component,
1836 instr->num_components * bit_size / 32);
1837
1838 ntt_store(c, &instr->dest, src);
1839 } else {
1840 /* PIPE_CAP_LOAD_CONSTBUF: Not necessarily vec4 aligned, emit a
1841 * TGSI_OPCODE_LOAD instruction from the const file.
1842 */
1843 struct ntt_insn *insn =
1844 ntt_insn(c, TGSI_OPCODE_LOAD,
1845 ntt_get_dest(c, &instr->dest),
1846 src, ntt_get_src(c, instr->src[1]),
1847 ureg_src_undef(), ureg_src_undef());
1848 insn->is_mem = true;
1849 insn->tex_target = 0;
1850 insn->mem_qualifier = 0;
1851 insn->mem_format = 0; /* unused */
1852 }
1853 }
1854
1855 static unsigned
1856 ntt_get_access_qualifier(nir_intrinsic_instr *instr)
1857 {
1858 enum gl_access_qualifier access = nir_intrinsic_access(instr);
1859 unsigned qualifier = 0;
1860
1861 if (access & ACCESS_COHERENT)
1862 qualifier |= TGSI_MEMORY_COHERENT;
1863 if (access & ACCESS_VOLATILE)
1864 qualifier |= TGSI_MEMORY_VOLATILE;
1865 if (access & ACCESS_RESTRICT)
1866 qualifier |= TGSI_MEMORY_RESTRICT;
1867
1868 return qualifier;
1869 }
1870
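/* Illustrative sketch (not verbatim output): a nir_intrinsic_ssbo_atomic_add
 * on buffer 0 becomes roughly "ATOMUADD TEMP[n], BUFFER[0], offset, value",
 * with the access qualifiers from ntt_get_access_qualifier() attached to the
 * emitted instruction.
 */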
1871 static void
1872 ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
1873 nir_variable_mode mode)
1874 {
1875 bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
1876 instr->intrinsic == nir_intrinsic_store_shared);
1877 bool is_load = (instr->intrinsic == nir_intrinsic_atomic_counter_read ||
1878 instr->intrinsic == nir_intrinsic_load_ssbo ||
1879 instr->intrinsic == nir_intrinsic_load_shared);
1880 unsigned opcode;
1881 struct ureg_src src[4];
1882 int num_src = 0;
1883 int next_src;
1884 struct ureg_dst addr_temp = ureg_dst_undef();
1885
1886 struct ureg_src memory;
1887 switch (mode) {
1888 case nir_var_mem_ssbo:
1889 memory = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_BUFFER, 0),
1890 instr->src[is_store ? 1 : 0], 2);
1891 next_src = 1;
1892 break;
1893 case nir_var_mem_shared:
1894 memory = ureg_src_register(TGSI_FILE_MEMORY, 0);
1895 next_src = 0;
1896 break;
1897 case nir_var_uniform: { /* HW atomic buffers */
1898 nir_src src = instr->src[0];
1899 uint32_t offset = ntt_extract_const_src_offset(&src) / 4;
1900 memory = ureg_src_register(TGSI_FILE_HW_ATOMIC, offset);
1901 /* ntt_ureg_src_indirect, except dividing by 4 */
1902 if (nir_src_is_const(src)) {
1903 memory.Index += nir_src_as_uint(src) / 4;
1904 } else {
1905 addr_temp = ntt_temp(c);
1906 ntt_USHR(c, addr_temp, ntt_get_src(c, src), ureg_imm1i(c->ureg, 2));
1907 memory = ureg_src_indirect(memory, ntt_reladdr(c, ureg_src(addr_temp), 2));
1908 }
1909 memory = ureg_src_dimension(memory, nir_intrinsic_base(instr));
1910 next_src = 0;
1911 break;
1912 }
1913
1914 default:
1915 unreachable("unknown memory type");
1916 }
1917
1918 if (is_store) {
1919 src[num_src++] = ntt_get_src(c, instr->src[next_src + 1]); /* offset */
1920 src[num_src++] = ntt_get_src(c, instr->src[0]); /* value */
1921 } else {
1922 src[num_src++] = memory;
1923 if (instr->intrinsic != nir_intrinsic_get_ssbo_size) {
1924 src[num_src++] = ntt_get_src(c, instr->src[next_src++]); /* offset */
1925 switch (instr->intrinsic) {
1926 case nir_intrinsic_atomic_counter_inc:
1927 src[num_src++] = ureg_imm1i(c->ureg, 1);
1928 break;
1929 case nir_intrinsic_atomic_counter_post_dec:
1930 src[num_src++] = ureg_imm1i(c->ureg, -1);
1931 break;
1932 default:
1933 if (!is_load)
1934 src[num_src++] = ntt_get_src(c, instr->src[next_src++]); /* value */
1935 break;
1936 }
1937 }
1938 }
1939
1940
1941 switch (instr->intrinsic) {
1942 case nir_intrinsic_atomic_counter_add:
1943 case nir_intrinsic_atomic_counter_inc:
1944 case nir_intrinsic_atomic_counter_post_dec:
1945 case nir_intrinsic_ssbo_atomic_add:
1946 case nir_intrinsic_shared_atomic_add:
1947 opcode = TGSI_OPCODE_ATOMUADD;
1948 break;
1949 case nir_intrinsic_ssbo_atomic_fadd:
1950 case nir_intrinsic_shared_atomic_fadd:
1951 opcode = TGSI_OPCODE_ATOMFADD;
1952 break;
1953 case nir_intrinsic_atomic_counter_min:
1954 case nir_intrinsic_ssbo_atomic_imin:
1955 case nir_intrinsic_shared_atomic_imin:
1956 opcode = TGSI_OPCODE_ATOMIMIN;
1957 break;
1958 case nir_intrinsic_atomic_counter_max:
1959 case nir_intrinsic_ssbo_atomic_imax:
1960 case nir_intrinsic_shared_atomic_imax:
1961 opcode = TGSI_OPCODE_ATOMIMAX;
1962 break;
1963 case nir_intrinsic_ssbo_atomic_umin:
1964 case nir_intrinsic_shared_atomic_umin:
1965 opcode = TGSI_OPCODE_ATOMUMIN;
1966 break;
1967 case nir_intrinsic_ssbo_atomic_umax:
1968 case nir_intrinsic_shared_atomic_umax:
1969 opcode = TGSI_OPCODE_ATOMUMAX;
1970 break;
1971 case nir_intrinsic_atomic_counter_and:
1972 case nir_intrinsic_ssbo_atomic_and:
1973 case nir_intrinsic_shared_atomic_and:
1974 opcode = TGSI_OPCODE_ATOMAND;
1975 break;
1976 case nir_intrinsic_atomic_counter_or:
1977 case nir_intrinsic_ssbo_atomic_or:
1978 case nir_intrinsic_shared_atomic_or:
1979 opcode = TGSI_OPCODE_ATOMOR;
1980 break;
1981 case nir_intrinsic_atomic_counter_xor:
1982 case nir_intrinsic_ssbo_atomic_xor:
1983 case nir_intrinsic_shared_atomic_xor:
1984 opcode = TGSI_OPCODE_ATOMXOR;
1985 break;
1986 case nir_intrinsic_atomic_counter_exchange:
1987 case nir_intrinsic_ssbo_atomic_exchange:
1988 case nir_intrinsic_shared_atomic_exchange:
1989 opcode = TGSI_OPCODE_ATOMXCHG;
1990 break;
1991 case nir_intrinsic_atomic_counter_comp_swap:
1992 case nir_intrinsic_ssbo_atomic_comp_swap:
1993 case nir_intrinsic_shared_atomic_comp_swap:
1994 opcode = TGSI_OPCODE_ATOMCAS;
1995 src[num_src++] = ntt_get_src(c, instr->src[next_src++]);
1996 break;
1997 case nir_intrinsic_atomic_counter_read:
1998 case nir_intrinsic_load_ssbo:
1999 case nir_intrinsic_load_shared:
2000 opcode = TGSI_OPCODE_LOAD;
2001 break;
2002 case nir_intrinsic_store_ssbo:
2003 case nir_intrinsic_store_shared:
2004 opcode = TGSI_OPCODE_STORE;
2005 break;
2006 case nir_intrinsic_get_ssbo_size:
2007 opcode = TGSI_OPCODE_RESQ;
2008 break;
2009 default:
2010 unreachable("unknown memory op");
2011 }
2012
2013 unsigned qualifier = 0;
2014 if (mode == nir_var_mem_ssbo &&
2015 instr->intrinsic != nir_intrinsic_get_ssbo_size) {
2016 qualifier = ntt_get_access_qualifier(instr);
2017 }
2018
2019 struct ureg_dst dst;
2020 if (is_store) {
2021 dst = ureg_dst(memory);
2022
2023 unsigned write_mask = nir_intrinsic_write_mask(instr);
2024 if (nir_src_bit_size(instr->src[0]) == 64)
2025 write_mask = ntt_64bit_write_mask(write_mask);
2026 dst = ureg_writemask(dst, write_mask);
2027 } else {
2028 dst = ntt_get_dest(c, &instr->dest);
2029 }
2030
2031 struct ntt_insn *insn = ntt_insn(c, opcode, dst, src[0], src[1], src[2], src[3]);
2032 insn->tex_target = TGSI_TEXTURE_BUFFER;
2033 insn->mem_qualifier = qualifier;
2034 insn->mem_format = 0; /* unused */
2035 insn->is_mem = true;
2036 }
2037
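/* Illustrative sketch (not verbatim output): nir_intrinsic_image_load on a 2D
 * image becomes roughly "LOAD TEMP[n], IMAGE[0], coord", with the texture
 * target and image format recorded on the instruction; image_samples reuses
 * RESQ and then copies the .w channel of the query result into the dest.
 */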
2038 static void
2039 ntt_emit_image_load_store(struct ntt_compile *c, nir_intrinsic_instr *instr)
2040 {
2041 unsigned op;
2042 struct ureg_src srcs[4];
2043 int num_src = 0;
2044 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
2045 bool is_array = nir_intrinsic_image_array(instr);
2046
2047 struct ureg_dst temp = ureg_dst_undef();
2048
2049 enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(dim, is_array, false);
2050
2051 struct ureg_src resource =
2052 ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_IMAGE, 0),
2053 instr->src[0], 2);
2054
2055 struct ureg_dst dst;
2056 if (instr->intrinsic == nir_intrinsic_image_store) {
2057 dst = ureg_dst(resource);
2058 } else {
2059 srcs[num_src++] = resource;
2060 dst = ntt_get_dest(c, &instr->dest);
2061 }
2062 struct ureg_dst opcode_dst = dst;
2063
2064 if (instr->intrinsic != nir_intrinsic_image_size && instr->intrinsic != nir_intrinsic_image_samples) {
2065 struct ureg_src coord = ntt_get_src(c, instr->src[1]);
2066
2067 if (dim == GLSL_SAMPLER_DIM_MS) {
2068 temp = ntt_temp(c);
2069 ntt_MOV(c, temp, coord);
2070 ntt_MOV(c, ureg_writemask(temp, TGSI_WRITEMASK_W),
2071 ureg_scalar(ntt_get_src(c, instr->src[2]), TGSI_SWIZZLE_X));
2072 coord = ureg_src(temp);
2073 }
2074 srcs[num_src++] = coord;
2075
2076 if (instr->intrinsic != nir_intrinsic_image_load) {
2077 srcs[num_src++] = ntt_get_src(c, instr->src[3]); /* data */
2078 if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap)
2079 srcs[num_src++] = ntt_get_src(c, instr->src[4]); /* data2 */
2080 }
2081 }
2082
2083 switch (instr->intrinsic) {
2084 case nir_intrinsic_image_load:
2085 op = TGSI_OPCODE_LOAD;
2086 break;
2087 case nir_intrinsic_image_store:
2088 op = TGSI_OPCODE_STORE;
2089 break;
2090 case nir_intrinsic_image_size:
2091 op = TGSI_OPCODE_RESQ;
2092 break;
2093 case nir_intrinsic_image_samples:
2094 op = TGSI_OPCODE_RESQ;
2095 opcode_dst = ureg_writemask(ntt_temp(c), TGSI_WRITEMASK_W);
2096 break;
2097 case nir_intrinsic_image_atomic_add:
2098 op = TGSI_OPCODE_ATOMUADD;
2099 break;
2100 case nir_intrinsic_image_atomic_fadd:
2101 op = TGSI_OPCODE_ATOMFADD;
2102 break;
2103 case nir_intrinsic_image_atomic_imin:
2104 op = TGSI_OPCODE_ATOMIMIN;
2105 break;
2106 case nir_intrinsic_image_atomic_umin:
2107 op = TGSI_OPCODE_ATOMUMIN;
2108 break;
2109 case nir_intrinsic_image_atomic_imax:
2110 op = TGSI_OPCODE_ATOMIMAX;
2111 break;
2112 case nir_intrinsic_image_atomic_umax:
2113 op = TGSI_OPCODE_ATOMUMAX;
2114 break;
2115 case nir_intrinsic_image_atomic_and:
2116 op = TGSI_OPCODE_ATOMAND;
2117 break;
2118 case nir_intrinsic_image_atomic_or:
2119 op = TGSI_OPCODE_ATOMOR;
2120 break;
2121 case nir_intrinsic_image_atomic_xor:
2122 op = TGSI_OPCODE_ATOMXOR;
2123 break;
2124 case nir_intrinsic_image_atomic_exchange:
2125 op = TGSI_OPCODE_ATOMXCHG;
2126 break;
2127 case nir_intrinsic_image_atomic_comp_swap:
2128 op = TGSI_OPCODE_ATOMCAS;
2129 break;
2130 default:
2131 unreachable("bad op");
2132 }
2133
2134 struct ntt_insn *insn = ntt_insn(c, op, opcode_dst, srcs[0], srcs[1], srcs[2], srcs[3]);
2135 insn->tex_target = target;
2136 insn->mem_qualifier = ntt_get_access_qualifier(instr);
2137 insn->mem_format = nir_intrinsic_format(instr);
2138 insn->is_mem = true;
2139
2140 if (instr->intrinsic == nir_intrinsic_image_samples)
2141 ntt_MOV(c, dst, ureg_scalar(ureg_src(opcode_dst), 3));
2142 }
2143
2144 static void
2145 ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr)
2146 {
2147 uint32_t frac = nir_intrinsic_component(instr);
2148 uint32_t num_components = instr->num_components;
2149 unsigned base = nir_intrinsic_base(instr);
2150 struct ureg_src input;
2151 nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
2152 bool is_64 = nir_dest_bit_size(instr->dest) == 64;
2153
2154 if (c->s->info.stage == MESA_SHADER_VERTEX) {
2155 input = ureg_DECL_vs_input(c->ureg, base);
2156 for (int i = 1; i < semantics.num_slots; i++)
2157 ureg_DECL_vs_input(c->ureg, base + i);
2158 } else if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
2159 unsigned semantic_name, semantic_index;
2160 ntt_get_gl_varying_semantic(c, semantics.location,
2161 &semantic_name, &semantic_index);
2162
2163 /* XXX: ArrayID is used in r600 gs inputs */
2164 uint32_t array_id = 0;
2165
2166 input = ureg_DECL_input_layout(c->ureg,
2167 semantic_name,
2168 semantic_index,
2169 base,
2170 ntt_tgsi_usage_mask(frac,
2171 instr->num_components,
2172 is_64),
2173 array_id,
2174 semantics.num_slots);
2175 } else {
2176 input = c->input_index_map[base];
2177 }
2178
2179 if (is_64)
2180 num_components *= 2;
2181
2182 input = ntt_shift_by_frac(input, frac, num_components);
2183
2184 switch (instr->intrinsic) {
2185 case nir_intrinsic_load_input:
2186 input = ntt_ureg_src_indirect(c, input, instr->src[0], 0);
2187 ntt_store(c, &instr->dest, input);
2188 break;
2189
2190 case nir_intrinsic_load_per_vertex_input:
2191 input = ntt_ureg_src_indirect(c, input, instr->src[1], 0);
2192 input = ntt_ureg_src_dimension_indirect(c, input, instr->src[0]);
2193 ntt_store(c, &instr->dest, input);
2194 break;
2195
2196 case nir_intrinsic_load_interpolated_input: {
2197 input = ntt_ureg_src_indirect(c, input, instr->src[1], 0);
2198
2199 nir_intrinsic_instr *bary_instr =
2200 nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
2201
2202 switch (bary_instr->intrinsic) {
2203 case nir_intrinsic_load_barycentric_pixel:
2204 case nir_intrinsic_load_barycentric_sample:
2205 /* For these, we know that the barycentric load matches the
2206 * interpolation on the input declaration, so we can use it directly.
2207 */
2208 ntt_store(c, &instr->dest, input);
2209 break;
2210
2211 case nir_intrinsic_load_barycentric_centroid:
2212 /* If the input was declared centroid, then there's no need to
2213 * emit the extra TGSI interp instruction, we can just read the
2214 * input.
2215 */
2216 if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) {
2217 ntt_store(c, &instr->dest, input);
2218 } else {
2219 ntt_INTERP_CENTROID(c, ntt_get_dest(c, &instr->dest), input);
2220 }
2221 break;
2222
2223 case nir_intrinsic_load_barycentric_at_sample:
2224 /* We stored the sample in the fake "bary" dest. */
2225 ntt_INTERP_SAMPLE(c, ntt_get_dest(c, &instr->dest), input,
2226 ntt_get_src(c, instr->src[0]));
2227 break;
2228
2229 case nir_intrinsic_load_barycentric_at_offset:
2230 /* We stored the offset in the fake "bary" dest. */
2231 ntt_INTERP_OFFSET(c, ntt_get_dest(c, &instr->dest), input,
2232 ntt_get_src(c, instr->src[0]));
2233 break;
2234
2235 default:
2236 unreachable("bad barycentric interp intrinsic\n");
2237 }
2238 break;
2239 }
2240
2241 default:
2242 unreachable("bad load input intrinsic\n");
2243 }
2244 }
2245
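/* Illustrative example of the component shift below (not verbatim output):
 * storing a 2-component value at output component 2 produces a .zw writemask
 * and a .xxxy swizzle, i.e. roughly "MOV OUT[n].zw, src.xxxy", so src.x and
 * src.y land in OUT.z and OUT.w.
 */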
2246 static void
2247 ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
2248 {
2249 struct ureg_src src = ntt_get_src(c, instr->src[0]);
2250
2251 if (src.File == TGSI_FILE_OUTPUT) {
2252 /* If our src is the output file, that's an indication that we were able
2253 * to emit the output stores in the generating instructions and we have
2254 * nothing to do here.
2255 */
2256 return;
2257 }
2258
2259 uint32_t frac;
2260 struct ureg_dst out = ntt_output_decl(c, instr, &frac);
2261
2262 if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
2263 out = ntt_ureg_dst_indirect(c, out, instr->src[2]);
2264 out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[1]);
2265 } else {
2266 out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
2267 }
2268
2269 uint8_t swizzle[4] = { 0, 0, 0, 0 };
2270    for (int i = frac; i < 4; i++) {
2271 if (out.WriteMask & (1 << i))
2272 swizzle[i] = i - frac;
2273 }
2274
2275 src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2276
2277 ntt_MOV(c, out, src);
2278 }
2279
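/* Illustrative example of the fill_channel swizzle below (not verbatim
 * output): if only .x of the destination is written, the output is read as
 * OUT[n].xxxx, so the writemasked-out channels never reference output
 * components that may not have been declared.
 */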
2280 static void
2281 ntt_emit_load_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
2282 {
2283 nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
2284
2285 /* ntt_try_store_in_tgsi_output() optimization is not valid if normal
2286 * load_output is present.
2287 */
2288 assert(c->s->info.stage != MESA_SHADER_VERTEX &&
2289 (c->s->info.stage != MESA_SHADER_FRAGMENT || semantics.fb_fetch_output));
2290
2291 uint32_t frac;
2292 struct ureg_dst out = ntt_output_decl(c, instr, &frac);
2293
2294 if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) {
2295 out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
2296 out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[0]);
2297 } else {
2298 out = ntt_ureg_dst_indirect(c, out, instr->src[0]);
2299 }
2300
2301 struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
2302 struct ureg_src out_src = ureg_src(out);
2303
2304    /* Don't swizzle in unavailable channels of the output for the writemasked-out
2305 * components. Avoids compile failures in virglrenderer with
2306 * TESS_LEVEL_INNER.
2307 */
2308 int fill_channel = ffs(dst.WriteMask) - 1;
2309 uint8_t swizzles[4] = { 0, 1, 2, 3 };
2310 for (int i = 0; i < 4; i++)
2311 if (!(dst.WriteMask & (1 << i)))
2312 swizzles[i] = fill_channel;
2313 out_src = ureg_swizzle(out_src, swizzles[0], swizzles[1], swizzles[2], swizzles[3]);
2314
2315 if (semantics.fb_fetch_output)
2316 ntt_FBFETCH(c, dst, out_src);
2317 else
2318 ntt_MOV(c, dst, out_src);
2319 }
2320
2321 static void
2322 ntt_emit_load_sysval(struct ntt_compile *c, nir_intrinsic_instr *instr)
2323 {
2324 gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
2325 enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
2326 struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0);
2327
2328 /* virglrenderer doesn't like references to channels of the sysval that
2329 * aren't defined, even if they aren't really read. (GLSL compile fails on
2330 * gl_NumWorkGroups.w, for example).
2331 */
2332 uint32_t write_mask = BITSET_MASK(nir_dest_num_components(instr->dest));
2333 sv = ntt_swizzle_for_write_mask(sv, write_mask);
2334
2335 /* TGSI and NIR define these intrinsics as always loading ints, but they can
2336 * still appear on hardware with non-native-integers fragment shaders using
2337 * the draw path (i915g). In that case, having called nir_lower_int_to_float
2338 * means that we actually want floats instead.
2339 */
2340 if (!c->native_integers) {
2341 switch (instr->intrinsic) {
2342 case nir_intrinsic_load_vertex_id:
2343 case nir_intrinsic_load_instance_id:
2344 ntt_U2F(c, ntt_get_dest(c, &instr->dest), sv);
2345 return;
2346
2347 default:
2348 break;
2349 }
2350 }
2351
2352 ntt_store(c, &instr->dest, sv);
2353 }
2354
2355 static void
2356 ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
2357 {
2358 switch (instr->intrinsic) {
2359 case nir_intrinsic_load_ubo:
2360 case nir_intrinsic_load_ubo_vec4:
2361 ntt_emit_load_ubo(c, instr);
2362 break;
2363
2364 /* Vertex */
2365 case nir_intrinsic_load_vertex_id:
2366 case nir_intrinsic_load_vertex_id_zero_base:
2367 case nir_intrinsic_load_base_vertex:
2368 case nir_intrinsic_load_base_instance:
2369 case nir_intrinsic_load_instance_id:
2370 case nir_intrinsic_load_draw_id:
2371 case nir_intrinsic_load_invocation_id:
2372 case nir_intrinsic_load_frag_coord:
2373 case nir_intrinsic_load_point_coord:
2374 case nir_intrinsic_load_front_face:
2375 case nir_intrinsic_load_sample_id:
2376 case nir_intrinsic_load_sample_pos:
2377 case nir_intrinsic_load_sample_mask_in:
2378 case nir_intrinsic_load_helper_invocation:
2379 case nir_intrinsic_load_tess_coord:
2380 case nir_intrinsic_load_patch_vertices_in:
2381 case nir_intrinsic_load_primitive_id:
2382 case nir_intrinsic_load_tess_level_outer:
2383 case nir_intrinsic_load_tess_level_inner:
2384 case nir_intrinsic_load_local_invocation_id:
2385 case nir_intrinsic_load_workgroup_id:
2386 case nir_intrinsic_load_num_workgroups:
2387 case nir_intrinsic_load_workgroup_size:
2388 case nir_intrinsic_load_subgroup_size:
2389 case nir_intrinsic_load_subgroup_invocation:
2390 case nir_intrinsic_load_subgroup_eq_mask:
2391 case nir_intrinsic_load_subgroup_ge_mask:
2392 case nir_intrinsic_load_subgroup_gt_mask:
2393 case nir_intrinsic_load_subgroup_lt_mask:
2394 ntt_emit_load_sysval(c, instr);
2395 break;
2396
2397 case nir_intrinsic_load_input:
2398 case nir_intrinsic_load_per_vertex_input:
2399 case nir_intrinsic_load_interpolated_input:
2400 ntt_emit_load_input(c, instr);
2401 break;
2402
2403 case nir_intrinsic_store_output:
2404 case nir_intrinsic_store_per_vertex_output:
2405 ntt_emit_store_output(c, instr);
2406 break;
2407
2408 case nir_intrinsic_load_output:
2409 case nir_intrinsic_load_per_vertex_output:
2410 ntt_emit_load_output(c, instr);
2411 break;
2412
2413 case nir_intrinsic_discard:
2414 ntt_KILL(c);
2415 break;
2416
2417 case nir_intrinsic_discard_if: {
2418 struct ureg_src cond = ureg_scalar(ntt_get_src(c, instr->src[0]), 0);
2419
2420 if (c->native_integers) {
2421 struct ureg_dst temp = ureg_writemask(ntt_temp(c), 1);
2422 ntt_AND(c, temp, cond, ureg_imm1f(c->ureg, 1.0));
2423 ntt_KILL_IF(c, ureg_scalar(ureg_negate(ureg_src(temp)), 0));
2424 } else {
2425 /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
2426 ntt_KILL_IF(c, ureg_negate(cond));
2427 }
2428 break;
2429 }
2430
2431 case nir_intrinsic_load_ssbo:
2432 case nir_intrinsic_store_ssbo:
2433 case nir_intrinsic_ssbo_atomic_add:
2434 case nir_intrinsic_ssbo_atomic_fadd:
2435 case nir_intrinsic_ssbo_atomic_imin:
2436 case nir_intrinsic_ssbo_atomic_imax:
2437 case nir_intrinsic_ssbo_atomic_umin:
2438 case nir_intrinsic_ssbo_atomic_umax:
2439 case nir_intrinsic_ssbo_atomic_and:
2440 case nir_intrinsic_ssbo_atomic_or:
2441 case nir_intrinsic_ssbo_atomic_xor:
2442 case nir_intrinsic_ssbo_atomic_exchange:
2443 case nir_intrinsic_ssbo_atomic_comp_swap:
2444 case nir_intrinsic_get_ssbo_size:
2445 ntt_emit_mem(c, instr, nir_var_mem_ssbo);
2446 break;
2447
2448 case nir_intrinsic_load_shared:
2449 case nir_intrinsic_store_shared:
2450 case nir_intrinsic_shared_atomic_add:
2451 case nir_intrinsic_shared_atomic_fadd:
2452 case nir_intrinsic_shared_atomic_imin:
2453 case nir_intrinsic_shared_atomic_imax:
2454 case nir_intrinsic_shared_atomic_umin:
2455 case nir_intrinsic_shared_atomic_umax:
2456 case nir_intrinsic_shared_atomic_and:
2457 case nir_intrinsic_shared_atomic_or:
2458 case nir_intrinsic_shared_atomic_xor:
2459 case nir_intrinsic_shared_atomic_exchange:
2460 case nir_intrinsic_shared_atomic_comp_swap:
2461 ntt_emit_mem(c, instr, nir_var_mem_shared);
2462 break;
2463
2464 case nir_intrinsic_atomic_counter_read:
2465 case nir_intrinsic_atomic_counter_add:
2466 case nir_intrinsic_atomic_counter_inc:
2467 case nir_intrinsic_atomic_counter_post_dec:
2468 case nir_intrinsic_atomic_counter_min:
2469 case nir_intrinsic_atomic_counter_max:
2470 case nir_intrinsic_atomic_counter_and:
2471 case nir_intrinsic_atomic_counter_or:
2472 case nir_intrinsic_atomic_counter_xor:
2473 case nir_intrinsic_atomic_counter_exchange:
2474 case nir_intrinsic_atomic_counter_comp_swap:
2475 ntt_emit_mem(c, instr, nir_var_uniform);
2476 break;
2477 case nir_intrinsic_atomic_counter_pre_dec:
2478 unreachable("Should be lowered by ntt_lower_atomic_pre_dec()");
2479 break;
2480
2481 case nir_intrinsic_image_load:
2482 case nir_intrinsic_image_store:
2483 case nir_intrinsic_image_size:
2484 case nir_intrinsic_image_samples:
2485 case nir_intrinsic_image_atomic_add:
2486 case nir_intrinsic_image_atomic_fadd:
2487 case nir_intrinsic_image_atomic_imin:
2488 case nir_intrinsic_image_atomic_umin:
2489 case nir_intrinsic_image_atomic_imax:
2490 case nir_intrinsic_image_atomic_umax:
2491 case nir_intrinsic_image_atomic_and:
2492 case nir_intrinsic_image_atomic_or:
2493 case nir_intrinsic_image_atomic_xor:
2494 case nir_intrinsic_image_atomic_exchange:
2495 case nir_intrinsic_image_atomic_comp_swap:
2496 ntt_emit_image_load_store(c, instr);
2497 break;
2498
2499 case nir_intrinsic_control_barrier:
2500 case nir_intrinsic_memory_barrier_tcs_patch:
2501 ntt_BARRIER(c);
2502 break;
2503
2504 case nir_intrinsic_memory_barrier:
2505 ntt_MEMBAR(c, ureg_imm1u(c->ureg,
2506 TGSI_MEMBAR_SHADER_BUFFER |
2507 TGSI_MEMBAR_ATOMIC_BUFFER |
2508 TGSI_MEMBAR_SHADER_IMAGE |
2509 TGSI_MEMBAR_SHARED));
2510 break;
2511
2512 case nir_intrinsic_memory_barrier_atomic_counter:
2513 ntt_MEMBAR(c, ureg_imm1u(c->ureg, TGSI_MEMBAR_ATOMIC_BUFFER));
2514 break;
2515
2516 case nir_intrinsic_memory_barrier_buffer:
2517 ntt_MEMBAR(c, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_BUFFER));
2518 break;
2519
2520 case nir_intrinsic_memory_barrier_image:
2521 ntt_MEMBAR(c, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_IMAGE));
2522 break;
2523
2524 case nir_intrinsic_memory_barrier_shared:
2525 ntt_MEMBAR(c, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHARED));
2526 break;
2527
2528 case nir_intrinsic_group_memory_barrier:
2529 ntt_MEMBAR(c, ureg_imm1u(c->ureg,
2530 TGSI_MEMBAR_SHADER_BUFFER |
2531 TGSI_MEMBAR_ATOMIC_BUFFER |
2532 TGSI_MEMBAR_SHADER_IMAGE |
2533 TGSI_MEMBAR_SHARED |
2534 TGSI_MEMBAR_THREAD_GROUP));
2535 break;
2536
2537 case nir_intrinsic_end_primitive:
2538 ntt_ENDPRIM(c, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
2539 break;
2540
2541 case nir_intrinsic_emit_vertex:
2542 ntt_EMIT(c, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
2543 break;
2544
2545 /* In TGSI we don't actually generate the barycentric coords, and emit
2546 * interp intrinsics later. However, we do need to store the
2547 * load_barycentric_at_* argument so that we can use it at that point.
2548 */
2549 case nir_intrinsic_load_barycentric_pixel:
2550 case nir_intrinsic_load_barycentric_centroid:
2551 case nir_intrinsic_load_barycentric_sample:
2552 break;
2553 case nir_intrinsic_load_barycentric_at_sample:
2554 case nir_intrinsic_load_barycentric_at_offset:
2555 ntt_store(c, &instr->dest, ntt_get_src(c, instr->src[0]));
2556 break;
2557
2558 case nir_intrinsic_shader_clock:
2559 ntt_CLOCK(c, ntt_get_dest(c, &instr->dest));
2560 break;
2561
2562 default:
2563 fprintf(stderr, "Unknown intrinsic: ");
2564 nir_print_instr(&instr->instr, stderr);
2565 fprintf(stderr, "\n");
2566 break;
2567 }
2568 }
2569
2570 struct ntt_tex_operand_state {
2571 struct ureg_src srcs[4];
2572 unsigned i;
2573 };
2574
2575 static void
2576 ntt_push_tex_arg(struct ntt_compile *c,
2577 nir_tex_instr *instr,
2578 nir_tex_src_type tex_src_type,
2579 struct ntt_tex_operand_state *s)
2580 {
2581 int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
2582 if (tex_src < 0)
2583 return;
2584
2585 nir_src *src = &instr->src[tex_src].src;
2586
2587 /* virglrenderer workaround that's hard to do in tgsi_translate: Make sure
2588 * that TG4's immediate offset arg is float-typed.
2589 */
2590 if (instr->op == nir_texop_tg4 && tex_src_type == nir_tex_src_backend2 &&
2591 nir_src_is_const(*src)) {
2592 nir_const_value *consts = nir_src_as_const_value(*src);
2593 s->srcs[s->i++] = ureg_imm4f(c->ureg,
2594 consts[0].f32,
2595 consts[1].f32,
2596 consts[2].f32,
2597 consts[3].f32);
2598 return;
2599 }
2600
2601 s->srcs[s->i++] = ntt_get_src(c, *src);
2602 }
2603
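/* Illustrative sketch (not verbatim output): when a shadow comparator or an
 * explicit LOD pushes the operands past a single vec4, TEX/TXB/TXL are
 * promoted below to their two-argument forms, e.g. roughly
 * "TXL2 TEMP[n], coord, lod_vec, SAMP[0]".
 */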
2604 static void
2605 ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
2606 {
2607 struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
2608 enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array, instr->is_shadow);
2609 unsigned tex_opcode;
2610
2611 struct ureg_src sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
2612 int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
2613 if (sampler_src >= 0) {
2614 struct ureg_src reladdr = ntt_get_src(c, instr->src[sampler_src].src);
2615 sampler = ureg_src_indirect(sampler, ntt_reladdr(c, reladdr, 2));
2616 }
2617
2618 switch (instr->op) {
2619 case nir_texop_tex:
2620 if (nir_tex_instr_src_size(instr, nir_tex_instr_src_index(instr, nir_tex_src_backend1)) >
2621 MAX2(instr->coord_components, 2) + instr->is_shadow)
2622 tex_opcode = TGSI_OPCODE_TXP;
2623 else
2624 tex_opcode = TGSI_OPCODE_TEX;
2625 break;
2626 case nir_texop_txf:
2627 case nir_texop_txf_ms:
2628 tex_opcode = TGSI_OPCODE_TXF;
2629
2630 if (c->has_txf_lz) {
2631 int lod_src = nir_tex_instr_src_index(instr, nir_tex_src_lod);
2632 if (lod_src >= 0 &&
2633 nir_src_is_const(instr->src[lod_src].src) &&
2634 ntt_src_as_uint(c, instr->src[lod_src].src) == 0) {
2635 tex_opcode = TGSI_OPCODE_TXF_LZ;
2636 }
2637 }
2638 break;
2639 case nir_texop_txl:
2640 tex_opcode = TGSI_OPCODE_TXL;
2641 break;
2642 case nir_texop_txb:
2643 tex_opcode = TGSI_OPCODE_TXB;
2644 break;
2645 case nir_texop_txd:
2646 tex_opcode = TGSI_OPCODE_TXD;
2647 break;
2648 case nir_texop_txs:
2649 tex_opcode = TGSI_OPCODE_TXQ;
2650 break;
2651 case nir_texop_tg4:
2652 tex_opcode = TGSI_OPCODE_TG4;
2653 break;
2654 case nir_texop_query_levels:
2655 tex_opcode = TGSI_OPCODE_TXQ;
2656 break;
2657 case nir_texop_lod:
2658 tex_opcode = TGSI_OPCODE_LODQ;
2659 break;
2660 case nir_texop_texture_samples:
2661 tex_opcode = TGSI_OPCODE_TXQS;
2662 break;
2663 default:
2664 unreachable("unsupported tex op");
2665 }
2666
2667 struct ntt_tex_operand_state s = { .i = 0 };
2668 ntt_push_tex_arg(c, instr, nir_tex_src_backend1, &s);
2669 ntt_push_tex_arg(c, instr, nir_tex_src_backend2, &s);
2670
2671 /* non-coord arg for TXQ */
2672 if (tex_opcode == TGSI_OPCODE_TXQ) {
2673 ntt_push_tex_arg(c, instr, nir_tex_src_lod, &s);
2674 /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's
2675 * scalar
2676 */
2677 s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0);
2678 }
2679
2680 if (s.i > 1) {
2681 if (tex_opcode == TGSI_OPCODE_TEX)
2682 tex_opcode = TGSI_OPCODE_TEX2;
2683 if (tex_opcode == TGSI_OPCODE_TXB)
2684 tex_opcode = TGSI_OPCODE_TXB2;
2685 if (tex_opcode == TGSI_OPCODE_TXL)
2686 tex_opcode = TGSI_OPCODE_TXL2;
2687 }
2688
2689 if (instr->op == nir_texop_txd) {
2690 /* Derivs appear in their own src args */
2691 int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
2692 int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
2693 s.srcs[s.i++] = ntt_get_src(c, instr->src[ddx].src);
2694 s.srcs[s.i++] = ntt_get_src(c, instr->src[ddy].src);
2695 }
2696
2697 if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
2698 if (c->screen->get_param(c->screen,
2699 PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) {
2700 sampler = ureg_scalar(sampler, instr->component);
2701 s.srcs[s.i++] = ureg_src_undef();
2702 } else {
2703 s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
2704 }
2705 }
2706
2707 s.srcs[s.i++] = sampler;
2708
2709 enum tgsi_return_type tex_type;
2710 switch (instr->dest_type) {
2711 case nir_type_float32:
2712 tex_type = TGSI_RETURN_TYPE_FLOAT;
2713 break;
2714 case nir_type_int32:
2715 tex_type = TGSI_RETURN_TYPE_SINT;
2716 break;
2717 case nir_type_uint32:
2718 tex_type = TGSI_RETURN_TYPE_UINT;
2719 break;
2720 default:
2721 unreachable("unknown texture type");
2722 }
2723
2724 struct tgsi_texture_offset tex_offset = {
2725 .File = TGSI_FILE_NULL
2726 };
2727 int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
2728 if (tex_offset_src >= 0) {
2729 struct ureg_src offset = ntt_get_src(c, instr->src[tex_offset_src].src);
2730
2731 tex_offset.File = offset.File;
2732 tex_offset.Index = offset.Index;
2733 tex_offset.SwizzleX = offset.SwizzleX;
2734 tex_offset.SwizzleY = offset.SwizzleY;
2735 tex_offset.SwizzleZ = offset.SwizzleZ;
2736 tex_offset.Padding = 0;
2737 }
2738
2739 struct ureg_dst tex_dst;
2740 if (instr->op == nir_texop_query_levels)
2741 tex_dst = ureg_writemask(ntt_temp(c), TGSI_WRITEMASK_W);
2742 else
2743 tex_dst = dst;
2744
2745 while (s.i < 4)
2746 s.srcs[s.i++] = ureg_src_undef();
2747
2748 struct ntt_insn *insn = ntt_insn(c, tex_opcode, tex_dst, s.srcs[0], s.srcs[1], s.srcs[2], s.srcs[3]);
2749 insn->tex_target = target;
2750 insn->tex_return_type = tex_type;
2751 insn->tex_offset = tex_offset;
2752 insn->is_tex = true;
2753
2754 if (instr->op == nir_texop_query_levels)
2755 ntt_MOV(c, dst, ureg_scalar(ureg_src(tex_dst), 3));
2756 }
2757
2758 static void
2759 ntt_emit_jump(struct ntt_compile *c, nir_jump_instr *jump)
2760 {
2761 switch (jump->type) {
2762 case nir_jump_break:
2763 ntt_BRK(c);
2764 break;
2765
2766 case nir_jump_continue:
2767 ntt_CONT(c);
2768 break;
2769
2770 default:
2771 fprintf(stderr, "Unknown jump instruction: ");
2772 nir_print_instr(&jump->instr, stderr);
2773 fprintf(stderr, "\n");
2774 abort();
2775 }
2776 }
2777
2778 static void
2779 ntt_emit_ssa_undef(struct ntt_compile *c, nir_ssa_undef_instr *instr)
2780 {
2781 /* Nothing to do but make sure that we have some storage to deref. */
2782 (void)ntt_get_ssa_def_decl(c, &instr->def);
2783 }
2784
2785 static void
2786 ntt_emit_instr(struct ntt_compile *c, nir_instr *instr)
2787 {
2788 switch (instr->type) {
2789 case nir_instr_type_deref:
2790 /* ignored, will be walked by nir_intrinsic_image_*_deref. */
2791 break;
2792
2793 case nir_instr_type_alu:
2794 ntt_emit_alu(c, nir_instr_as_alu(instr));
2795 break;
2796
2797 case nir_instr_type_intrinsic:
2798 ntt_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
2799 break;
2800
2801 case nir_instr_type_load_const:
2802 /* Nothing to do here, as load consts are done directly from
2803 * ntt_get_src() (since many constant NIR srcs will often get folded
2804 * directly into a register file index instead of as a TGSI src).
2805 */
2806 break;
2807
2808 case nir_instr_type_tex:
2809 ntt_emit_texture(c, nir_instr_as_tex(instr));
2810 break;
2811
2812 case nir_instr_type_jump:
2813 ntt_emit_jump(c, nir_instr_as_jump(instr));
2814 break;
2815
2816 case nir_instr_type_ssa_undef:
2817 ntt_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
2818 break;
2819
2820 default:
2821 fprintf(stderr, "Unknown NIR instr type: ");
2822 nir_print_instr(instr, stderr);
2823 fprintf(stderr, "\n");
2824 abort();
2825 }
2826 }
2827
2828 static void
2829 ntt_emit_if(struct ntt_compile *c, nir_if *if_stmt)
2830 {
2831 if (c->native_integers)
2832 ntt_UIF(c, c->if_cond);
2833 else
2834 ntt_IF(c, c->if_cond);
2835
2836 ntt_emit_cf_list(c, &if_stmt->then_list);
2837
2838 if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
2839 ntt_ELSE(c);
2840 ntt_emit_cf_list(c, &if_stmt->else_list);
2841 }
2842
2843 ntt_ENDIF(c);
2844 }
2845
2846 static void
2847 ntt_emit_loop(struct ntt_compile *c, nir_loop *loop)
2848 {
2849 ntt_BGNLOOP(c);
2850 ntt_emit_cf_list(c, &loop->body);
2851 ntt_ENDLOOP(c);
2852 }
2853
2854 static void
2855 ntt_emit_block(struct ntt_compile *c, nir_block *block)
2856 {
2857 struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
2858 c->cur_block = ntt_block;
2859
2860 nir_foreach_instr(instr, block) {
2861 ntt_emit_instr(c, instr);
2862
2863 /* Sanity check that we didn't accidentally ureg_OPCODE() instead of ntt_OPCODE(). */
2864 if (ureg_get_instruction_number(c->ureg) != 0) {
2865 fprintf(stderr, "Emitted ureg insn during: ");
2866 nir_print_instr(instr, stderr);
2867 fprintf(stderr, "\n");
2868 unreachable("emitted ureg insn");
2869 }
2870 }
2871
2872 /* Set up the if condition for ntt_emit_if(), which we have to do before
2873 * freeing up the temps (the "if" is treated as inside the block for liveness
2874 * purposes, despite not being an instruction)
2875 *
2876 * Note that, while IF and UIF are supposed to look at only .x, virglrenderer
2877 * looks at all of .xyzw. No harm in working around the bug.
2878 */
2879 nir_if *nif = nir_block_get_following_if(block);
2880 if (nif)
2881 c->if_cond = ureg_scalar(ntt_get_src(c, nif->condition), TGSI_SWIZZLE_X);
2882 }
2883
2884 static void
2885 ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list)
2886 {
2887 foreach_list_typed(nir_cf_node, node, node, list) {
2888 switch (node->type) {
2889 case nir_cf_node_block:
2890 ntt_emit_block(c, nir_cf_node_as_block(node));
2891 break;
2892
2893 case nir_cf_node_if:
2894 ntt_emit_if(c, nir_cf_node_as_if(node));
2895 break;
2896
2897 case nir_cf_node_loop:
2898 ntt_emit_loop(c, nir_cf_node_as_loop(node));
2899 break;
2900
2901 default:
2902 unreachable("unknown CF type");
2903 }
2904 }
2905 }
2906
2907 static void
2908 ntt_emit_block_ureg(struct ntt_compile *c, struct nir_block *block)
2909 {
2910 struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
2911
2912 /* Emit the ntt insns to tgsi_ureg. */
2913 util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
2914 const struct tgsi_opcode_info *opcode_info =
2915 tgsi_get_opcode_info(insn->opcode);
2916
2917 switch (insn->opcode) {
2918 case TGSI_OPCODE_UIF:
2919 ureg_UIF(c->ureg, insn->src[0], &c->cf_label);
2920 break;
2921
2922 case TGSI_OPCODE_IF:
2923 ureg_IF(c->ureg, insn->src[0], &c->cf_label);
2924 break;
2925
2926 case TGSI_OPCODE_ELSE:
2927 ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
2928 ureg_ELSE(c->ureg, &c->cf_label);
2929 c->current_if_else = c->cf_label;
2930 break;
2931
2932 case TGSI_OPCODE_ENDIF:
2933 ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
2934 ureg_ENDIF(c->ureg);
2935 break;
2936
2937 case TGSI_OPCODE_BGNLOOP:
2938 /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
2939 * does reference BGNLOOP's. Follow the former behavior unless something comes up
2940 * with a need.
2941 */
2942 ureg_BGNLOOP(c->ureg, &c->cf_label);
2943 break;
2944
2945 case TGSI_OPCODE_ENDLOOP:
2946 ureg_ENDLOOP(c->ureg, &c->cf_label);
2947 break;
2948
2949 default:
2950 if (insn->is_tex) {
2951 ureg_tex_insn(c->ureg, insn->opcode,
2952 insn->dst, opcode_info->num_dst,
2953 insn->tex_target, insn->tex_return_type,
2954 &insn->tex_offset,
2955 insn->tex_offset.File != TGSI_FILE_NULL ? 1 : 0,
2956 insn->src, opcode_info->num_src);
2957 } else if (insn->is_mem) {
2958 ureg_memory_insn(c->ureg, insn->opcode,
2959 insn->dst, opcode_info->num_dst,
2960 insn->src, opcode_info->num_src,
2961 insn->mem_qualifier,
2962 insn->tex_target,
2963 insn->mem_format);
2964 } else {
2965 ureg_insn(c->ureg, insn->opcode,
2966 insn->dst, opcode_info->num_dst,
2967 insn->src, opcode_info->num_src,
2968 insn->precise);
2969 }
2970 }
2971 }
2972 }
2973
2974 static void
2975 ntt_emit_if_ureg(struct ntt_compile *c, nir_if *if_stmt)
2976 {
2977 /* Note: the last block emitted our IF opcode. */
2978
2979 int if_stack = c->current_if_else;
2980 c->current_if_else = c->cf_label;
2981
2982 /* Either the then or else block includes the ENDIF, which will fix up the
2983 * IF(/ELSE)'s label for jumping
2984 */
2985 ntt_emit_cf_list_ureg(c, &if_stmt->then_list);
2986 ntt_emit_cf_list_ureg(c, &if_stmt->else_list);
2987
2988 c->current_if_else = if_stack;
2989 }
2990
2991 static void
2992 ntt_emit_cf_list_ureg(struct ntt_compile *c, struct exec_list *list)
2993 {
2994 foreach_list_typed(nir_cf_node, node, node, list) {
2995 switch (node->type) {
2996 case nir_cf_node_block:
2997 ntt_emit_block_ureg(c, nir_cf_node_as_block(node));
2998 break;
2999
3000 case nir_cf_node_if:
3001 ntt_emit_if_ureg(c, nir_cf_node_as_if(node));
3002 break;
3003
3004 case nir_cf_node_loop:
3005 /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
3006 * does reference BGNLOOP's. Follow the former behavior unless something comes up
3007 * with a need.
3008 */
3009 ntt_emit_cf_list_ureg(c, &nir_cf_node_as_loop(node)->body);
3010 break;
3011
3012 default:
3013 unreachable("unknown CF type");
3014 }
3015 }
3016 }
3017
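/* Top-level per-impl emission: a first pass over the NIR CF list builds
 * per-block lists of struct ntt_insn, register allocation then maps the
 * virtual temps onto TGSI TEMPs, and a second pass turns those ntt_insns into
 * actual ureg/TGSI tokens.
 */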
3018 static void
3019 ntt_emit_impl(struct ntt_compile *c, nir_function_impl *impl)
3020 {
3021 c->impl = impl;
3022
3023 c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc);
3024 c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->reg_alloc);
3025
3026 /* Set up the struct ntt_blocks to put insns in */
3027 c->blocks = _mesa_pointer_hash_table_create(c);
3028 nir_foreach_block(block, impl) {
3029 struct ntt_block *ntt_block = rzalloc(c->blocks, struct ntt_block);
3030 util_dynarray_init(&ntt_block->insns, ntt_block);
3031 _mesa_hash_table_insert(c->blocks, block, ntt_block);
3032 }
3033
3034
3035 ntt_setup_registers(c, &impl->registers);
3036
3037 c->cur_block = ntt_block_from_nir(c, nir_start_block(impl));
3038 ntt_setup_inputs(c);
3039 ntt_setup_outputs(c);
3040 ntt_setup_uniforms(c);
3041
3042 /* Emit the ntt insns */
3043 ntt_emit_cf_list(c, &impl->body);
3044
3045    /* Don't do optimized RA if the driver requests unoptimized RA, unless the
3046     * number of temps is too large to fit in the 16-bit signed int that TGSI
3047     * allocates for the register index. */
3048 if (!c->options->unoptimized_ra || c->num_temps > 0x7fff)
3049 ntt_allocate_regs(c, impl);
3050 else
3051 ntt_allocate_regs_unoptimized(c, impl);
3052
3053 /* Turn the ntt insns into actual TGSI tokens */
3054 ntt_emit_cf_list_ureg(c, &impl->body);
3055
3056 ralloc_free(c->liveness);
3057 c->liveness = NULL;
3058
3059 }
3060
3061 static int
3062 type_size(const struct glsl_type *type, bool bindless)
3063 {
3064 return glsl_count_attribute_slots(type, false);
3065 }
3066
3067 /* Allow vectorizing of ALU instructions, but avoid vectorizing past what we
3068 * can handle for 64-bit values in TGSI.
3069 */
3070 static uint8_t
3071 ntt_should_vectorize_instr(const nir_instr *instr, const void *data)
3072 {
3073 if (instr->type != nir_instr_type_alu)
3074 return 0;
3075
3076 nir_alu_instr *alu = nir_instr_as_alu(instr);
3077
3078 switch (alu->op) {
3079 case nir_op_ibitfield_extract:
3080 case nir_op_ubitfield_extract:
3081 case nir_op_bitfield_insert:
3082 /* virglrenderer only looks at the .x channel of the offset/bits operands
3083 * when translating to GLSL. tgsi.rst doesn't seem to require scalar
3084 * offset/bits operands.
3085 *
3086 * https://gitlab.freedesktop.org/virgl/virglrenderer/-/issues/195
3087 */
3088 return 1;
3089
3090 default:
3091 break;
3092 }
3093
3094 int src_bit_size = nir_src_bit_size(alu->src[0].src);
3095 int dst_bit_size = nir_dest_bit_size(alu->dest.dest);
3096
3097 if (src_bit_size == 64 || dst_bit_size == 64) {
3098 /* Avoid vectorizing 64-bit instructions at all. Despite tgsi.rst
3099 * claiming support, virglrenderer generates bad shaders on the host when
3100 * presented with them. Maybe we can make virgl avoid tickling the
3101 * virglrenderer bugs, but given that glsl-to-TGSI didn't generate vector
3102 * 64-bit instrs in the first place, I don't see much reason to care about
3103 * this.
3104 */
3105 return 1;
3106 }
3107
3108 return 4;
3109 }
3110
3111 static bool
3112 ntt_should_vectorize_io(unsigned align, unsigned bit_size,
3113 unsigned num_components, unsigned high_offset,
3114 nir_intrinsic_instr *low, nir_intrinsic_instr *high,
3115 void *data)
3116 {
3117 if (bit_size != 32)
3118 return false;
3119
3120    /* Our offset alignment should always be at least 4 bytes. */
3121 if (align < 4)
3122 return false;
3123
3124 /* No wrapping off the end of a TGSI reg. We could do a bit better by
3125 * looking at low's actual offset. XXX: With LOAD_CONSTBUF maybe we don't
3126 * need this restriction.
3127 */
3128 unsigned worst_start_component = align == 4 ? 3 : align / 4;
3129 if (worst_start_component + num_components > 4)
3130 return false;
3131
3132 return true;
3133 }
3134
3135 static nir_variable_mode
3136 ntt_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
3137 {
3138 unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
3139 unsigned indirect_mask = 0;
3140
3141 if (!screen->get_shader_param(screen, pipe_stage,
3142 PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) {
3143 indirect_mask |= nir_var_shader_in;
3144 }
3145
3146 if (!screen->get_shader_param(screen, pipe_stage,
3147 PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) {
3148 indirect_mask |= nir_var_shader_out;
3149 }
3150
3151 if (!screen->get_shader_param(screen, pipe_stage,
3152 PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
3153 indirect_mask |= nir_var_function_temp;
3154 }
3155
3156 return indirect_mask;
3157 }
3158
3159 static void
3160 ntt_optimize_nir(struct nir_shader *s, struct pipe_screen *screen)
3161 {
3162 bool progress;
3163 unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
3164 unsigned control_flow_depth =
3165 screen->get_shader_param(screen, pipe_stage,
3166 PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
3167 do {
3168 progress = false;
3169
3170 NIR_PASS_V(s, nir_lower_vars_to_ssa);
3171 NIR_PASS_V(s, nir_split_64bit_vec3_and_vec4);
3172
3173 NIR_PASS(progress, s, nir_copy_prop);
3174 NIR_PASS(progress, s, nir_opt_algebraic);
3175 NIR_PASS(progress, s, nir_opt_constant_folding);
3176 NIR_PASS(progress, s, nir_opt_remove_phis);
3177 NIR_PASS(progress, s, nir_opt_conditional_discard);
3178 NIR_PASS(progress, s, nir_opt_dce);
3179 NIR_PASS(progress, s, nir_opt_dead_cf);
3180 NIR_PASS(progress, s, nir_opt_cse);
3181 NIR_PASS(progress, s, nir_opt_find_array_copies);
3182 NIR_PASS(progress, s, nir_opt_copy_prop_vars);
3183 NIR_PASS(progress, s, nir_opt_dead_write_vars);
3184
3185 NIR_PASS(progress, s, nir_opt_if, nir_opt_if_aggressive_last_continue | nir_opt_if_optimize_phi_true_false);
3186 NIR_PASS(progress, s, nir_opt_peephole_select,
3187 control_flow_depth == 0 ? ~0 : 8, true, true);
3188 NIR_PASS(progress, s, nir_opt_algebraic);
3189 NIR_PASS(progress, s, nir_opt_constant_folding);
3190 nir_load_store_vectorize_options vectorize_opts = {
3191 .modes = nir_var_mem_ubo,
3192 .callback = ntt_should_vectorize_io,
3193 .robust_modes = 0,
3194 };
3195 NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
3196 NIR_PASS(progress, s, nir_opt_shrink_stores, true);
3197 NIR_PASS(progress, s, nir_opt_shrink_vectors);
3198 NIR_PASS(progress, s, nir_opt_trivial_continues);
3199 NIR_PASS(progress, s, nir_opt_vectorize, ntt_should_vectorize_instr, NULL);
3200 NIR_PASS(progress, s, nir_opt_undef);
3201 NIR_PASS(progress, s, nir_opt_loop_unroll);
3202
3203 /* Try to fold addressing math into ubo_vec4's base to avoid load_consts
3204 * and ALU ops for it.
3205 */
3206 static const nir_opt_offsets_options offset_options = {
3207 .ubo_vec4_max = ~0,
3208
3209 /* No const offset in TGSI for shared accesses. */
3210 .shared_max = 0,
3211
3212 /* unused intrinsics */
3213 .uniform_max = 0,
3214 .buffer_max = 0,
3215 };
3216 NIR_PASS(progress, s, nir_opt_offsets, &offset_options);
3217 } while (progress);
3218
3219 NIR_PASS_V(s, nir_lower_var_copies);
3220 }
3221
3222 /* Scalarizes all 64-bit ALU ops. Note that we only actually need to
3223  * scalarize vec3/vec4s; we should probably fix that.
3224 */
3225 static bool
3226 scalarize_64bit(const nir_instr *instr, const void *data)
3227 {
3228 const nir_alu_instr *alu = nir_instr_as_alu(instr);
3229
3230 return (nir_dest_bit_size(alu->dest.dest) == 64 ||
3231 nir_src_bit_size(alu->src[0].src) == 64);
3232 }
3233
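/* Splits wide (3- or 4-component) 64-bit I/O intrinsics into a vec2 half plus
 * a remainder, since a TGSI register can only hold two 64-bit values.
 * Illustrative example (not verbatim): a dvec4 load_ubo becomes two loads, the
 * second at the original byte offset + 16, with the results re-assembled into
 * a vector afterwards.
 */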
3234 static bool
3235 nir_to_tgsi_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
3236 {
3237 b->cursor = nir_after_instr(&instr->instr);
3238
3239 switch (instr->intrinsic) {
3240 case nir_intrinsic_load_ubo:
3241 case nir_intrinsic_load_ubo_vec4:
3242 case nir_intrinsic_load_ssbo:
3243 case nir_intrinsic_load_input:
3244 case nir_intrinsic_load_interpolated_input:
3245 case nir_intrinsic_load_per_vertex_input:
3246 case nir_intrinsic_store_output:
3247 case nir_intrinsic_store_per_vertex_output:
3248 case nir_intrinsic_store_ssbo:
3249 break;
3250 default:
3251 return false;
3252 }
3253
3254 if (instr->num_components <= 2)
3255 return false;
3256
3257 bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
3258 if (has_dest) {
3259 if (nir_dest_bit_size(instr->dest) != 64)
3260 return false;
3261 } else {
3262 if (nir_src_bit_size(instr->src[0]) != 64)
3263 return false;
3264 }
3265
3266 nir_intrinsic_instr *first =
3267 nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
3268 nir_intrinsic_instr *second =
3269 nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
3270
3271 switch (instr->intrinsic) {
3272 case nir_intrinsic_load_ubo:
3273 case nir_intrinsic_load_ubo_vec4:
3274 case nir_intrinsic_load_ssbo:
3275 case nir_intrinsic_store_ssbo:
3276 break;
3277
3278 default: {
3279 nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
3280 semantics.location++;
3281 semantics.num_slots--;
3282 nir_intrinsic_set_io_semantics(second, semantics);
3283
3284 nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
3285 break;
3286 }
3287 }
3288
3289 first->num_components = 2;
3290 second->num_components -= 2;
3291 if (has_dest) {
3292 first->dest.ssa.num_components = 2;
3293 second->dest.ssa.num_components -= 2;
3294 }
3295
3296 nir_builder_instr_insert(b, &first->instr);
3297 nir_builder_instr_insert(b, &second->instr);
3298
3299 if (has_dest) {
3300 /* Merge the two loads' results back into a vector. */
3301 nir_ssa_scalar channels[4] = {
3302 nir_get_ssa_scalar(&first->dest.ssa, 0),
3303 nir_get_ssa_scalar(&first->dest.ssa, 1),
3304 nir_get_ssa_scalar(&second->dest.ssa, 0),
3305 nir_get_ssa_scalar(&second->dest.ssa, second->num_components > 1 ? 1 : 0),
3306 };
3307 nir_ssa_def *new = nir_vec_scalars(b, channels, instr->num_components);
3308 nir_ssa_def_rewrite_uses(&instr->dest.ssa, new);
3309 } else {
3310 /* Split the src value across the two stores. */
3311 b->cursor = nir_before_instr(&instr->instr);
3312
3313 nir_ssa_def *src0 = instr->src[0].ssa;
3314 nir_ssa_scalar channels[4] = { 0 };
3315 for (int i = 0; i < instr->num_components; i++)
3316 channels[i] = nir_get_ssa_scalar(src0, i);
3317
3318 nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
3319 nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);
3320
3321 nir_instr_rewrite_src(&first->instr, &first->src[0],
3322 nir_src_for_ssa(nir_vec_scalars(b, channels, 2)));
3323 nir_instr_rewrite_src(&second->instr, &second->src[0],
3324 nir_src_for_ssa(nir_vec_scalars(b, &channels[2],
3325 second->num_components)));
3326 }
3327
3328 int offset_src = -1;
3329 uint32_t offset_amount = 16;
3330
3331 switch (instr->intrinsic) {
3332 case nir_intrinsic_load_ssbo:
3333 case nir_intrinsic_load_ubo:
3334 offset_src = 1;
3335 break;
3336 case nir_intrinsic_load_ubo_vec4:
3337 offset_src = 1;
3338 offset_amount = 1;
3339 break;
3340 case nir_intrinsic_store_ssbo:
3341 offset_src = 2;
3342 break;
3343 default:
3344 break;
3345 }
3346 if (offset_src != -1) {
3347 b->cursor = nir_before_instr(&second->instr);
3348 nir_ssa_def *second_offset =
3349 nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
3350 nir_instr_rewrite_src(&second->instr, &second->src[offset_src],
3351 nir_src_for_ssa(second_offset));
3352 }
3353
3354 /* DCE stores we generated with no writemask (nothing else does this
3355 * currently).
3356 */
3357 if (!has_dest) {
3358 if (nir_intrinsic_write_mask(first) == 0)
3359 nir_instr_remove(&first->instr);
3360 if (nir_intrinsic_write_mask(second) == 0)
3361 nir_instr_remove(&second->instr);
3362 }
3363
3364 nir_instr_remove(&instr->instr);
3365
3366 return true;
3367 }
3368
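/* Splits a 3- or 4-component 64-bit load_const into a dvec2 constant plus a
 * second constant holding the remaining components, then re-vectorizes the
 * result for the original users.
 */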
3369 static bool
3370 nir_to_tgsi_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
3371 {
3372 int num_components = instr->def.num_components;
3373
3374 if (instr->def.bit_size != 64 || num_components <= 2)
3375 return false;
3376
3377 b->cursor = nir_before_instr(&instr->instr);
3378
3379 nir_load_const_instr *first =
3380 nir_load_const_instr_create(b->shader, 2, 64);
3381 nir_load_const_instr *second =
3382 nir_load_const_instr_create(b->shader, num_components - 2, 64);
3383
3384 first->value[0] = instr->value[0];
3385 first->value[1] = instr->value[1];
3386 second->value[0] = instr->value[2];
3387 if (num_components == 4)
3388 second->value[1] = instr->value[3];
3389
3390 nir_builder_instr_insert(b, &first->instr);
3391 nir_builder_instr_insert(b, &second->instr);
3392
3393 nir_ssa_def *channels[4] = {
3394 nir_channel(b, &first->def, 0),
3395 nir_channel(b, &first->def, 1),
3396 nir_channel(b, &second->def, 0),
3397 num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
3398 };
3399 nir_ssa_def *new = nir_vec(b, channels, num_components);
3400 nir_ssa_def_rewrite_uses(&instr->def, new);
3401 nir_instr_remove(&instr->instr);
3402
3403 return true;
3404 }
3405
3406 static bool
3407 nir_to_tgsi_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr,
3408 void *data)
3409 {
3410 switch (instr->type) {
3411 case nir_instr_type_load_const:
3412 return nir_to_tgsi_lower_64bit_load_const(b, nir_instr_as_load_const(instr));
3413
3414 case nir_instr_type_intrinsic:
3415 return nir_to_tgsi_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
3416 default:
3417 return false;
3418 }
3419 }
3420
3421 static bool
3422 nir_to_tgsi_lower_64bit_to_vec2(nir_shader *s)
3423 {
3424 return nir_shader_instructions_pass(s,
3425 nir_to_tgsi_lower_64bit_to_vec2_instr,
3426 nir_metadata_block_index |
3427 nir_metadata_dominance,
3428 NULL);
3429 }
3430
3431 struct ntt_lower_tex_state {
3432 nir_ssa_scalar channels[8];
3433 unsigned i;
3434 };
3435
3436 static void
3437 nir_to_tgsi_lower_tex_instr_arg(nir_builder *b,
3438 nir_tex_instr *instr,
3439 nir_tex_src_type tex_src_type,
3440 struct ntt_lower_tex_state *s)
3441 {
3442 int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
3443 if (tex_src < 0)
3444 return;
3445
3446 assert(instr->src[tex_src].src.is_ssa);
3447
3448 nir_ssa_def *def = instr->src[tex_src].src.ssa;
3449 for (int i = 0; i < def->num_components; i++) {
3450 s->channels[s->i++] = nir_get_ssa_scalar(def, i);
3451 }
3452
3453 nir_tex_instr_remove_src(instr, tex_src);
3454 }
3455
3456 /**
3457 * Merges together a vec4 of tex coordinate/compare/bias/lod into a backend tex
3458 * src. This lets NIR handle the coalescing of the vec4 rather than trying to
3459 * manage it on our own, and may lead to more vectorization.
3460 */
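/* For example, a 2D shadow sample with a bias ends up packing
 * (coord.x, coord.y, comparator, bias) into nir_tex_src_backend1.
 */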
3461 static bool
3462 nir_to_tgsi_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data)
3463 {
3464 if (instr->type != nir_instr_type_tex)
3465 return false;
3466
3467 nir_tex_instr *tex = nir_instr_as_tex(instr);
3468
3469 if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0)
3470 return false;
3471
3472 b->cursor = nir_before_instr(instr);
3473
3474 struct ntt_lower_tex_state s = {0};
3475
3476 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_coord, &s);
3477 /* We always have at least two slots for the coordinate, even on 1D. */
3478 s.i = MAX2(s.i, 2);
3479
3480 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_comparator, &s);
3481 s.i = MAX2(s.i, 3);
3482
3483 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_bias, &s);
3484
3485 /* XXX: LZ */
3486 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s);
3487 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s);
3488 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s);
3489
3490 /* No need to pack undefs in unused channels of the tex instr */
3491 while (!s.channels[s.i - 1].def)
3492 s.i--;
3493
3494 /* Instead of putting undefs in the unused slots of the vecs, just put in
3495 * another used channel. Otherwise, we'll get unnecessary moves into
3496 * registers.
3497 */
3498 assert(s.channels[0].def != NULL);
3499 for (int i = 1; i < s.i; i++) {
3500 if (!s.channels[i].def)
3501 s.channels[i] = s.channels[0];
3502 }
3503
3504 nir_tex_instr_add_src(tex, nir_tex_src_backend1, nir_src_for_ssa(nir_vec_scalars(b, s.channels, MIN2(s.i, 4))));
3505 if (s.i > 4)
3506 nir_tex_instr_add_src(tex, nir_tex_src_backend2, nir_src_for_ssa(nir_vec_scalars(b, &s.channels[4], s.i - 4)));
3507
3508 return true;
3509 }
3510
3511 static bool
3512 nir_to_tgsi_lower_tex(nir_shader *s)
3513 {
3514 return nir_shader_instructions_pass(s,
3515 nir_to_tgsi_lower_tex_instr,
3516 nir_metadata_block_index |
3517 nir_metadata_dominance,
3518 NULL);
3519 }
3520
3521 static void
3522 ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s,
3523 const struct nir_to_tgsi_options *ntt_options)
3524 {
3525 const struct nir_shader_compiler_options *options = s->options;
3526 bool lower_fsqrt =
3527 !screen->get_shader_param(screen, pipe_shader_type_from_mesa(s->info.stage),
3528 PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED);
3529
3530 bool force_indirect_unrolling_sampler =
3531 screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL) < 400;
3532
3533 nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
3534
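/* If the driver's compiler options don't already request all the lowerings
 * this backend relies on, clone the options and force the missing ones on.
 */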
3535 if (!options->lower_extract_byte ||
3536 !options->lower_extract_word ||
3537 !options->lower_insert_byte ||
3538 !options->lower_insert_word ||
3539 !options->lower_fdph ||
3540 !options->lower_flrp64 ||
3541 !options->lower_fmod ||
3542 !options->lower_rotate ||
3543 !options->lower_uadd_sat ||
3544 !options->lower_usub_sat ||
3545 !options->lower_uniforms_to_ubo ||
3546 !options->lower_vector_cmp ||
3547 options->lower_fsqrt != lower_fsqrt ||
3548 options->force_indirect_unrolling != no_indirects_mask ||
3549 force_indirect_unrolling_sampler) {
3550 nir_shader_compiler_options *new_options = ralloc(s, nir_shader_compiler_options);
3551 *new_options = *s->options;
3552
3553 new_options->lower_extract_byte = true;
3554 new_options->lower_extract_word = true;
3555 new_options->lower_insert_byte = true;
3556 new_options->lower_insert_word = true;
3557 new_options->lower_fdph = true;
3558 new_options->lower_flrp64 = true;
3559 new_options->lower_fmod = true;
3560 new_options->lower_rotate = true;
3561 new_options->lower_uadd_sat = true;
3562 new_options->lower_usub_sat = true;
3563 new_options->lower_uniforms_to_ubo = true;
3564 new_options->lower_vector_cmp = true;
3565 new_options->lower_fsqrt = lower_fsqrt;
3566 new_options->force_indirect_unrolling = no_indirects_mask;
3567 new_options->force_indirect_unrolling_sampler = force_indirect_unrolling_sampler;
3568
3569 s->options = new_options;
3570 }
3571 }
3572
3573 static bool
3574 ntt_lower_atomic_pre_dec_filter(const nir_instr *instr, const void *_data)
3575 {
3576 return (instr->type == nir_instr_type_intrinsic &&
3577 nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_atomic_counter_pre_dec);
3578 }
3579
3580 static nir_ssa_def *
3581 ntt_lower_atomic_pre_dec_lower(nir_builder *b, nir_instr *instr, void *_data)
3582 {
3583 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3584
3585 nir_ssa_def *old_result = &intr->dest.ssa;
3586 intr->intrinsic = nir_intrinsic_atomic_counter_post_dec;
3587
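/* post_dec returns the value from before the decrement, so subtract 1 to
 * produce the decremented value that pre_dec is expected to return.
 */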
3588 return nir_iadd_imm(b, old_result, -1);
3589 }
3590
3591 static bool
3592 ntt_lower_atomic_pre_dec(nir_shader *s)
3593 {
3594 return nir_shader_lower_instructions(s,
3595 ntt_lower_atomic_pre_dec_filter,
3596 ntt_lower_atomic_pre_dec_lower, NULL);
3597 }
3598
3599 /* Lowers texture projectors if we can't do them as TGSI_OPCODE_TXP. */
3600 static void
3601 nir_to_tgsi_lower_txp(nir_shader *s)
3602 {
3603 nir_lower_tex_options lower_tex_options = {
3604 .lower_txp = 0,
3605 };
3606
3607 nir_foreach_block(block, nir_shader_get_entrypoint(s)) {
3608 nir_foreach_instr(instr, block) {
3609 if (instr->type != nir_instr_type_tex)
3610 continue;
3611 nir_tex_instr *tex = nir_instr_as_tex(instr);
3612
3613 if (nir_tex_instr_src_index(tex, nir_tex_src_projector) < 0)
3614 continue;
3615
3616 bool has_compare = nir_tex_instr_src_index(tex, nir_tex_src_comparator) >= 0;
3617 bool has_lod = nir_tex_instr_src_index(tex, nir_tex_src_lod) >= 0 || s->info.stage != MESA_SHADER_FRAGMENT;
3618 bool has_offset = nir_tex_instr_src_index(tex, nir_tex_src_offset) >= 0;
3619
3620 /* We can do TXP for any tex (not txg) where we can fit all the
3621 * coordinates and comparator and projector in one vec4 without any
3622 * other modifiers to add on.
3623 *
3624 * nir_lower_tex() only handles the lowering on a sampler-dim basis, so
3625 * if we get any funny projectors then we just blow them all away.
3626 */
3627 if (tex->op != nir_texop_tex || has_lod || has_offset || (tex->coord_components >= 3 && has_compare))
3628 lower_tex_options.lower_txp |= 1 << tex->sampler_dim;
3629 }
3630 }
3631
3632 /* nir_lower_tex must be run even if no options are set, because we need the
3633 * LOD to be set for query_levels and for non-fragment shaders.
3634 */
3635 NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
3636 }
3637
3638 static bool
3639 nir_lower_primid_sysval_to_input_filter(const nir_instr *instr, const void *_data)
3640 {
3641 return (instr->type == nir_instr_type_intrinsic &&
3642 nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_primitive_id);
3643 }
3644
3645 static nir_ssa_def *
3646 nir_lower_primid_sysval_to_input_lower(nir_builder *b, nir_instr *instr, void *data)
3647 {
3648 nir_variable *var = *(nir_variable **)data;
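/* Create the gl_PrimitiveID input variable on first use and stash it back
 * through the data pointer so every lowered load shares one declaration.
 */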
3649 if (!var) {
3650 var = nir_variable_create(b->shader, nir_var_shader_in, glsl_uint_type(), "gl_PrimitiveID");
3651 var->data.location = VARYING_SLOT_PRIMITIVE_ID;
3652 b->shader->info.inputs_read |= VARYING_BIT_PRIMITIVE_ID;
3653 var->data.driver_location = b->shader->num_inputs++;
3654
3655 *(nir_variable **)data = var;
3656 }
3657
3658 nir_io_semantics semantics = {
3659 .location = var->data.location,
3660 .num_slots = 1
3661 };
3662 return nir_load_input(b, 1, 32, nir_imm_int(b, 0),
3663 .base = var->data.driver_location,
3664 .io_semantics = semantics);
3665 }
3666
3667 static bool
3668 nir_lower_primid_sysval_to_input(nir_shader *s)
3669 {
3670 nir_variable *input = NULL;
3671
3672 return nir_shader_lower_instructions(s,
3673 nir_lower_primid_sysval_to_input_filter,
3674 nir_lower_primid_sysval_to_input_lower, &input);
3675 }
3676
3677 const void *
3678 nir_to_tgsi(struct nir_shader *s,
3679 struct pipe_screen *screen)
3680 {
3681 static const struct nir_to_tgsi_options default_ntt_options = {0};
3682 return nir_to_tgsi_options(s, screen, &default_ntt_options);
3683 }
3684
3685 /* Prevent nir_lower_vec_to_movs from coalescing 64-to-32 conversions and comparisons
3686 * into unsupported channels of registers.
3687 */
3688 static bool
3689 ntt_vec_to_mov_writemask_cb(const nir_instr *instr, unsigned writemask, UNUSED const void *_data)
3690 {
3691 if (instr->type != nir_instr_type_alu)
3692 return false;
3693
3694 nir_alu_instr *alu = nir_instr_as_alu(instr);
3695 int dst_32 = nir_dest_bit_size(alu->dest.dest) == 32;
3696 int src_64 = nir_src_bit_size(alu->src[0].src) == 64;
3697
3698 if (src_64 && dst_32) {
3699 int num_srcs = nir_op_infos[alu->op].num_inputs;
3700
3701 if (num_srcs == 2 || nir_op_infos[alu->op].output_type == nir_type_bool32) {
3702 /* TGSI's 64-bit compares storing to 32-bit are weird and write .xz
3703 * instead of .xy. Just support scalar compares storing to .x;
3704 * GLSL-to-TGSI only ever emitted scalar ops anyway.
3705 */
3706 if (writemask != TGSI_WRITEMASK_X)
3707 return false;
3708 } else {
3709 /* TGSI's 64-to-32-bit conversions can only store to .xy (since a TGSI
3710 * register can only store a dvec2). Don't try to coalesce to write to
3711 * .zw.
3712 */
3713 if (writemask & ~(TGSI_WRITEMASK_XY))
3714 return false;
3715 }
3716 }
3717
3718 return true;
3719 }
3720
3721 /**
3722 * Translates the NIR shader to TGSI.
3723 *
3724 * This requires some lowering of the NIR shader to prepare it for translation.
3725 * We take ownership of the NIR shader passed, returning a reference to the new
3726 * TGSI tokens instead. If you need to keep the NIR, then pass us a clone.
3727 */
3728 const void *nir_to_tgsi_options(struct nir_shader *s,
3729 struct pipe_screen *screen,
3730 const struct nir_to_tgsi_options *options)
3731 {
3732 struct ntt_compile *c;
3733 const void *tgsi_tokens;
3734 nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
3735 bool native_integers = screen->get_shader_param(screen,
3736 pipe_shader_type_from_mesa(s->info.stage),
3737 PIPE_SHADER_CAP_INTEGERS);
3738 const struct nir_shader_compiler_options *original_options = s->options;
3739
3740 ntt_fix_nir_options(screen, s, options);
3741
3742 /* Lower array indexing on FS inputs. Since we don't set
3743 * ureg->supports_any_inout_decl_range, the TGSI input decls will be split to
3744 * elements by ureg, and so dynamically indexing them would be invalid.
3745 * Ideally we would set that ureg flag based on
3746 * PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE, but can't due to mesa/st
3747 * splitting NIR VS outputs to elements even if the FS doesn't get the
3748 * corresponding splitting, and virgl depends on TGSI across link boundaries
3749 * having matching declarations.
3750 */
3751 if (s->info.stage == MESA_SHADER_FRAGMENT) {
3752 NIR_PASS_V(s, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX);
3753 NIR_PASS_V(s, nir_remove_dead_variables, nir_var_shader_in, NULL);
3754 }
3755
3756 NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
3757 type_size, (nir_lower_io_options)0);
3758 NIR_PASS_V(s, nir_lower_regs_to_ssa);
3759
3760 nir_to_tgsi_lower_txp(s);
3761 NIR_PASS_V(s, nir_to_tgsi_lower_tex);
3762
3763 /* While TGSI can represent PRIMID as either an input or a system value,
3764 * glsl-to-tgsi had the GS (not TCS or TES) primid as an input, and drivers
3765 * depend on that.
3766 */
3767 if (s->info.stage == MESA_SHADER_GEOMETRY)
3768 NIR_PASS_V(s, nir_lower_primid_sysval_to_input);
3769
3770 if (s->info.num_abos)
3771 NIR_PASS_V(s, ntt_lower_atomic_pre_dec);
3772
3773 if (!original_options->lower_uniforms_to_ubo) {
3774 NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
3775 screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
3776 !native_integers);
3777 }
3778
3779 /* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --
3780 * TGSI stores up to a vec2 in each slot, so to avoid a whole bunch of op
3781 * duplication logic we just make it so that we only see vec2s.
3782 */
3783 NIR_PASS_V(s, nir_lower_alu_to_scalar, scalarize_64bit, NULL);
3784 NIR_PASS_V(s, nir_to_tgsi_lower_64bit_to_vec2);
3785
3786 if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
3787 NIR_PASS_V(s, nir_lower_ubo_vec4);
3788
3789 ntt_optimize_nir(s, screen);
3790
3791 NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);
3792
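/* Run late algebraic optimizations to a fixed point, cleaning up after any
 * round that makes progress.
 */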
3793 bool progress;
3794 do {
3795 progress = false;
3796 NIR_PASS(progress, s, nir_opt_algebraic_late);
3797 if (progress) {
3798 NIR_PASS_V(s, nir_copy_prop);
3799 NIR_PASS_V(s, nir_opt_dce);
3800 NIR_PASS_V(s, nir_opt_cse);
3801 }
3802 } while (progress);
3803
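/* With native integer support, lower booleans to 32-bit integers; otherwise
 * lower both integers and booleans to floats for the float-only TGSI path.
 */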
3804 if (screen->get_shader_param(screen,
3805 pipe_shader_type_from_mesa(s->info.stage),
3806 PIPE_SHADER_CAP_INTEGERS)) {
3807 NIR_PASS_V(s, nir_lower_bool_to_int32);
3808 } else {
3809 NIR_PASS_V(s, nir_lower_int_to_float);
3810 NIR_PASS_V(s, nir_lower_bool_to_float);
3811 /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
3812 NIR_PASS_V(s, nir_copy_prop);
3813 NIR_PASS_V(s, nir_opt_dce);
3814 }
3815
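/* Pull constants, input/UBO/SSBO loads, comparisons, and copies down toward
 * their uses, which tends to shorten live ranges in the resulting TGSI.
 */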
3816 nir_move_options move_all =
3817 nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
3818 nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
3819
3820 NIR_PASS_V(s, nir_opt_move, move_all);
3821
3822 /* Only lower 32-bit floats. The only other modifier type officially
3823 * supported by TGSI is 32-bit integer negates, but even those are broken on
3824 * virglrenderer, so skip lowering all integer and f64 float mods.
3825 *
3826 * The options->lower_fabs requests that we not have native source modifiers
3827 * for fabs, and instead emit MAX(a,-a) for nir_op_fabs.
3828 */
3829 nir_lower_to_source_mods_flags source_mods = nir_lower_fneg_source_mods;
3830 if (!options->lower_fabs)
3831 source_mods |= nir_lower_fabs_source_mods;
3832 NIR_PASS_V(s, nir_lower_to_source_mods, source_mods);
3833
3834 NIR_PASS_V(s, nir_convert_from_ssa, true);
3835 NIR_PASS_V(s, nir_lower_vec_to_movs, ntt_vec_to_mov_writemask_cb, NULL);
3836
3837 /* locals_to_regs will leave dead derefs that are good to clean up. */
3838 NIR_PASS_V(s, nir_lower_locals_to_regs);
3839 NIR_PASS_V(s, nir_opt_dce);
3840
3841 if (NIR_DEBUG(TGSI)) {
3842 fprintf(stderr, "NIR before translation to TGSI:\n");
3843 nir_print_shader(s, stderr);
3844 }
3845
3846 c = rzalloc(NULL, struct ntt_compile);
3847 c->screen = screen;
3848 c->options = options;
3849
3850 c->needs_texcoord_semantic =
3851 screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD);
3852 c->has_txf_lz =
3853 screen->get_param(screen, PIPE_CAP_TGSI_TEX_TXF_LZ);
3854
3855 c->s = s;
3856 c->native_integers = native_integers;
3857 c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
3858 ureg_setup_shader_info(c->ureg, &s->info);
3859 if (s->info.use_legacy_math_rules && screen->get_param(screen, PIPE_CAP_LEGACY_MATH_RULES))
3860 ureg_property(c->ureg, TGSI_PROPERTY_LEGACY_MATH_RULES, 1);
3861
3862 if (s->info.stage == MESA_SHADER_FRAGMENT) {
3863 /* The draw module's polygon stipple layer doesn't respect the chosen
3864 * coordinate mode, so leave it as unspecified unless we're actually
3865 * reading the position in the shader already. See
3866 * gl-2.1-polygon-stipple-fs on softpipe.
3867 */
3868 if ((s->info.inputs_read & VARYING_BIT_POS) ||
3869 BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
3870 ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
3871 s->info.fs.origin_upper_left ?
3872 TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
3873 TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
3874
3875 ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
3876 s->info.fs.pixel_center_integer ?
3877 TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
3878 TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
3879 }
3880 }
3881 /* Emit the main function */
3882 nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
3883 ntt_emit_impl(c, impl);
3884 ureg_END(c->ureg);
3885
3886 tgsi_tokens = ureg_get_tokens(c->ureg, NULL);
3887
3888 if (NIR_DEBUG(TGSI)) {
3889 fprintf(stderr, "TGSI after translation from NIR:\n");
3890 tgsi_dump(tgsi_tokens, 0);
3891 }
3892
3893 ureg_destroy(c->ureg);
3894
3895 ralloc_free(c);
3896 ralloc_free(s);
3897
3898 return tgsi_tokens;
3899 }
3900
3901 static const nir_shader_compiler_options nir_to_tgsi_compiler_options = {
3902 .fdot_replicates = true,
3903 .fuse_ffma32 = true,
3904 .fuse_ffma64 = true,
3905 .lower_extract_byte = true,
3906 .lower_extract_word = true,
3907 .lower_insert_byte = true,
3908 .lower_insert_word = true,
3909 .lower_fdph = true,
3910 .lower_flrp64 = true,
3911 .lower_fmod = true,
3912 .lower_rotate = true,
3913 .lower_uniforms_to_ubo = true,
3914 .lower_uadd_sat = true,
3915 .lower_usub_sat = true,
3916 .lower_vector_cmp = true,
3917 .lower_int64_options = nir_lower_imul_2x32_64,
3918 .use_interpolated_input_intrinsics = true,
3919 };
3920
3921 /* Returns default compiler options for drivers with only nir-to-tgsi-based
3922 * NIR support.
3923 */
3924 const void *
3925 nir_to_tgsi_get_compiler_options(struct pipe_screen *pscreen,
3926 enum pipe_shader_ir ir,
3927 unsigned shader)
3928 {
3929 assert(ir == PIPE_SHADER_IR_NIR);
3930 return &nir_to_tgsi_compiler_options;
3931 }
3932
3933 /** Helper for getting TGSI tokens to store for a pipe_shader_state CSO. */
3934 const void *
3935 pipe_shader_state_to_tgsi_tokens(struct pipe_screen *screen,
3936 const struct pipe_shader_state *cso)
3937 {
3938 if (cso->type == PIPE_SHADER_IR_NIR) {
3939 return nir_to_tgsi((nir_shader *)cso->ir.nir, screen);
3940 } else {
3941 assert(cso->type == PIPE_SHADER_IR_TGSI);
3942 /* we need to keep a local copy of the tokens */
3943 return tgsi_dup_tokens(cso->tokens);
3944 }
3945 }
3946