1 /*
2 * Copyright © 2014-2015 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "compiler/nir/nir.h"
25 #include "compiler/nir/nir_deref.h"
26 #include "compiler/nir/nir_legacy.h"
27 #include "compiler/nir/nir_worklist.h"
28 #include "nir_to_rc.h"
29 #include "r300_nir.h"
30 #include "r300_screen.h"
31 #include "pipe/p_screen.h"
32 #include "pipe/p_state.h"
33 #include "tgsi/tgsi_dump.h"
34 #include "tgsi/tgsi_from_mesa.h"
35 #include "tgsi/tgsi_info.h"
36 #include "tgsi/tgsi_parse.h"
37 #include "tgsi/tgsi_ureg.h"
38 #include "tgsi/tgsi_util.h"
39 #include "util/u_debug.h"
40 #include "util/u_math.h"
41 #include "util/u_memory.h"
42 #include "util/u_dynarray.h"
43
44 struct ntr_insn {
45 enum tgsi_opcode opcode;
46 struct ureg_dst dst[2];
47 struct ureg_src src[4];
48 enum tgsi_texture_type tex_target;
49 enum tgsi_return_type tex_return_type;
50 struct tgsi_texture_offset tex_offset[4];
51
52 unsigned mem_qualifier;
53 enum pipe_format mem_format;
54
55 bool is_tex : 1;
56 bool precise : 1;
57 };
58
59 struct ntr_block {
60 /* Array of struct ntr_insn */
61 struct util_dynarray insns;
62 int start_ip;
63 int end_ip;
64 };
65
66 struct ntr_reg_interval {
67 uint32_t start, end;
68 };
69
70 struct ntr_compile {
71 nir_shader *s;
72 nir_function_impl *impl;
73 const struct nir_to_rc_options *options;
74 struct pipe_screen *screen;
75 struct ureg_program *ureg;
76
77 bool addr_declared[3];
78 struct ureg_dst addr_reg[3];
79
80 /* if condition set up at the end of a block, for ntr_emit_if(). */
81 struct ureg_src if_cond;
82
83 /* TGSI temps for our NIR SSA and register values. */
84 struct ureg_dst *reg_temp;
85 struct ureg_src *ssa_temp;
86
87 struct ntr_reg_interval *liveness;
88
89 /* Map from nir_block to ntr_block */
90 struct hash_table *blocks;
91 struct ntr_block *cur_block;
92 unsigned current_if_else;
93 unsigned cf_label;
94
95 /* Whether we're currently emitting instructions for a precise NIR instruction. */
96 bool precise;
97
98 unsigned num_temps;
99 unsigned first_non_array_temp;
100
101 /* Mappings from driver_location to TGSI input/output number.
102 *
103 * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
104 * their numbers assigned incrementally, unlike inputs or constants.
105 */
106 struct ureg_src *input_index_map;
107 uint64_t centroid_inputs;
108
109 uint32_t first_ubo;
110 };
111
112 static struct ureg_dst
113 ntr_temp(struct ntr_compile *c)
114 {
115 return ureg_dst_register(TGSI_FILE_TEMPORARY, c->num_temps++);
116 }
117
118 static struct ntr_block *
119 ntr_block_from_nir(struct ntr_compile *c, struct nir_block *block)
120 {
121 struct hash_entry *entry = _mesa_hash_table_search(c->blocks, block);
122 return entry->data;
123 }
124
125 static void ntr_emit_cf_list(struct ntr_compile *c, struct exec_list *list);
126 static void ntr_emit_cf_list_ureg(struct ntr_compile *c, struct exec_list *list);
127
128 static struct ntr_insn *
129 ntr_insn(struct ntr_compile *c, enum tgsi_opcode opcode,
130 struct ureg_dst dst,
131 struct ureg_src src0, struct ureg_src src1,
132 struct ureg_src src2, struct ureg_src src3)
133 {
134 struct ntr_insn insn = {
135 .opcode = opcode,
136 .dst = { dst, ureg_dst_undef() },
137 .src = { src0, src1, src2, src3 },
138 .precise = c->precise,
139 };
140 util_dynarray_append(&c->cur_block->insns, struct ntr_insn, insn);
141 return util_dynarray_top_ptr(&c->cur_block->insns, struct ntr_insn);
142 }
143
144 #define OP00( op ) \
145 static inline void ntr_##op(struct ntr_compile *c) \
146 { \
147 ntr_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
148 }
149
150 #define OP01( op ) \
151 static inline void ntr_##op(struct ntr_compile *c, \
152 struct ureg_src src0) \
153 { \
154 ntr_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
155 }
156
157
158 #define OP10( op ) \
159 static inline void ntr_##op(struct ntr_compile *c, \
160 struct ureg_dst dst) \
161 { \
162 ntr_insn(c, TGSI_OPCODE_##op, dst, ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
163 }
164
165 #define OP11( op ) \
166 static inline void ntr_##op(struct ntr_compile *c, \
167 struct ureg_dst dst, \
168 struct ureg_src src0) \
169 { \
170 ntr_insn(c, TGSI_OPCODE_##op, dst, src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
171 }
172
173 #define OP12( op ) \
174 static inline void ntr_##op(struct ntr_compile *c, \
175 struct ureg_dst dst, \
176 struct ureg_src src0, \
177 struct ureg_src src1) \
178 { \
179 ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, ureg_src_undef(), ureg_src_undef()); \
180 }
181
182 #define OP13( op ) \
183 static inline void ntr_##op(struct ntr_compile *c, \
184 struct ureg_dst dst, \
185 struct ureg_src src0, \
186 struct ureg_src src1, \
187 struct ureg_src src2) \
188 { \
189 ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, ureg_src_undef()); \
190 }
191
192 #define OP14( op ) \
193 static inline void ntr_##op(struct ntr_compile *c, \
194 struct ureg_dst dst, \
195 struct ureg_src src0, \
196 struct ureg_src src1, \
197 struct ureg_src src2, \
198 struct ureg_src src3) \
199 { \
200 ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, src3); \
201 }
202
203 /* We hand-craft our tex instructions */
204 #define OP12_TEX(op)
205 #define OP14_TEX(op)
206
207 /* Use a template include to generate a correctly-typed ntr_OP()
208 * function for each TGSI opcode:
209 */
210 #include "gallium/auxiliary/tgsi/tgsi_opcode_tmp.h"
211
212 /**
213 * Interprets a nir_load_const used as a NIR src as a uint.
214 *
215 * For non-native-integers drivers, nir_load_const_instrs used by an integer ALU
216 * instruction (or in a phi-web used by an integer ALU instruction) were
217 * converted to floats and the ALU instruction swapped to the float equivalent.
218 * However, this means that integer load_consts used by intrinsics (which don't
219 * normally get that conversion) may have been reformatted to be floats. Given
220 * that all of our intrinsic nir_src_as_uint() calls are expected to be small,
221 * we can just look and see if they look like floats and convert them back to
222 * ints.
223 */
224 static uint32_t
225 ntr_src_as_uint(struct ntr_compile *c, nir_src src)
226 {
227 uint32_t val = nir_src_as_uint(src);
228 if (val >= fui(1.0))
229 val = (uint32_t)uif(val);
230 return val;
231 }
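/* A worked example of the heuristic above: a NIR constant 3 that was rewritten
 * to 3.0f by int-to-float lowering is stored as 0x40400000.  That bit pattern
 * is >= fui(1.0) (0x3f800000), so we reinterpret it as a float and recover the
 * integer 3.  Genuinely small integers like 0 or 1 are below fui(1.0) and pass
 * through unchanged.
 */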
232
233 /* Per-channel masks of def/use within the block, and the per-channel
234 * livein/liveout for the block as a whole.
235 */
236 struct ntr_live_reg_block_state {
237 uint8_t *def, *use, *livein, *liveout, *defin, *defout;
238 };
239
240 struct ntr_live_reg_state {
241 unsigned bitset_words;
242
243 struct ntr_reg_interval *regs;
244
245 /* Used in propagate_across_edge() */
246 BITSET_WORD *tmp_live;
247
248 struct ntr_live_reg_block_state *blocks;
249
250 nir_block_worklist worklist;
251 };
252
253 static void
254 ntr_live_reg_mark_use(struct ntr_compile *c, struct ntr_live_reg_block_state *bs,
255 int ip, unsigned index, unsigned used_mask)
256 {
257 bs->use[index] |= used_mask & ~bs->def[index];
258
259 c->liveness[index].start = MIN2(c->liveness[index].start, ip);
260 c->liveness[index].end = MAX2(c->liveness[index].end, ip);
261
262 }
263 static void
264 ntr_live_reg_setup_def_use(struct ntr_compile *c, nir_function_impl *impl, struct ntr_live_reg_state *state)
265 {
266 for (int i = 0; i < impl->num_blocks; i++) {
267 state->blocks[i].def = rzalloc_array(state->blocks, uint8_t, c->num_temps);
268 state->blocks[i].defin = rzalloc_array(state->blocks, uint8_t, c->num_temps);
269 state->blocks[i].defout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
270 state->blocks[i].use = rzalloc_array(state->blocks, uint8_t, c->num_temps);
271 state->blocks[i].livein = rzalloc_array(state->blocks, uint8_t, c->num_temps);
272 state->blocks[i].liveout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
273 }
274
275 int ip = 0;
276 nir_foreach_block(block, impl) {
277 struct ntr_live_reg_block_state *bs = &state->blocks[block->index];
278 struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
279
280 ntr_block->start_ip = ip;
281
282 util_dynarray_foreach(&ntr_block->insns, struct ntr_insn, insn) {
283 const struct tgsi_opcode_info *opcode_info =
284 tgsi_get_opcode_info(insn->opcode);
285
286 /* Set up use[] for the srcs.
287 *
288 * Uses are the channels of the reg read in the block that don't have a
289 * preceding def to screen them off. Note that we don't do per-element
290 * tracking of array regs, so they're never screened off.
291 */
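/* For example, if an earlier instruction in this block wrote TEMP[2].xy and
 * this instruction reads TEMP[2].xyzw, only .zw is recorded as a use here,
 * since .xy was already screened off by the preceding def.
 */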
292 for (int i = 0; i < opcode_info->num_src; i++) {
293 if (insn->src[i].File != TGSI_FILE_TEMPORARY)
294 continue;
295 int index = insn->src[i].Index;
296
297 uint32_t used_mask = tgsi_util_get_src_usage_mask(insn->opcode, i,
298 insn->dst->WriteMask,
299 insn->src[i].SwizzleX,
300 insn->src[i].SwizzleY,
301 insn->src[i].SwizzleZ,
302 insn->src[i].SwizzleW,
303 insn->tex_target,
304 insn->tex_target);
305
306 assert(!insn->src[i].Indirect || index < c->first_non_array_temp);
307 ntr_live_reg_mark_use(c, bs, ip, index, used_mask);
308 }
309
310 if (insn->is_tex) {
311 for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
312 if (insn->tex_offset[i].File == TGSI_FILE_TEMPORARY)
313 ntr_live_reg_mark_use(c, bs, ip, insn->tex_offset[i].Index, 0xf);
314 }
315 }
316
317 /* Set up def[] for the dsts.
318 *
319 * Defs are the unconditionally-written (not R/M/W) channels of the reg in
320 * the block that don't have a preceding use.
321 */
322 for (int i = 0; i < opcode_info->num_dst; i++) {
323 if (insn->dst[i].File != TGSI_FILE_TEMPORARY)
324 continue;
325 int index = insn->dst[i].Index;
326 uint32_t writemask = insn->dst[i].WriteMask;
327
328 bs->def[index] |= writemask & ~bs->use[index];
329 bs->defout[index] |= writemask;
330
331 assert(!insn->dst[i].Indirect || index < c->first_non_array_temp);
332 c->liveness[index].start = MIN2(c->liveness[index].start, ip);
333 c->liveness[index].end = MAX2(c->liveness[index].end, ip);
334 }
335 ip++;
336 }
337
338 ntr_block->end_ip = ip;
339 }
340 }
341
342 static void
343 ntr_live_regs(struct ntr_compile *c, nir_function_impl *impl)
344 {
345 nir_metadata_require(impl, nir_metadata_block_index);
346
347 c->liveness = rzalloc_array(c, struct ntr_reg_interval, c->num_temps);
348
349 struct ntr_live_reg_state state = {
350 .blocks = rzalloc_array(impl, struct ntr_live_reg_block_state, impl->num_blocks),
351 };
352
353 /* The intervals start out with start > end (indicating unused) */
354 for (int i = 0; i < c->num_temps; i++)
355 c->liveness[i].start = ~0;
356
357 ntr_live_reg_setup_def_use(c, impl, &state);
358
359 /* Make a forward-order worklist of all the blocks. */
360 nir_block_worklist_init(&state.worklist, impl->num_blocks, NULL);
361 nir_foreach_block(block, impl) {
362 nir_block_worklist_push_tail(&state.worklist, block);
363 }
364
365 /* Propagate defin/defout down the CFG to calculate the live variables
366 * potentially defined along any possible control flow path. We'll use this
367 * to keep things like conditional defs of the reg (or array regs where we
368 * don't track defs!) from making the reg's live range extend back to the
369 * start of the program.
370 */
371 while (!nir_block_worklist_is_empty(&state.worklist)) {
372 nir_block *block = nir_block_worklist_pop_head(&state.worklist);
373 for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
374 nir_block *succ = block->successors[j];
375 if (!succ || succ->index == impl->num_blocks)
376 continue;
377
378 for (int i = 0; i < c->num_temps; i++) {
379 uint8_t new_def = state.blocks[block->index].defout[i] & ~state.blocks[succ->index].defin[i];
380
381 if (new_def) {
382 state.blocks[succ->index].defin[i] |= new_def;
383 state.blocks[succ->index].defout[i] |= new_def;
384 nir_block_worklist_push_tail(&state.worklist, succ);
385 }
386 }
387 }
388 }
389
390 /* Make a reverse-order worklist of all the blocks. */
391 nir_foreach_block(block, impl) {
392 nir_block_worklist_push_head(&state.worklist, block);
393 }
394
395 /* We're now ready to work through the worklist and update the liveness sets
396 * of each of the blocks. As long as we keep the worklist up-to-date as we
397 * go, everything will get covered.
398 */
399 while (!nir_block_worklist_is_empty(&state.worklist)) {
400 /* We pop them off in the reverse order we pushed them on. This way
401 * the first walk of the instructions is backwards so we only walk
402 * once in the case of no control flow.
403 */
404 nir_block *block = nir_block_worklist_pop_head(&state.worklist);
405 struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
406 struct ntr_live_reg_block_state *bs = &state.blocks[block->index];
407
408 for (int i = 0; i < c->num_temps; i++) {
409 /* Collect livein from our successors to include in our liveout. */
410 for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
411 nir_block *succ = block->successors[j];
412 if (!succ || succ->index == impl->num_blocks)
413 continue;
414 struct ntr_live_reg_block_state *sbs = &state.blocks[succ->index];
415
416 uint8_t new_liveout = sbs->livein[i] & ~bs->liveout[i];
417 if (new_liveout) {
418 if (state.blocks[block->index].defout[i])
419 c->liveness[i].end = MAX2(c->liveness[i].end, ntr_block->end_ip);
420 bs->liveout[i] |= sbs->livein[i];
421 }
422 }
423
424 /* Propagate use requests from either our block's uses or our
425 * non-screened-off liveout up to our predecessors.
426 */
427 uint8_t new_livein = ((bs->use[i] | (bs->liveout[i] & ~bs->def[i])) &
428 ~bs->livein[i]);
429 if (new_livein) {
430 bs->livein[i] |= new_livein;
431 set_foreach(block->predecessors, entry) {
432 nir_block *pred = (void *)entry->key;
433 nir_block_worklist_push_tail(&state.worklist, pred);
434 }
435
436 if (new_livein & state.blocks[block->index].defin[i])
437 c->liveness[i].start = MIN2(c->liveness[i].start, ntr_block->start_ip);
438 }
439 }
440 }
441
442 ralloc_free(state.blocks);
443 nir_block_worklist_fini(&state.worklist);
444 }
445
446 static void
447 ntr_ra_check(struct ntr_compile *c, unsigned *ra_map, BITSET_WORD *released, int ip, unsigned index)
448 {
449 if (index < c->first_non_array_temp)
450 return;
451
452 if (c->liveness[index].start == ip && ra_map[index] == ~0)
453 ra_map[index] = ureg_DECL_temporary(c->ureg).Index;
454
455 if (c->liveness[index].end == ip && !BITSET_TEST(released, index)) {
456 ureg_release_temporary(c->ureg, ureg_dst_register(TGSI_FILE_TEMPORARY, ra_map[index]));
457 BITSET_SET(released, index);
458 }
459 }
460
461 static void
462 ntr_allocate_regs(struct ntr_compile *c, nir_function_impl *impl)
463 {
464 ntr_live_regs(c, impl);
465
466 unsigned *ra_map = ralloc_array(c, unsigned, c->num_temps);
467 unsigned *released = rzalloc_array(c, BITSET_WORD, BITSET_WORDS(c->num_temps));
468
469 /* No RA on NIR array regs */
470 for (int i = 0; i < c->first_non_array_temp; i++)
471 ra_map[i] = i;
472
473 for (int i = c->first_non_array_temp; i < c->num_temps; i++)
474 ra_map[i] = ~0;
475
476 int ip = 0;
477 nir_foreach_block(block, impl) {
478 struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
479
480 for (int i = 0; i < c->num_temps; i++)
481 ntr_ra_check(c, ra_map, released, ip, i);
482
483 util_dynarray_foreach(&ntr_block->insns, struct ntr_insn, insn) {
484 const struct tgsi_opcode_info *opcode_info =
485 tgsi_get_opcode_info(insn->opcode);
486
487 for (int i = 0; i < opcode_info->num_src; i++) {
488 if (insn->src[i].File == TGSI_FILE_TEMPORARY) {
489 ntr_ra_check(c, ra_map, released, ip, insn->src[i].Index);
490 insn->src[i].Index = ra_map[insn->src[i].Index];
491 }
492 }
493
494 if (insn->is_tex) {
495 for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
496 if (insn->tex_offset[i].File == TGSI_FILE_TEMPORARY) {
497 ntr_ra_check(c, ra_map, released, ip, insn->tex_offset[i].Index);
498 insn->tex_offset[i].Index = ra_map[insn->tex_offset[i].Index];
499 }
500 }
501 }
502
503 for (int i = 0; i < opcode_info->num_dst; i++) {
504 if (insn->dst[i].File == TGSI_FILE_TEMPORARY) {
505 ntr_ra_check(c, ra_map, released, ip, insn->dst[i].Index);
506 insn->dst[i].Index = ra_map[insn->dst[i].Index];
507 }
508 }
509 ip++;
510 }
511
512 for (int i = 0; i < c->num_temps; i++)
513 ntr_ra_check(c, ra_map, released, ip, i);
514 }
515 }
516
517 static void
518 ntr_allocate_regs_unoptimized(struct ntr_compile *c, nir_function_impl *impl)
519 {
520 for (int i = c->first_non_array_temp; i < c->num_temps; i++)
521 ureg_DECL_temporary(c->ureg);
522 }
523
524 /* TGSI varying declarations have a component usage mask associated (used by
525 * r600 and svga).
526 */
527 static uint32_t
528 ntr_tgsi_var_usage_mask(const struct nir_variable *var)
529 {
530 const struct glsl_type *type_without_array =
531 glsl_without_array(var->type);
532 unsigned num_components = glsl_get_vector_elements(type_without_array);
533 if (num_components == 0) /* structs */
534 num_components = 4;
535
536 return u_bit_consecutive(var->data.location_frac, num_components);
537 }
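/* For instance (derived from the helper above): a vec3 varying with
 * location_frac == 1 yields the usage mask 0b1110, i.e. the .yzw channels.
 */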
538
539 static struct ureg_dst
540 ntr_output_decl(struct ntr_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
541 {
542 nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
543 int base = nir_intrinsic_base(instr);
544 *frac = nir_intrinsic_component(instr);
545
546 struct ureg_dst out;
547 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
548 unsigned semantic_name, semantic_index;
549 tgsi_get_gl_frag_result_semantic(semantics.location,
550 &semantic_name, &semantic_index);
551 semantic_index += semantics.dual_source_blend_index;
552
553 switch (semantics.location) {
554 case FRAG_RESULT_DEPTH:
555 *frac = 2; /* z write is to the .z channel in TGSI */
556 break;
557 case FRAG_RESULT_STENCIL:
558 *frac = 1;
559 break;
560 default:
561 break;
562 }
563
564 out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
565 } else {
566 unsigned semantic_name, semantic_index;
567
568 tgsi_get_gl_varying_semantic(semantics.location, true,
569 &semantic_name, &semantic_index);
570
571 uint32_t usage_mask = u_bit_consecutive(*frac, instr->num_components);
572 uint32_t gs_streams = semantics.gs_streams;
573 for (int i = 0; i < 4; i++) {
574 if (!(usage_mask & (1 << i)))
575 gs_streams &= ~(0x3 << 2 * i);
576 }
577
578 /* No driver appears to use array_id of outputs. */
579 unsigned array_id = 0;
580
581 /* This bit is lost in the i/o semantics, but it's unused in in-tree
582 * drivers.
583 */
584 bool invariant = semantics.invariant;
585
586 out = ureg_DECL_output_layout(c->ureg,
587 semantic_name, semantic_index,
588 gs_streams,
589 base,
590 usage_mask,
591 array_id,
592 semantics.num_slots,
593 invariant);
594 }
595
596 unsigned write_mask;
597 if (nir_intrinsic_has_write_mask(instr))
598 write_mask = nir_intrinsic_write_mask(instr);
599 else
600 write_mask = ((1 << instr->num_components) - 1) << *frac;
601
602 write_mask = write_mask << *frac;
603 return ureg_writemask(out, write_mask);
604 }
605
606 static bool
607 ntr_try_store_in_tgsi_output_with_use(struct ntr_compile *c,
608 struct ureg_dst *dst,
609 nir_src *src)
610 {
611 *dst = ureg_dst_undef();
612
613 if (nir_src_is_if(src))
614 return false;
615
616 if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
617 return false;
618
619 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(nir_src_parent_instr(src));
620 if (intr->intrinsic != nir_intrinsic_store_output ||
621 !nir_src_is_const(intr->src[1])) {
622 return false;
623 }
624
625 uint32_t frac;
626 *dst = ntr_output_decl(c, intr, &frac);
627 dst->Index += ntr_src_as_uint(c, intr->src[1]);
628
629 return frac == 0;
630 }
631
632 /* If this reg is used only for storing an output, then in the simple
633 * cases we can write directly to the TGSI output instead of having
634 * store_output emit its own MOV.
635 */
636 static bool
637 ntr_try_store_reg_in_tgsi_output(struct ntr_compile *c, struct ureg_dst *dst,
638 nir_intrinsic_instr *reg_decl)
639 {
640 assert(reg_decl->intrinsic == nir_intrinsic_decl_reg);
641
642 *dst = ureg_dst_undef();
643
644 /* Look for a single use for try_store_in_tgsi_output */
645 nir_src *use = NULL;
646 nir_foreach_reg_load(src, reg_decl) {
647 nir_intrinsic_instr *load = nir_instr_as_intrinsic(nir_src_parent_instr(src));
648 nir_foreach_use_including_if(load_use, &load->def) {
649 /* We can only have one use */
650 if (use != NULL)
651 return false;
652
653 use = load_use;
654 }
655 }
656
657 if (use == NULL)
658 return false;
659
660 return ntr_try_store_in_tgsi_output_with_use(c, dst, use);
661 }
662
663 /* If this SSA def is used only for storing an output, then in the simple
664 * cases we can write directly to the TGSI output instead of having
665 * store_output emit its own MOV.
666 */
667 static bool
668 ntr_try_store_ssa_in_tgsi_output(struct ntr_compile *c, struct ureg_dst *dst,
669 nir_def *def)
670 {
671 *dst = ureg_dst_undef();
672
673 if (!list_is_singular(&def->uses))
674 return false;
675
676 nir_foreach_use_including_if(use, def) {
677 return ntr_try_store_in_tgsi_output_with_use(c, dst, use);
678 }
679 unreachable("We have one use");
680 }
681
682 static void
683 ntr_setup_inputs(struct ntr_compile *c)
684 {
685 if (c->s->info.stage != MESA_SHADER_FRAGMENT)
686 return;
687
688 unsigned num_inputs = 0;
689 int num_input_arrays = 0;
690
691 nir_foreach_shader_in_variable(var, c->s) {
692 const struct glsl_type *type = var->type;
693 unsigned array_len =
694 glsl_count_attribute_slots(type, false);
695
696 num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
697 }
698
699 c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);
700
701 nir_foreach_shader_in_variable(var, c->s) {
702 const struct glsl_type *type = var->type;
703 unsigned array_len =
704 glsl_count_attribute_slots(type, false);
705
706 unsigned interpolation = TGSI_INTERPOLATE_CONSTANT;
707 unsigned sample_loc;
708 struct ureg_src decl;
709
710 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
711 interpolation =
712 tgsi_get_interp_mode(var->data.interpolation,
713 var->data.location == VARYING_SLOT_COL0 ||
714 var->data.location == VARYING_SLOT_COL1);
715
716 if (var->data.location == VARYING_SLOT_POS)
717 interpolation = TGSI_INTERPOLATE_LINEAR;
718 }
719
720 unsigned semantic_name, semantic_index;
721 tgsi_get_gl_varying_semantic(var->data.location, true,
722 &semantic_name, &semantic_index);
723
724 if (var->data.sample) {
725 sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
726 } else if (var->data.centroid) {
727 sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
728 c->centroid_inputs |= (BITSET_MASK(array_len) <<
729 var->data.driver_location);
730 } else {
731 sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
732 }
733
734 unsigned array_id = 0;
735 if (glsl_type_is_array(type))
736 array_id = ++num_input_arrays;
737
738 uint32_t usage_mask = ntr_tgsi_var_usage_mask(var);
739
740 decl = ureg_DECL_fs_input_centroid_layout(c->ureg,
741 semantic_name,
742 semantic_index,
743 interpolation,
744 sample_loc,
745 var->data.driver_location,
746 usage_mask,
747 array_id, array_len);
748
749 if (semantic_name == TGSI_SEMANTIC_FACE) {
750 struct ureg_dst temp = ntr_temp(c);
751 /* tgsi docs say that floating point FACE will be positive for
752 * frontface and negative for backface, but realistically
753 * GLSL-to-TGSI had been doing MOV_SAT to turn it into 0.0 vs 1.0.
754 * Copy that behavior, since some drivers (r300) have been doing a
755 * 0.0 vs 1.0 backface (and I don't think anybody has a non-1.0
756 * front face).
757 */
758 temp.Saturate = true;
759 ntr_MOV(c, temp, decl);
760 decl = ureg_src(temp);
761 }
762
763 for (unsigned i = 0; i < array_len; i++) {
764 c->input_index_map[var->data.driver_location + i] = decl;
765 c->input_index_map[var->data.driver_location + i].Index += i;
766 }
767 }
768 }
769
770 static int
771 ntr_sort_by_location(const nir_variable *a, const nir_variable *b)
772 {
773 return a->data.location - b->data.location;
774 }
775
776 /**
777 * Workaround for virglrenderer requiring that TGSI FS output color variables
778 * are declared in order. Besides, it's a lot nicer to read the TGSI this way.
779 */
780 static void
781 ntr_setup_outputs(struct ntr_compile *c)
782 {
783 if (c->s->info.stage != MESA_SHADER_FRAGMENT)
784 return;
785
786 nir_sort_variables_with_modes(c->s, ntr_sort_by_location, nir_var_shader_out);
787
788 nir_foreach_shader_out_variable(var, c->s) {
789 if (var->data.location == FRAG_RESULT_COLOR)
790 ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);
791
792 unsigned semantic_name, semantic_index;
793 tgsi_get_gl_frag_result_semantic(var->data.location,
794 &semantic_name, &semantic_index);
795
796 (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index);
797 }
798 }
799
800 static enum tgsi_texture_type
801 tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array, bool is_shadow)
802 {
803 switch (dim) {
804 case GLSL_SAMPLER_DIM_1D:
805 if (is_shadow)
806 return is_array ? TGSI_TEXTURE_SHADOW1D_ARRAY : TGSI_TEXTURE_SHADOW1D;
807 else
808 return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
809 case GLSL_SAMPLER_DIM_2D:
810 case GLSL_SAMPLER_DIM_EXTERNAL:
811 if (is_shadow)
812 return is_array ? TGSI_TEXTURE_SHADOW2D_ARRAY : TGSI_TEXTURE_SHADOW2D;
813 else
814 return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
815 case GLSL_SAMPLER_DIM_3D:
816 return TGSI_TEXTURE_3D;
817 case GLSL_SAMPLER_DIM_CUBE:
818 if (is_shadow)
819 return is_array ? TGSI_TEXTURE_SHADOWCUBE_ARRAY : TGSI_TEXTURE_SHADOWCUBE;
820 else
821 return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
822 case GLSL_SAMPLER_DIM_RECT:
823 if (is_shadow)
824 return TGSI_TEXTURE_SHADOWRECT;
825 else
826 return TGSI_TEXTURE_RECT;
827 case GLSL_SAMPLER_DIM_MS:
828 return is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
829 case GLSL_SAMPLER_DIM_BUF:
830 return TGSI_TEXTURE_BUFFER;
831 default:
832 unreachable("unknown sampler dim");
833 }
834 }
835
836 static enum tgsi_return_type
837 tgsi_return_type_from_base_type(enum glsl_base_type type)
838 {
839 switch (type) {
840 case GLSL_TYPE_INT:
841 return TGSI_RETURN_TYPE_SINT;
842 case GLSL_TYPE_UINT:
843 return TGSI_RETURN_TYPE_UINT;
844 case GLSL_TYPE_FLOAT:
845 return TGSI_RETURN_TYPE_FLOAT;
846 default:
847 unreachable("unexpected texture type");
848 }
849 }
850
851 static void
852 ntr_setup_uniforms(struct ntr_compile *c)
853 {
854 nir_foreach_uniform_variable(var, c->s) {
855 if (glsl_type_is_sampler(glsl_without_array(var->type)) ||
856 glsl_type_is_texture(glsl_without_array(var->type))) {
857 /* Don't use this size for the check for samplers -- arrays of structs
858 * containing samplers should be ignored, and just the separate lowered
859 * sampler uniform decl used.
860 */
861 int size = glsl_type_get_sampler_count(var->type) +
862 glsl_type_get_texture_count(var->type);
863
864 const struct glsl_type *stype = glsl_without_array(var->type);
865 enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(stype),
866 glsl_sampler_type_is_array(stype),
867 glsl_sampler_type_is_shadow(stype));
868 enum tgsi_return_type ret_type = tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype));
869 for (int i = 0; i < size; i++) {
870 ureg_DECL_sampler_view(c->ureg, var->data.binding + i,
871 target, ret_type, ret_type, ret_type, ret_type);
872 ureg_DECL_sampler(c->ureg, var->data.binding + i);
873 }
874
875 /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0
876 * size declaration happens with other UBOs below.
877 */
878 }
879 }
880
881 c->first_ubo = ~0;
882
883 unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0};
884 nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) {
885 int ubo = var->data.driver_location;
886 if (ubo == -1)
887 continue;
888
889 if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo))
890 c->first_ubo = MIN2(c->first_ubo, ubo);
891
892 unsigned size = glsl_get_explicit_size(var->interface_type, false);
893 ubo_sizes[ubo] = size;
894 }
895
896 for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) {
897 if (ubo_sizes[i])
898 ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i);
899 }
900 }
901
902 static void
903 ntr_setup_registers(struct ntr_compile *c)
904 {
905 assert(c->num_temps == 0);
906
907 nir_foreach_reg_decl_safe(nir_reg, nir_shader_get_entrypoint(c->s)) {
908 /* Permanently allocate all the array regs at the start. */
909 unsigned num_array_elems = nir_intrinsic_num_array_elems(nir_reg);
910 unsigned index = nir_reg->def.index;
911
912 if (num_array_elems != 0) {
913 struct ureg_dst decl = ureg_DECL_array_temporary(c->ureg, num_array_elems, true);
914 c->reg_temp[index] = decl;
915 assert(c->num_temps == decl.Index);
916 c->num_temps += num_array_elems;
917 }
918 }
919 c->first_non_array_temp = c->num_temps;
920
921 /* After that, allocate non-array regs in our virtual space that we'll
922 * register-allocate before ureg emit.
923 */
924 nir_foreach_reg_decl_safe(nir_reg, nir_shader_get_entrypoint(c->s)) {
925 unsigned num_array_elems = nir_intrinsic_num_array_elems(nir_reg);
926 unsigned num_components = nir_intrinsic_num_components(nir_reg);
927 unsigned index = nir_reg->def.index;
928
929 /* We already handled arrays */
930 if (num_array_elems == 0) {
931 struct ureg_dst decl;
932 uint32_t write_mask = BITFIELD_MASK(num_components);
933
934 if (!ntr_try_store_reg_in_tgsi_output(c, &decl, nir_reg)) {
935 decl = ureg_writemask(ntr_temp(c), write_mask);
936 }
937 c->reg_temp[index] = decl;
938 }
939 }
940 }
941
942 static struct ureg_src
943 ntr_get_load_const_src(struct ntr_compile *c, nir_load_const_instr *instr)
944 {
945 int num_components = instr->def.num_components;
946
947 float values[4];
948 assert(instr->def.bit_size == 32);
949 for (int i = 0; i < num_components; i++)
950 values[i] = uif(instr->value[i].u32);
951
952 return ureg_DECL_immediate(c->ureg, values, num_components);
953 }
954
955 static struct ureg_src
956 ntr_reladdr(struct ntr_compile *c, struct ureg_src addr, int addr_index)
957 {
958 assert(addr_index < ARRAY_SIZE(c->addr_reg));
959
960 for (int i = 0; i <= addr_index; i++) {
961 if (!c->addr_declared[i]) {
962 c->addr_reg[i] = ureg_writemask(ureg_DECL_address(c->ureg),
963 TGSI_WRITEMASK_X);
964 c->addr_declared[i] = true;
965 }
966 }
967
968 ntr_ARL(c, c->addr_reg[addr_index], addr);
969 return ureg_scalar(ureg_src(c->addr_reg[addr_index]), 0);
970 }
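/* Rough usage sketch: callers pass an integer index source and get back
 * ADDR[addr_index].xxxx after an ARL, suitable for ureg_src_indirect()/
 * ureg_dst_indirect().  addr_reg[0] is used for plain register/constant
 * indirection, [1] for dimension (UBO) indirection, and [2] for sampler
 * indirection, matching the addr_index values passed in below.
 */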
971
972 /* Forward declare for recursion with indirects */
973 static struct ureg_src
974 ntr_get_src(struct ntr_compile *c, nir_src src);
975
976 static struct ureg_src
977 ntr_get_chased_src(struct ntr_compile *c, nir_legacy_src *src)
978 {
979 if (src->is_ssa) {
980 if (src->ssa->parent_instr->type == nir_instr_type_load_const)
981 return ntr_get_load_const_src(c, nir_instr_as_load_const(src->ssa->parent_instr));
982
983 return c->ssa_temp[src->ssa->index];
984 } else {
985 struct ureg_dst reg_temp = c->reg_temp[src->reg.handle->index];
986 reg_temp.Index += src->reg.base_offset;
987
988 if (src->reg.indirect) {
989 struct ureg_src offset = ntr_get_src(c, nir_src_for_ssa(src->reg.indirect));
990 return ureg_src_indirect(ureg_src(reg_temp),
991 ntr_reladdr(c, offset, 0));
992 } else {
993 return ureg_src(reg_temp);
994 }
995 }
996 }
997
998 static struct ureg_src
999 ntr_get_src(struct ntr_compile *c, nir_src src)
1000 {
1001 nir_legacy_src chased = nir_legacy_chase_src(&src);
1002 return ntr_get_chased_src(c, &chased);
1003 }
1004
1005 static struct ureg_src
1006 ntr_get_alu_src(struct ntr_compile *c, nir_alu_instr *instr, int i)
1007 {
1008 /* We only support 32-bit float modifiers. The only other modifier type
1009 * officially supported by TGSI is 32-bit integer negates, but even those are
1010 * broken on virglrenderer, so skip lowering all integer and f64 float mods.
1011 *
1012 * The options->lower_fabs requests that we not have native source modifiers
1013 * for fabs, and instead emit MAX(a,-a) for nir_op_fabs.
1014 */
1015 nir_legacy_alu_src src =
1016 nir_legacy_chase_alu_src(&instr->src[i], !c->options->lower_fabs);
1017 struct ureg_src usrc = ntr_get_chased_src(c, &src.src);
1018
1019 usrc = ureg_swizzle(usrc,
1020 src.swizzle[0],
1021 src.swizzle[1],
1022 src.swizzle[2],
1023 src.swizzle[3]);
1024
1025 if (src.fabs)
1026 usrc = ureg_abs(usrc);
1027 if (src.fneg)
1028 usrc = ureg_negate(usrc);
1029
1030 return usrc;
1031 }
1032
1033 /* Reswizzles a source so that the unset channels in the write mask still refer
1034 * to one of the channels present in the write mask.
1035 */
1036 static struct ureg_src
1037 ntr_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
1038 {
1039 assert(write_mask);
1040 int first_chan = ffs(write_mask) - 1;
1041 return ureg_swizzle(src,
1042 (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : first_chan,
1043 (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : first_chan,
1044 (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : first_chan,
1045 (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : first_chan);
1046 }
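/* Example of the reswizzle above: write_mask 0b1010 (.y and .w live) gives
 * first_chan = Y, so the resulting swizzle is .yyyw -- the dead .x and .z
 * lanes just re-read .y instead of referencing undefined channels.
 */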
1047
1048 static struct ureg_dst
1049 ntr_get_ssa_def_decl(struct ntr_compile *c, nir_def *ssa)
1050 {
1051 uint32_t writemask;
1052 /* Fix writemask for nir_intrinsic_load_ubo_vec4 according to uses. */
1053 if (ssa->parent_instr->type == nir_instr_type_intrinsic &&
1054 nir_instr_as_intrinsic(ssa->parent_instr)->intrinsic == nir_intrinsic_load_ubo_vec4)
1055 writemask = nir_def_components_read(ssa);
1056 else
1057 writemask = BITSET_MASK(ssa->num_components);
1058
1059 struct ureg_dst dst;
1060 if (!ntr_try_store_ssa_in_tgsi_output(c, &dst, ssa))
1061 dst = ntr_temp(c);
1062
1063 c->ssa_temp[ssa->index] = ntr_swizzle_for_write_mask(ureg_src(dst), writemask);
1064
1065 return ureg_writemask(dst, writemask);
1066 }
1067
1068 static struct ureg_dst
1069 ntr_get_chased_dest_decl(struct ntr_compile *c, nir_legacy_dest *dest)
1070 {
1071 if (dest->is_ssa)
1072 return ntr_get_ssa_def_decl(c, dest->ssa);
1073 else
1074 return c->reg_temp[dest->reg.handle->index];
1075 }
1076
1077 static struct ureg_dst
1078 ntr_get_chased_dest(struct ntr_compile *c, nir_legacy_dest *dest)
1079 {
1080 struct ureg_dst dst = ntr_get_chased_dest_decl(c, dest);
1081
1082 if (!dest->is_ssa) {
1083 dst.Index += dest->reg.base_offset;
1084
1085 if (dest->reg.indirect) {
1086 struct ureg_src offset = ntr_get_src(c, nir_src_for_ssa(dest->reg.indirect));
1087 dst = ureg_dst_indirect(dst, ntr_reladdr(c, offset, 0));
1088 }
1089 }
1090
1091 return dst;
1092 }
1093
1094 static struct ureg_dst
1095 ntr_get_dest(struct ntr_compile *c, nir_def *def)
1096 {
1097 nir_legacy_dest chased = nir_legacy_chase_dest(def);
1098 return ntr_get_chased_dest(c, &chased);
1099 }
1100
1101 static struct ureg_dst
1102 ntr_get_alu_dest(struct ntr_compile *c, nir_def *def)
1103 {
1104 nir_legacy_alu_dest chased = nir_legacy_chase_alu_dest(def);
1105 struct ureg_dst dst = ntr_get_chased_dest(c, &chased.dest);
1106
1107 if (chased.fsat)
1108 dst.Saturate = true;
1109
1110 /* Only registers get write masks */
1111 if (chased.dest.is_ssa)
1112 return dst;
1113
1114 return ureg_writemask(dst, chased.write_mask);
1115 }
1116
1117 /* For an SSA dest being populated by a constant src, replace the storage with
1118 * a copy of the ureg_src.
1119 */
1120 static void
1121 ntr_store_def(struct ntr_compile *c, nir_def *def, struct ureg_src src)
1122 {
1123 if (!src.Indirect && !src.DimIndirect) {
1124 switch (src.File) {
1125 case TGSI_FILE_IMMEDIATE:
1126 case TGSI_FILE_INPUT:
1127 case TGSI_FILE_CONSTANT:
1128 case TGSI_FILE_SYSTEM_VALUE:
1129 c->ssa_temp[def->index] = src;
1130 return;
1131 }
1132 }
1133
1134 ntr_MOV(c, ntr_get_ssa_def_decl(c, def), src);
1135 }
1136
1137 static void
1138 ntr_store(struct ntr_compile *c, nir_def *def, struct ureg_src src)
1139 {
1140 nir_legacy_dest chased = nir_legacy_chase_dest(def);
1141
1142 if (chased.is_ssa)
1143 ntr_store_def(c, chased.ssa, src);
1144 else {
1145 struct ureg_dst dst = ntr_get_chased_dest(c, &chased);
1146 ntr_MOV(c, dst, src);
1147 }
1148 }
1149
1150 static void
1151 ntr_emit_scalar(struct ntr_compile *c, unsigned tgsi_op,
1152 struct ureg_dst dst,
1153 struct ureg_src src0,
1154 struct ureg_src src1)
1155 {
1156 unsigned i;
1157
1158 /* POW is the only 2-operand scalar op. */
1159 if (tgsi_op != TGSI_OPCODE_POW)
1160 src1 = src0;
1161
1162 for (i = 0; i < 4; i++) {
1163 if (dst.WriteMask & (1 << i)) {
1164 ntr_insn(c, tgsi_op,
1165 ureg_writemask(dst, 1 << i),
1166 ureg_scalar(src0, i),
1167 ureg_scalar(src1, i),
1168 ureg_src_undef(), ureg_src_undef());
1169 }
1170 }
1171 }
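/* For example, an RCP with a dst write mask of .xz expands to two TGSI
 * instructions: RCP dst.x, src0.xxxx and RCP dst.z, src0.zzzz, matching the
 * scalar-with-replication semantics of the 32-bit TGSI transcendentals.
 */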
1172
1173 static void
1174 ntr_emit_alu(struct ntr_compile *c, nir_alu_instr *instr)
1175 {
1176 struct ureg_src src[4];
1177 struct ureg_dst dst;
1178 unsigned i;
1179 int num_srcs = nir_op_infos[instr->op].num_inputs;
1180
1181 /* Don't try to translate folded fsat since their source won't be valid */
1182 if (instr->op == nir_op_fsat && nir_legacy_fsat_folds(instr))
1183 return;
1184
1185 c->precise = instr->exact;
1186
1187 assert(num_srcs <= ARRAY_SIZE(src));
1188 for (i = 0; i < num_srcs; i++)
1189 src[i] = ntr_get_alu_src(c, instr, i);
1190 for (; i < ARRAY_SIZE(src); i++)
1191 src[i] = ureg_src_undef();
1192
1193 dst = ntr_get_alu_dest(c, &instr->def);
1194
1195 static enum tgsi_opcode op_map[] = {
1196 [nir_op_mov] = TGSI_OPCODE_MOV,
1197
1198 [nir_op_fdot2_replicated] = TGSI_OPCODE_DP2,
1199 [nir_op_fdot3_replicated] = TGSI_OPCODE_DP3,
1200 [nir_op_fdot4_replicated] = TGSI_OPCODE_DP4,
1201 [nir_op_ffloor] = TGSI_OPCODE_FLR,
1202 [nir_op_ffract] = TGSI_OPCODE_FRC,
1203 [nir_op_fceil] = TGSI_OPCODE_CEIL,
1204 [nir_op_fround_even] = TGSI_OPCODE_ROUND,
1205
1206 [nir_op_slt] = TGSI_OPCODE_SLT,
1207 [nir_op_sge] = TGSI_OPCODE_SGE,
1208 [nir_op_seq] = TGSI_OPCODE_SEQ,
1209 [nir_op_sne] = TGSI_OPCODE_SNE,
1210
1211 [nir_op_ftrunc] = TGSI_OPCODE_TRUNC,
1212 [nir_op_fddx] = TGSI_OPCODE_DDX,
1213 [nir_op_fddy] = TGSI_OPCODE_DDY,
1214 [nir_op_fddx_coarse] = TGSI_OPCODE_DDX,
1215 [nir_op_fddy_coarse] = TGSI_OPCODE_DDY,
1216 [nir_op_fadd] = TGSI_OPCODE_ADD,
1217 [nir_op_fmul] = TGSI_OPCODE_MUL,
1218
1219 [nir_op_fmin] = TGSI_OPCODE_MIN,
1220 [nir_op_fmax] = TGSI_OPCODE_MAX,
1221 [nir_op_ffma] = TGSI_OPCODE_MAD,
1222 };
1223
1224 if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op] > 0) {
1225 /* The normal path for NIR to TGSI ALU op translation */
1226 ntr_insn(c, op_map[instr->op],
1227 dst, src[0], src[1], src[2], src[3]);
1228 } else {
1229 /* Special cases for NIR to TGSI ALU op translation. */
1230
1231 /* TODO: Use something like the ntr_store() path for the MOV calls so we
1232 * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
1233 */
1234
1235 switch (instr->op) {
1236 case nir_op_fabs:
1237 /* Try to eliminate */
1238 if (!c->options->lower_fabs && nir_legacy_float_mod_folds(instr))
1239 break;
1240
1241 if (c->options->lower_fabs)
1242 ntr_MAX(c, dst, src[0], ureg_negate(src[0]));
1243 else
1244 ntr_MOV(c, dst, ureg_abs(src[0]));
1245 break;
1246
1247 case nir_op_fsat:
1248 ntr_MOV(c, ureg_saturate(dst), src[0]);
1249 break;
1250
1251 case nir_op_fneg:
1252 /* Try to eliminate */
1253 if (nir_legacy_float_mod_folds(instr))
1254 break;
1255
1256 ntr_MOV(c, dst, ureg_negate(src[0]));
1257 break;
1258
1259 /* NOTE: TGSI 32-bit math ops have the old "one source channel
1260 * replicated to all dst channels" behavior, while 64 is normal mapping
1261 * of src channels to dst.
1262 */
1263 case nir_op_frcp:
1264 ntr_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], ureg_src_undef());
1265 break;
1266
1267 case nir_op_frsq:
1268 ntr_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], ureg_src_undef());
1269 break;
1270
1271 case nir_op_fexp2:
1272 ntr_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], ureg_src_undef());
1273 break;
1274
1275 case nir_op_flog2:
1276 ntr_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], ureg_src_undef());
1277 break;
1278
1279 case nir_op_fsin:
1280 ntr_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], ureg_src_undef());
1281 break;
1282
1283 case nir_op_fcos:
1284 ntr_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], ureg_src_undef());
1285 break;
1286
1287 case nir_op_fsub:
1288 ntr_ADD(c, dst, src[0], ureg_negate(src[1]));
1289 break;
1290
1291 case nir_op_fmod:
1292 unreachable("should be handled by .lower_fmod = true");
1293 break;
1294
1295 case nir_op_fpow:
1296 ntr_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
1297 break;
1298
1299 case nir_op_flrp:
1300 ntr_LRP(c, dst, src[2], src[1], src[0]);
1301 break;
1302
1303 case nir_op_fcsel:
1304 /* If CMP isn't supported, then the flags that enable NIR to generate
1305 * this opcode should also not be set.
1306 */
1307 assert(!c->options->lower_cmp);
1308
1309 /* Implement this as CMP(-abs(src0), src1, src2). */
1310 ntr_CMP(c, dst, ureg_negate(ureg_abs(src[0])), src[1], src[2]);
1311 break;
1312
1313 case nir_op_fcsel_gt:
1314 /* If CMP isn't supported, then the flags that enable NIR to generate
1315 * these opcodes should also not be set.
1316 */
1317 assert(!c->options->lower_cmp);
1318
1319 ntr_CMP(c, dst, ureg_negate(src[0]), src[1], src[2]);
1320 break;
1321
1322 case nir_op_fcsel_ge:
1323 /* If CMP isn't supported, then the flags that enable NIR to generate
1324 * these opcodes should also not be set.
1325 */
1326 assert(!c->options->lower_cmp);
1327
1328 /* Implement this as if !(src0 < 0.0) was identical to src0 >= 0.0. */
1329 ntr_CMP(c, dst, src[0], src[2], src[1]);
1330 break;
1331
1332 case nir_op_vec4:
1333 case nir_op_vec3:
1334 case nir_op_vec2:
1335 unreachable("covered by nir_lower_vec_to_movs()");
1336
1337 default:
1338 fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
1339 unreachable("Unknown NIR opcode");
1340 }
1341 }
1342
1343 c->precise = false;
1344 }
1345
1346 static struct ureg_src
1347 ntr_ureg_src_indirect(struct ntr_compile *c, struct ureg_src usrc,
1348 nir_src src, int addr_reg)
1349 {
1350 if (nir_src_is_const(src)) {
1351 usrc.Index += ntr_src_as_uint(c, src);
1352 return usrc;
1353 } else {
1354 return ureg_src_indirect(usrc, ntr_reladdr(c, ntr_get_src(c, src), addr_reg));
1355 }
1356 }
1357
1358 static struct ureg_dst
1359 ntr_ureg_dst_indirect(struct ntr_compile *c, struct ureg_dst dst,
1360 nir_src src)
1361 {
1362 if (nir_src_is_const(src)) {
1363 dst.Index += ntr_src_as_uint(c, src);
1364 return dst;
1365 } else {
1366 return ureg_dst_indirect(dst, ntr_reladdr(c, ntr_get_src(c, src), 0));
1367 }
1368 }
1369
1370 static struct ureg_dst
1371 ntr_ureg_dst_dimension_indirect(struct ntr_compile *c, struct ureg_dst udst,
1372 nir_src src)
1373 {
1374 if (nir_src_is_const(src)) {
1375 return ureg_dst_dimension(udst, ntr_src_as_uint(c, src));
1376 } else {
1377 return ureg_dst_dimension_indirect(udst,
1378 ntr_reladdr(c, ntr_get_src(c, src), 1),
1379 0);
1380 }
1381 }
1382 /* Some load operations in NIR will have a fractional offset that we need to
1383 * swizzle down before storing to the result register.
1384 */
1385 static struct ureg_src
1386 ntr_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
1387 {
1388 return ureg_swizzle(src,
1389 frac,
1390 frac + MIN2(num_components - 1, 1),
1391 frac + MIN2(num_components - 1, 2),
1392 frac + MIN2(num_components - 1, 3));
1393 }
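/* Example: for frac == 1 and num_components == 2 this returns src.yzzz, so a
 * two-component value declared at component 1 of the slot is read starting
 * from the destination's .x channel.
 */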
1394
1395
1396 static void
1397 ntr_emit_load_ubo(struct ntr_compile *c, nir_intrinsic_instr *instr)
1398 {
1399 struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);
1400
1401 struct ureg_dst addr_temp = ureg_dst_undef();
1402
1403 if (nir_src_is_const(instr->src[0])) {
1404 src = ureg_src_dimension(src, ntr_src_as_uint(c, instr->src[0]));
1405 } else {
1406 /* virglrenderer requires that indirect UBO references have the UBO
1407 * array's base index in the Index field, not added to the indirect
1408 * address.
1409 *
1410 * Many nir intrinsics have a base address const value for the start of
1411 * their array indirection, but load_ubo doesn't. We fake it by
1412 * subtracting it off here.
1413 */
1414 addr_temp = ntr_temp(c);
1415 ntr_UADD(c, addr_temp, ntr_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo));
1416 src = ureg_src_dimension_indirect(src,
1417 ntr_reladdr(c, ureg_src(addr_temp), 1),
1418 c->first_ubo);
1419 }
1420
1421 /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const
1422 * file.
1423 */
1424 src.Index = nir_intrinsic_base(instr);
1425
1426 if (nir_src_is_const(instr->src[1])) {
1427 src.Index += ntr_src_as_uint(c, instr->src[1]);
1428 } else {
1429 src = ureg_src_indirect(src, ntr_reladdr(c, ntr_get_src(c, instr->src[1]), 0));
1430 }
1431
1432 int start_component = nir_intrinsic_component(instr);
1433
1434 src = ntr_shift_by_frac(src, start_component, instr->num_components);
1435
1436 ntr_store(c, &instr->def, src);
1437 }
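/* Numeric sketch of the indirect path above: with first_ubo == 2, a NIR UBO
 * index of 3 becomes 1 in addr_temp (3 - 2), and the constant-buffer dimension
 * is declared with base index 2, so the reference resolves back to buffer
 * 2 + ADDR.x = 3.
 */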
1438
1439 static void
1440 ntr_emit_load_input(struct ntr_compile *c, nir_intrinsic_instr *instr)
1441 {
1442 uint32_t frac = nir_intrinsic_component(instr);
1443 uint32_t num_components = instr->num_components;
1444 unsigned base = nir_intrinsic_base(instr);
1445 struct ureg_src input;
1446 nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
1447
1448 if (c->s->info.stage == MESA_SHADER_VERTEX) {
1449 input = ureg_DECL_vs_input(c->ureg, base);
1450 for (int i = 1; i < semantics.num_slots; i++)
1451 ureg_DECL_vs_input(c->ureg, base + i);
1452 } else {
1453 input = c->input_index_map[base];
1454 }
1455
1456 input = ntr_shift_by_frac(input, frac, num_components);
1457
1458 switch (instr->intrinsic) {
1459 case nir_intrinsic_load_input:
1460 input = ntr_ureg_src_indirect(c, input, instr->src[0], 0);
1461 ntr_store(c, &instr->def, input);
1462 break;
1463
1464 case nir_intrinsic_load_interpolated_input: {
1465 input = ntr_ureg_src_indirect(c, input, instr->src[1], 0);
1466
1467 nir_intrinsic_instr *bary_instr =
1468 nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
1469
1470 switch (bary_instr->intrinsic) {
1471 case nir_intrinsic_load_barycentric_pixel:
1472 case nir_intrinsic_load_barycentric_sample:
1473 /* For these, we know that the barycentric load matches the
1474 * interpolation on the input declaration, so we can use it directly.
1475 */
1476 ntr_store(c, &instr->def, input);
1477 break;
1478
1479 case nir_intrinsic_load_barycentric_centroid:
1480 /* If the input was declared centroid, then there's no need to
1481 * emit the extra TGSI interp instruction, we can just read the
1482 * input.
1483 */
1484 if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) {
1485 ntr_store(c, &instr->def, input);
1486 } else {
1487 ntr_INTERP_CENTROID(c, ntr_get_dest(c, &instr->def), input);
1488 }
1489 break;
1490
1491 case nir_intrinsic_load_barycentric_at_sample:
1492 /* We stored the sample in the fake "bary" dest. */
1493 ntr_INTERP_SAMPLE(c, ntr_get_dest(c, &instr->def), input,
1494 ntr_get_src(c, instr->src[0]));
1495 break;
1496
1497 case nir_intrinsic_load_barycentric_at_offset:
1498 /* We stored the offset in the fake "bary" dest. */
1499 ntr_INTERP_OFFSET(c, ntr_get_dest(c, &instr->def), input,
1500 ntr_get_src(c, instr->src[0]));
1501 break;
1502
1503 default:
1504 unreachable("bad barycentric interp intrinsic\n");
1505 }
1506 break;
1507 }
1508
1509 default:
1510 unreachable("bad load input intrinsic\n");
1511 }
1512 }
1513
1514 static void
1515 ntr_emit_store_output(struct ntr_compile *c, nir_intrinsic_instr *instr)
1516 {
1517 struct ureg_src src = ntr_get_src(c, instr->src[0]);
1518
1519 if (src.File == TGSI_FILE_OUTPUT) {
1520 /* If our src is the output file, that's an indication that we were able
1521 * to emit the output stores in the generating instructions and we have
1522 * nothing to do here.
1523 */
1524 return;
1525 }
1526
1527 uint32_t frac;
1528 struct ureg_dst out = ntr_output_decl(c, instr, &frac);
1529
1530 if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
1531 out = ntr_ureg_dst_indirect(c, out, instr->src[2]);
1532 out = ntr_ureg_dst_dimension_indirect(c, out, instr->src[1]);
1533 } else {
1534 out = ntr_ureg_dst_indirect(c, out, instr->src[1]);
1535 }
1536
1537 uint8_t swizzle[4] = { 0, 0, 0, 0 };
1538 for (int i = frac; i < 4; i++) {
1539 if (out.WriteMask & (1 << i))
1540 swizzle[i] = i - frac;
1541 }
1542
1543 src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1544
1545 ntr_MOV(c, out, src);
1546 }
1547
1548 static void
1549 ntr_emit_load_output(struct ntr_compile *c, nir_intrinsic_instr *instr)
1550 {
1551 nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
1552
1553 /* ntr_try_store_in_tgsi_output() optimization is not valid if normal
1554 * load_output is present.
1555 */
1556 assert(c->s->info.stage != MESA_SHADER_VERTEX &&
1557 (c->s->info.stage != MESA_SHADER_FRAGMENT || semantics.fb_fetch_output));
1558
1559 uint32_t frac;
1560 struct ureg_dst out = ntr_output_decl(c, instr, &frac);
1561
1562 if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) {
1563 out = ntr_ureg_dst_indirect(c, out, instr->src[1]);
1564 out = ntr_ureg_dst_dimension_indirect(c, out, instr->src[0]);
1565 } else {
1566 out = ntr_ureg_dst_indirect(c, out, instr->src[0]);
1567 }
1568
1569 struct ureg_dst dst = ntr_get_dest(c, &instr->def);
1570 struct ureg_src out_src = ureg_src(out);
1571
1572 /* Don't swizzle unavailable channels of the output into the writemasked-out
1573 * components. Avoids compile failures in virglrenderer with
1574 * TESS_LEVEL_INNER.
1575 */
1576 int fill_channel = ffs(dst.WriteMask) - 1;
1577 uint8_t swizzles[4] = { 0, 1, 2, 3 };
1578 for (int i = 0; i < 4; i++)
1579 if (!(dst.WriteMask & (1 << i)))
1580 swizzles[i] = fill_channel;
1581 out_src = ureg_swizzle(out_src, swizzles[0], swizzles[1], swizzles[2], swizzles[3]);
1582
1583 if (semantics.fb_fetch_output)
1584 ntr_FBFETCH(c, dst, out_src);
1585 else
1586 ntr_MOV(c, dst, out_src);
1587 }
1588
1589 static void
1590 ntr_emit_load_sysval(struct ntr_compile *c, nir_intrinsic_instr *instr)
1591 {
1592 gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
1593 enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
1594 struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0);
1595
1596 /* virglrenderer doesn't like references to channels of the sysval that
1597 * aren't defined, even if they aren't really read. (GLSL compile fails on
1598 * gl_NumWorkGroups.w, for example).
1599 */
1600 uint32_t write_mask = BITSET_MASK(instr->def.num_components);
1601 sv = ntr_swizzle_for_write_mask(sv, write_mask);
1602
1603 /* TGSI and NIR define these intrinsics as always loading ints, but they can
1604 * still appear on hardware with non-native-integers fragment shaders using
1605 * the draw path (i915g). In that case, having called nir_lower_int_to_float
1606 * means that we actually want floats instead.
1607 */
1608 switch (instr->intrinsic) {
1609 case nir_intrinsic_load_vertex_id:
1610 case nir_intrinsic_load_instance_id:
1611 ntr_U2F(c, ntr_get_dest(c, &instr->def), sv);
1612 return;
1613
1614 default:
1615 break;
1616 }
1617
1618 ntr_store(c, &instr->def, sv);
1619 }
1620
1621 static void
1622 ntr_emit_intrinsic(struct ntr_compile *c, nir_intrinsic_instr *instr)
1623 {
1624 switch (instr->intrinsic) {
1625 case nir_intrinsic_load_ubo:
1626 case nir_intrinsic_load_ubo_vec4:
1627 ntr_emit_load_ubo(c, instr);
1628 break;
1629
1630 /* Vertex */
1631 case nir_intrinsic_load_draw_id:
1632 case nir_intrinsic_load_invocation_id:
1633 case nir_intrinsic_load_frag_coord:
1634 case nir_intrinsic_load_point_coord:
1635 case nir_intrinsic_load_front_face:
1636 ntr_emit_load_sysval(c, instr);
1637 break;
1638
1639 case nir_intrinsic_load_input:
1640 case nir_intrinsic_load_per_vertex_input:
1641 case nir_intrinsic_load_interpolated_input:
1642 ntr_emit_load_input(c, instr);
1643 break;
1644
1645 case nir_intrinsic_store_output:
1646 case nir_intrinsic_store_per_vertex_output:
1647 ntr_emit_store_output(c, instr);
1648 break;
1649
1650 case nir_intrinsic_load_output:
1651 case nir_intrinsic_load_per_vertex_output:
1652 ntr_emit_load_output(c, instr);
1653 break;
1654
1655 case nir_intrinsic_discard:
1656 ntr_KILL(c);
1657 break;
1658
1659 case nir_intrinsic_discard_if: {
1660 struct ureg_src cond = ureg_scalar(ntr_get_src(c, instr->src[0]), 0);
1661 /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
1662 ntr_KILL_IF(c, ureg_negate(cond));
1663 break;
1664 }
1665 /* In TGSI we don't actually generate the barycentric coords, and emit
1666 * interp intrinsics later. However, we do need to store the
1667 * load_barycentric_at_* argument so that we can use it at that point.
1668 */
1669 case nir_intrinsic_load_barycentric_pixel:
1670 case nir_intrinsic_load_barycentric_centroid:
1671 case nir_intrinsic_load_barycentric_sample:
1672 break;
1673 case nir_intrinsic_load_barycentric_at_sample:
1674 case nir_intrinsic_load_barycentric_at_offset:
1675 ntr_store(c, &instr->def, ntr_get_src(c, instr->src[0]));
1676 break;
1677
1678 case nir_intrinsic_decl_reg:
1679 case nir_intrinsic_load_reg:
1680 case nir_intrinsic_load_reg_indirect:
1681 case nir_intrinsic_store_reg:
1682 case nir_intrinsic_store_reg_indirect:
1683 /* fully consumed */
1684 break;
1685
1686 default:
1687 fprintf(stderr, "Unknown intrinsic: ");
1688 nir_print_instr(&instr->instr, stderr);
1689 fprintf(stderr, "\n");
1690 break;
1691 }
1692 }
1693
1694 struct ntr_tex_operand_state {
1695 struct ureg_src srcs[4];
1696 unsigned i;
1697 };
1698
1699 static void
1700 ntr_push_tex_arg(struct ntr_compile *c,
1701 nir_tex_instr *instr,
1702 nir_tex_src_type tex_src_type,
1703 struct ntr_tex_operand_state *s)
1704 {
1705 int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
1706 if (tex_src < 0)
1707 return;
1708
1709 nir_src *src = &instr->src[tex_src].src;
1710 s->srcs[s->i++] = ntr_get_src(c, *src);
1711 }
1712
1713 static void
1714 ntr_emit_texture(struct ntr_compile *c, nir_tex_instr *instr)
1715 {
1716 struct ureg_dst dst = ntr_get_dest(c, &instr->def);
1717 enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array, instr->is_shadow);
1718 unsigned tex_opcode;
1719
1720 int tex_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_texture_handle);
1721 int sampler_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_handle);
1722
1723 struct ureg_src sampler;
1724 if (tex_handle_src >= 0 && sampler_handle_src >= 0) {
1725 /* It seems we can't get separate texture/sampler handles on GL, so just use one of them. */
1726 sampler = ntr_get_src(c, instr->src[tex_handle_src].src);
1727 assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
1728 } else {
1729 assert(tex_handle_src == -1 && sampler_handle_src == -1);
1730 sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
1731 int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
1732 if (sampler_src >= 0) {
1733 struct ureg_src reladdr = ntr_get_src(c, instr->src[sampler_src].src);
1734 sampler = ureg_src_indirect(sampler, ntr_reladdr(c, reladdr, 2));
1735 }
1736 }
1737
1738 switch (instr->op) {
1739 case nir_texop_tex:
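/* A backend1 source wider than the coordinate (plus comparator) means a
 * projector was packed in by nir_to_rc_lower_tex_instr, so emit TXP.
 */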
1740 if (nir_tex_instr_src_size(instr, nir_tex_instr_src_index(instr, nir_tex_src_backend1)) >
1741 MAX2(instr->coord_components, 2) + instr->is_shadow)
1742 tex_opcode = TGSI_OPCODE_TXP;
1743 else
1744 tex_opcode = TGSI_OPCODE_TEX;
1745 break;
1746 case nir_texop_txl:
1747 tex_opcode = TGSI_OPCODE_TXL;
1748 break;
1749 case nir_texop_txb:
1750 tex_opcode = TGSI_OPCODE_TXB;
1751 break;
1752 case nir_texop_txd:
1753 tex_opcode = TGSI_OPCODE_TXD;
1754 break;
1755 case nir_texop_txs:
1756 tex_opcode = TGSI_OPCODE_TXQ;
1757 break;
1758 case nir_texop_tg4:
1759 tex_opcode = TGSI_OPCODE_TG4;
1760 break;
1761 case nir_texop_query_levels:
1762 tex_opcode = TGSI_OPCODE_TXQ;
1763 break;
1764 case nir_texop_lod:
1765 tex_opcode = TGSI_OPCODE_LODQ;
1766 break;
1767 case nir_texop_texture_samples:
1768 tex_opcode = TGSI_OPCODE_TXQS;
1769 break;
1770 default:
1771 unreachable("unsupported tex op");
1772 }
1773
1774 struct ntr_tex_operand_state s = { .i = 0 };
1775 ntr_push_tex_arg(c, instr, nir_tex_src_backend1, &s);
1776 ntr_push_tex_arg(c, instr, nir_tex_src_backend2, &s);
1777
1778 /* non-coord arg for TXQ */
1779 if (tex_opcode == TGSI_OPCODE_TXQ) {
1780 ntr_push_tex_arg(c, instr, nir_tex_src_lod, &s);
1781 /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's
1782 * scalar
1783 */
1784 s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0);
1785 }
1786
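/* Two packed source vec4s (backend1 + backend2) mean the arguments spilled
 * past one register, so switch to the two-operand TGSI texture opcodes.
 */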
1787 if (s.i > 1) {
1788 if (tex_opcode == TGSI_OPCODE_TEX)
1789 tex_opcode = TGSI_OPCODE_TEX2;
1790 if (tex_opcode == TGSI_OPCODE_TXB)
1791 tex_opcode = TGSI_OPCODE_TXB2;
1792 if (tex_opcode == TGSI_OPCODE_TXL)
1793 tex_opcode = TGSI_OPCODE_TXL2;
1794 }
1795
1796 if (instr->op == nir_texop_txd) {
1797 /* Derivs appear in their own src args */
1798 int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
1799 int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
1800 s.srcs[s.i++] = ntr_get_src(c, instr->src[ddx].src);
1801 s.srcs[s.i++] = ntr_get_src(c, instr->src[ddy].src);
1802 }
1803
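/* Drivers either take the gather component in the sampler swizzle (the
 * PIPE_CAP below) or as an extra immediate source argument.
 */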
1804 if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
1805 if (c->screen->get_param(c->screen,
1806 PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) {
1807 sampler = ureg_scalar(sampler, instr->component);
1808 s.srcs[s.i++] = ureg_src_undef();
1809 } else {
1810 s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
1811 }
1812 }
1813
1814 s.srcs[s.i++] = sampler;
1815
1816 enum tgsi_return_type tex_type;
1817 switch (instr->dest_type) {
1818 case nir_type_float32:
1819 tex_type = TGSI_RETURN_TYPE_FLOAT;
1820 break;
1821 case nir_type_int32:
1822 tex_type = TGSI_RETURN_TYPE_SINT;
1823 break;
1824 case nir_type_uint32:
1825 tex_type = TGSI_RETURN_TYPE_UINT;
1826 break;
1827 default:
1828 unreachable("unknown texture type");
1829 }
1830
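/* TXQ returns the level count in .w, so query_levels writes a temp here and
 * the result is swizzled into the real destination after the instruction.
 */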
1831 struct ureg_dst tex_dst;
1832 if (instr->op == nir_texop_query_levels)
1833 tex_dst = ureg_writemask(ntr_temp(c), TGSI_WRITEMASK_W);
1834 else
1835 tex_dst = dst;
1836
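/* Pad the remaining source slots so the fixed-size src[] array is fully
 * initialized before building the instruction.
 */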
1837 while (s.i < 4)
1838 s.srcs[s.i++] = ureg_src_undef();
1839
1840 struct ntr_insn *insn = ntr_insn(c, tex_opcode, tex_dst, s.srcs[0], s.srcs[1], s.srcs[2], s.srcs[3]);
1841 insn->tex_target = target;
1842 insn->tex_return_type = tex_type;
1843 insn->is_tex = true;
1844
1845 int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
1846 if (tex_offset_src >= 0) {
1847 struct ureg_src offset = ntr_get_src(c, instr->src[tex_offset_src].src);
1848
1849 insn->tex_offset[0].File = offset.File;
1850 insn->tex_offset[0].Index = offset.Index;
1851 insn->tex_offset[0].SwizzleX = offset.SwizzleX;
1852 insn->tex_offset[0].SwizzleY = offset.SwizzleY;
1853 insn->tex_offset[0].SwizzleZ = offset.SwizzleZ;
1854 insn->tex_offset[0].Padding = 0;
1855 }
1856
1857 if (nir_tex_instr_has_explicit_tg4_offsets(instr)) {
1858 for (uint8_t i = 0; i < 4; ++i) {
1859 struct ureg_src imm = ureg_imm2i(c->ureg, instr->tg4_offsets[i][0], instr->tg4_offsets[i][1]);
1860 insn->tex_offset[i].File = imm.File;
1861 insn->tex_offset[i].Index = imm.Index;
1862 insn->tex_offset[i].SwizzleX = imm.SwizzleX;
1863 insn->tex_offset[i].SwizzleY = imm.SwizzleY;
1864 insn->tex_offset[i].SwizzleZ = imm.SwizzleZ;
1865 }
1866 }
1867
1868 if (instr->op == nir_texop_query_levels)
1869 ntr_MOV(c, dst, ureg_scalar(ureg_src(tex_dst), 3));
1870 }
1871
1872 static void
1873 ntr_emit_jump(struct ntr_compile *c, nir_jump_instr *jump)
1874 {
1875 switch (jump->type) {
1876 case nir_jump_break:
1877 ntr_BRK(c);
1878 break;
1879
1880 case nir_jump_continue:
1881 ntr_CONT(c);
1882 break;
1883
1884 default:
1885 fprintf(stderr, "Unknown jump instruction: ");
1886 nir_print_instr(&jump->instr, stderr);
1887 fprintf(stderr, "\n");
1888 abort();
1889 }
1890 }
1891
1892 static void
1893 ntr_emit_ssa_undef(struct ntr_compile *c, nir_undef_instr *instr)
1894 {
1895 /* Nothing to do but make sure that we have some storage to deref. */
1896 (void)ntr_get_ssa_def_decl(c, &instr->def);
1897 }
1898
1899 static void
1900 ntr_emit_instr(struct ntr_compile *c, nir_instr *instr)
1901 {
1902 switch (instr->type) {
1903 case nir_instr_type_deref:
1904 /* ignored, will be walked by nir_intrinsic_image_*_deref. */
1905 break;
1906
1907 case nir_instr_type_alu:
1908 ntr_emit_alu(c, nir_instr_as_alu(instr));
1909 break;
1910
1911 case nir_instr_type_intrinsic:
1912 ntr_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
1913 break;
1914
1915 case nir_instr_type_load_const:
1916 /* Nothing to do here, as load consts are done directly from
1917 * ntr_get_src() (since many constant NIR srcs will often get folded
1918 * directly into a register file index instead of as a TGSI src).
1919 */
1920 break;
1921
1922 case nir_instr_type_tex:
1923 ntr_emit_texture(c, nir_instr_as_tex(instr));
1924 break;
1925
1926 case nir_instr_type_jump:
1927 ntr_emit_jump(c, nir_instr_as_jump(instr));
1928 break;
1929
1930 case nir_instr_type_undef:
1931 ntr_emit_ssa_undef(c, nir_instr_as_undef(instr));
1932 break;
1933
1934 default:
1935 fprintf(stderr, "Unknown NIR instr type: ");
1936 nir_print_instr(instr, stderr);
1937 fprintf(stderr, "\n");
1938 abort();
1939 }
1940 }
1941
1942 static void
1943 ntr_emit_if(struct ntr_compile *c, nir_if *if_stmt)
1944 {
1945 ntr_IF(c, c->if_cond);
1946
1947 ntr_emit_cf_list(c, &if_stmt->then_list);
1948
1949 if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
1950 ntr_ELSE(c);
1951 ntr_emit_cf_list(c, &if_stmt->else_list);
1952 }
1953
1954 ntr_ENDIF(c);
1955 }
1956
1957 static void
1958 ntr_emit_loop(struct ntr_compile *c, nir_loop *loop)
1959 {
1960 assert(!nir_loop_has_continue_construct(loop));
1961 ntr_BGNLOOP(c);
1962 ntr_emit_cf_list(c, &loop->body);
1963 ntr_ENDLOOP(c);
1964 }
1965
1966 static void
1967 ntr_emit_block(struct ntr_compile *c, nir_block *block)
1968 {
1969 struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
1970 c->cur_block = ntr_block;
1971
1972 nir_foreach_instr(instr, block) {
1973 ntr_emit_instr(c, instr);
1974
1975 /* Sanity check that we didn't accidentally ureg_OPCODE() instead of ntr_OPCODE(). */
1976 if (ureg_get_instruction_number(c->ureg) != 0) {
1977 fprintf(stderr, "Emitted ureg insn during: ");
1978 nir_print_instr(instr, stderr);
1979 fprintf(stderr, "\n");
1980 unreachable("emitted ureg insn");
1981 }
1982 }
1983
1984 /* Set up the if condition for ntr_emit_if(), which we have to do before
1985 * freeing up the temps (the "if" is treated as inside the block for liveness
1986 * purposes, despite not being an instruction)
1987 *
1988 * Note that, while IF and UIF are supposed to look at only .x, virglrenderer
1989 * looks at all of .xyzw. No harm in working around the bug.
1990 */
1991 nir_if *nif = nir_block_get_following_if(block);
1992 if (nif)
1993 c->if_cond = ureg_scalar(ntr_get_src(c, nif->condition), TGSI_SWIZZLE_X);
1994 }
1995
1996 static void
1997 ntr_emit_cf_list(struct ntr_compile *c, struct exec_list *list)
1998 {
1999 foreach_list_typed(nir_cf_node, node, node, list) {
2000 switch (node->type) {
2001 case nir_cf_node_block:
2002 ntr_emit_block(c, nir_cf_node_as_block(node));
2003 break;
2004
2005 case nir_cf_node_if:
2006 ntr_emit_if(c, nir_cf_node_as_if(node));
2007 break;
2008
2009 case nir_cf_node_loop:
2010 ntr_emit_loop(c, nir_cf_node_as_loop(node));
2011 break;
2012
2013 default:
2014 unreachable("unknown CF type");
2015 }
2016 }
2017 }
2018
2019 static void
2020 ntr_emit_block_ureg(struct ntr_compile *c, struct nir_block *block)
2021 {
2022 struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
2023
2024 /* Emit the ntr insns to tgsi_ureg. */
2025 util_dynarray_foreach(&ntr_block->insns, struct ntr_insn, insn) {
2026 const struct tgsi_opcode_info *opcode_info =
2027 tgsi_get_opcode_info(insn->opcode);
2028
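/* IF/ELSE/ENDIF need the preceding branch's label patched once the target
 * instruction number is known; loops and ordinary opcodes are emitted as-is.
 */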
2029 switch (insn->opcode) {
2030 case TGSI_OPCODE_IF:
2031 ureg_IF(c->ureg, insn->src[0], &c->cf_label);
2032 break;
2033
2034 case TGSI_OPCODE_ELSE:
2035 ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
2036 ureg_ELSE(c->ureg, &c->cf_label);
2037 c->current_if_else = c->cf_label;
2038 break;
2039
2040 case TGSI_OPCODE_ENDIF:
2041 ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
2042 ureg_ENDIF(c->ureg);
2043 break;
2044
2045 case TGSI_OPCODE_BGNLOOP:
2046 /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
2047 * does reference BGNLOOP's. Follow the former behavior unless something comes up
2048 * with a need.
2049 */
2050 ureg_BGNLOOP(c->ureg, &c->cf_label);
2051 break;
2052
2053 case TGSI_OPCODE_ENDLOOP:
2054 ureg_ENDLOOP(c->ureg, &c->cf_label);
2055 break;
2056
2057 default:
2058 if (insn->is_tex) {
2059 int num_offsets = 0;
2060 for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
2061 if (insn->tex_offset[i].File != TGSI_FILE_NULL)
2062 num_offsets = i + 1;
2063 }
2064 ureg_tex_insn(c->ureg, insn->opcode,
2065 insn->dst, opcode_info->num_dst,
2066 insn->tex_target, insn->tex_return_type,
2067 insn->tex_offset,
2068 num_offsets,
2069 insn->src, opcode_info->num_src);
2070 } else {
2071 ureg_insn(c->ureg, insn->opcode,
2072 insn->dst, opcode_info->num_dst,
2073 insn->src, opcode_info->num_src,
2074 insn->precise);
2075 }
2076 }
2077 }
2078 }
2079
2080 static void
2081 ntr_emit_if_ureg(struct ntr_compile *c, nir_if *if_stmt)
2082 {
2083 /* Note: the last block emitted our IF opcode. */
2084
2085 int if_stack = c->current_if_else;
2086 c->current_if_else = c->cf_label;
2087
2088 /* Either the then or else block includes the ENDIF, which will fix up the
2089 * IF(/ELSE)'s label for jumping
2090 */
2091 ntr_emit_cf_list_ureg(c, &if_stmt->then_list);
2092 ntr_emit_cf_list_ureg(c, &if_stmt->else_list);
2093
2094 c->current_if_else = if_stack;
2095 }
2096
2097 static void
2098 ntr_emit_cf_list_ureg(struct ntr_compile *c, struct exec_list *list)
2099 {
2100 foreach_list_typed(nir_cf_node, node, node, list) {
2101 switch (node->type) {
2102 case nir_cf_node_block:
2103 ntr_emit_block_ureg(c, nir_cf_node_as_block(node));
2104 break;
2105
2106 case nir_cf_node_if:
2107 ntr_emit_if_ureg(c, nir_cf_node_as_if(node));
2108 break;
2109
2110 case nir_cf_node_loop:
2111 /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
2112 * does reference BGNLOOP's. Follow the former behavior unless something comes up
2113 * with a need.
2114 */
2115 ntr_emit_cf_list_ureg(c, &nir_cf_node_as_loop(node)->body);
2116 break;
2117
2118 default:
2119 unreachable("unknown CF type");
2120 }
2121 }
2122 }
2123
2124 static void
2125 ntr_emit_impl(struct ntr_compile *c, nir_function_impl *impl)
2126 {
2127 c->impl = impl;
2128
2129 c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc);
2130 c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->ssa_alloc);
2131
2132 /* Set up the struct ntr_blocks to put insns in */
2133 c->blocks = _mesa_pointer_hash_table_create(c);
2134 nir_foreach_block(block, impl) {
2135 struct ntr_block *ntr_block = rzalloc(c->blocks, struct ntr_block);
2136 util_dynarray_init(&ntr_block->insns, ntr_block);
2137 _mesa_hash_table_insert(c->blocks, block, ntr_block);
2138 }
2139
2140
2141 ntr_setup_registers(c);
2142
2143 c->cur_block = ntr_block_from_nir(c, nir_start_block(impl));
2144 ntr_setup_inputs(c);
2145 ntr_setup_outputs(c);
2146 ntr_setup_uniforms(c);
2147
2148 /* Emit the ntr insns */
2149 ntr_emit_cf_list(c, &impl->body);
2150
2151 /* Skip the optimized RA if the driver requests it, unless the number of
2152 * temps is too large to be covered by the 16-bit signed int that TGSI
2153 * allocates for the register index. */
2154 if (!c->options->unoptimized_ra || c->num_temps > 0x7fff)
2155 ntr_allocate_regs(c, impl);
2156 else
2157 ntr_allocate_regs_unoptimized(c, impl);
2158
2159 /* Turn the ntr insns into actual TGSI tokens */
2160 ntr_emit_cf_list_ureg(c, &impl->body);
2161
2162 ralloc_free(c->liveness);
2163 c->liveness = NULL;
2164
2165 }
2166
2167 static int
2168 type_size(const struct glsl_type *type, bool bindless)
2169 {
2170 return glsl_count_attribute_slots(type, false);
2171 }
2172
2173 /* Allow vectorizing ALU instructions up to the vec4 width TGSI operates on.
2174 */
2175 static uint8_t
2176 ntr_should_vectorize_instr(const nir_instr *instr, const void *data)
2177 {
2178 if (instr->type != nir_instr_type_alu)
2179 return 0;
2180
2181 return 4;
2182 }
2183
2184 static bool
2185 ntr_should_vectorize_io(unsigned align, unsigned bit_size,
2186 unsigned num_components, unsigned high_offset,
2187 nir_intrinsic_instr *low, nir_intrinsic_instr *high,
2188 void *data)
2189 {
2190 if (bit_size != 32)
2191 return false;
2192
2193 /* Our offset alignment should always be at least 4 bytes */
2194 if (align < 4)
2195 return false;
2196
2197 /* No wrapping off the end of a TGSI reg. We could do a bit better by
2198 * looking at low's actual offset. XXX: With LOAD_CONSTBUF maybe we don't
2199 * need this restriction.
2200 */
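/* align == 4 means the access could start at any component (worst case .w);
 * otherwise take align / 4 as a conservative worst-case start component.
 */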
2201 unsigned worst_start_component = align == 4 ? 3 : align / 4;
2202 if (worst_start_component + num_components > 4)
2203 return false;
2204
2205 return true;
2206 }
2207
2208 static nir_variable_mode
2209 ntr_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
2210 {
2211 unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
2212 unsigned indirect_mask = 0;
2213
2214 if (!screen->get_shader_param(screen, pipe_stage,
2215 PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) {
2216 indirect_mask |= nir_var_shader_in;
2217 }
2218
2219 if (!screen->get_shader_param(screen, pipe_stage,
2220 PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) {
2221 indirect_mask |= nir_var_shader_out;
2222 }
2223
2224 if (!screen->get_shader_param(screen, pipe_stage,
2225 PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
2226 indirect_mask |= nir_var_function_temp;
2227 }
2228
2229 return indirect_mask;
2230 }
2231
2232 struct ntr_lower_tex_state {
2233 nir_scalar channels[8];
2234 unsigned i;
2235 };
2236
2237 static void
2238 nir_to_rc_lower_tex_instr_arg(nir_builder *b,
2239 nir_tex_instr *instr,
2240 nir_tex_src_type tex_src_type,
2241 struct ntr_lower_tex_state *s)
2242 {
2243 int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
2244 if (tex_src < 0)
2245 return;
2246
2247 nir_def *def = instr->src[tex_src].src.ssa;
2248 for (int i = 0; i < def->num_components; i++) {
2249 s->channels[s->i++] = nir_get_scalar(def, i);
2250 }
2251
2252 nir_tex_instr_remove_src(instr, tex_src);
2253 }
2254
2255 /**
2256 * Merges together a vec4 of tex coordinate/compare/bias/lod into a backend tex
2257 * src. This lets NIR handle the coalescing of the vec4 rather than trying to
2258 * manage it on our own, and may lead to more vectorization.
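 * For example, a 2D shadow sample with bias packs coord.xy into channels 0-1,
 * the comparator into channel 2 and the bias into channel 3, giving a single
 * backend1 vec4.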
2259 */
2260 static bool
2261 nir_to_rc_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data)
2262 {
2263 if (instr->type != nir_instr_type_tex)
2264 return false;
2265
2266 nir_tex_instr *tex = nir_instr_as_tex(instr);
2267
2268 if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0)
2269 return false;
2270
2271 b->cursor = nir_before_instr(instr);
2272
2273 struct ntr_lower_tex_state s = {0};
2274
2275 nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_coord, &s);
2276 /* We always have at least two slots for the coordinate, even on 1D. */
2277 s.i = MAX2(s.i, 2);
2278
2279 nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_comparator, &s);
2280 s.i = MAX2(s.i, 3);
2281
2282 nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_bias, &s);
2283
2284 /* XXX: LZ */
2285 nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s);
2286 nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s);
2287 nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s);
2288
2289 /* No need to pack undefs in unused channels of the tex instr */
2290 while (!s.channels[s.i - 1].def)
2291 s.i--;
2292
2293 /* Instead of putting undefs in the unused slots of the vecs, just put in
2294 * another used channel. Otherwise, we'll get unnecessary moves into
2295 * registers.
2296 */
2297 assert(s.channels[0].def != NULL);
2298 for (int i = 1; i < s.i; i++) {
2299 if (!s.channels[i].def)
2300 s.channels[i] = s.channels[0];
2301 }
2302
2303 nir_tex_instr_add_src(tex, nir_tex_src_backend1,
2304 nir_vec_scalars(b, s.channels, MIN2(s.i, 4)));
2305 if (s.i > 4)
2306 nir_tex_instr_add_src(tex, nir_tex_src_backend2,
2307 nir_vec_scalars(b, &s.channels[4], s.i - 4));
2308
2309 return true;
2310 }
2311
2312 static bool
2313 nir_to_rc_lower_tex(nir_shader *s)
2314 {
2315 return nir_shader_instructions_pass(s,
2316 nir_to_rc_lower_tex_instr,
2317 nir_metadata_block_index |
2318 nir_metadata_dominance,
2319 NULL);
2320 }
2321
2322 /* Lowers texture projectors if we can't do them as TGSI_OPCODE_TXP. */
2323 static void
2324 nir_to_rc_lower_txp(nir_shader *s)
2325 {
2326 nir_lower_tex_options lower_tex_options = {
2327 .lower_txp = 0,
2328 };
2329
2330 nir_foreach_block(block, nir_shader_get_entrypoint(s)) {
2331 nir_foreach_instr(instr, block) {
2332 if (instr->type != nir_instr_type_tex)
2333 continue;
2334 nir_tex_instr *tex = nir_instr_as_tex(instr);
2335
2336 if (nir_tex_instr_src_index(tex, nir_tex_src_projector) < 0)
2337 continue;
2338
2339 bool has_compare = nir_tex_instr_src_index(tex, nir_tex_src_comparator) >= 0;
2340 bool has_lod = nir_tex_instr_src_index(tex, nir_tex_src_lod) >= 0 || s->info.stage != MESA_SHADER_FRAGMENT;
2341 bool has_offset = nir_tex_instr_src_index(tex, nir_tex_src_offset) >= 0;
2342
2343 /* We can do TXP only for plain nir_texop_tex where we can fit all the
2344 * coordinates and the comparator and projector in one vec4 without any
2345 * other modifiers to add on.
2346 *
2347 * nir_lower_tex() only handles the lowering on a sampler-dim basis, so
2348 * if we get any funny projectors then we just blow them all away.
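 * For example, textureProj() on a sampler2D fits .xy plus the projector in
 * one vec4 and keeps its projector for TXP, while adding an offset, an
 * explicit LOD, or a shadow comparator on a 3-coordinate target forces the
 * projector to be lowered away here.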
2349 */
2350 if (tex->op != nir_texop_tex || has_lod || has_offset || (tex->coord_components >= 3 && has_compare))
2351 lower_tex_options.lower_txp |= 1 << tex->sampler_dim;
2352 }
2353 }
2354
2355 /* nir_lower_tex must be run even if no options are set, because we need the
2356 * LOD to be set for query_levels and for non-fragment shaders.
2357 */
2358 NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
2359 }
2360
2361 const void *
2362 nir_to_rc(struct nir_shader *s,
2363 struct pipe_screen *screen)
2364 {
2365 static const struct nir_to_rc_options default_ntr_options = {0};
2366 return nir_to_rc_options(s, screen, &default_ntr_options);
2367 }
2368
2369 /**
2370 * Translates the NIR shader to TGSI.
2371 *
2372 * This requires some lowering of the NIR shader to prepare it for translation.
2373 * We take ownership of the NIR shader passed, returning a reference to the new
2374 * TGSI tokens instead. If you need to keep the NIR, then pass us a clone.
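 *
 * A minimal sketch of a call site (the shader and screen variables here are
 * assumptions, not part of this file):
 *
 *    nir_shader *clone = nir_shader_clone(NULL, shader);
 *    const void *tokens = nir_to_rc_options(clone, pscreen, &ntr_options);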
2375 */
2376 const void *nir_to_rc_options(struct nir_shader *s,
2377 struct pipe_screen *screen,
2378 const struct nir_to_rc_options *options)
2379 {
2380 struct ntr_compile *c;
2381 const void *tgsi_tokens;
2382 bool is_r500 = r300_screen(screen)->caps.is_r500;
2383 nir_variable_mode no_indirects_mask = ntr_no_indirects_mask(s, screen);
2384
2385 /* Lower array indexing on FS inputs. Since we don't set
2386 * ureg->supports_any_inout_decl_range, the TGSI input decls will be split to
2387 * elements by ureg, and so dynamically indexing them would be invalid.
2388 * Ideally we would set that ureg flag based on
2389 * PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE, but can't due to mesa/st
2390 * splitting NIR VS outputs to elements even if the FS doesn't get the
2391 * corresponding splitting, and virgl depends on TGSI across link boundaries
2392 * having matching declarations.
2393 */
2394 if (s->info.stage == MESA_SHADER_FRAGMENT) {
2395 NIR_PASS_V(s, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX);
2396 NIR_PASS_V(s, nir_remove_dead_variables, nir_var_shader_in, NULL);
2397 }
2398
2399 NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
2400 type_size, (nir_lower_io_options)0);
2401
2402 nir_to_rc_lower_txp(s);
2403 NIR_PASS_V(s, nir_to_rc_lower_tex);
2404
2405 if (!s->options->lower_uniforms_to_ubo) {
2406 NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
2407 screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
2408 true);
2409 }
2410
2411 if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
2412 NIR_PASS_V(s, nir_lower_ubo_vec4);
2413
2414 bool progress;
2415 NIR_PASS_V(s, nir_opt_constant_folding);
2416
2417 /* Clean up after trigonometric input normalization. */
2418 NIR_PASS_V(s, nir_opt_vectorize, ntr_should_vectorize_instr, NULL);
2419 do {
2420 progress = false;
2421 NIR_PASS(progress, s, nir_opt_shrink_vectors);
2422 } while (progress);
2423 NIR_PASS_V(s, nir_copy_prop);
2424 NIR_PASS_V(s, nir_opt_cse);
2425 NIR_PASS_V(s, nir_opt_dce);
2426 NIR_PASS_V(s, nir_opt_shrink_stores, true);
2427
2428 NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);
2429
2430 /* Lower demote_if to if (cond) { demote } because TGSI doesn't have a DEMOTE_IF. */
2431 NIR_PASS_V(s, nir_lower_discard_if, nir_lower_demote_if_to_cf);
2432
2433 NIR_PASS_V(s, nir_lower_frexp);
2434
2435 do {
2436 progress = false;
2437 NIR_PASS(progress, s, nir_opt_algebraic_late);
2438 if (progress) {
2439 NIR_PASS_V(s, nir_copy_prop);
2440 NIR_PASS_V(s, nir_opt_dce);
2441 NIR_PASS_V(s, nir_opt_cse);
2442 }
2443 } while (progress);
2444
2445 if (s->info.stage == MESA_SHADER_FRAGMENT) {
2446 NIR_PASS_V(s, r300_nir_prepare_presubtract);
2447 }
2448
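/* r300-class hardware has no native integer support, so lower the remaining
 * integer operations to float before translation.
 */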
2449 NIR_PASS_V(s, nir_lower_int_to_float);
2450 NIR_PASS_V(s, nir_copy_prop);
2451 NIR_PASS_V(s, r300_nir_post_integer_lowering);
2452 NIR_PASS_V(s, nir_lower_bool_to_float,
2453 !options->lower_cmp && !options->lower_fabs);
2454 /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
2455 NIR_PASS_V(s, nir_copy_prop);
2456 /* CSE cleanup after late ftrunc lowering. */
2457 NIR_PASS_V(s, nir_opt_cse);
2458 /* At this point we need to clean up:
2459 * a) fcsel_gt instructions that come from the ftrunc lowering on R300,
2460 * b) all flavours of fcsel that read three different temp sources on R500.
2461 */
2462 if (s->info.stage == MESA_SHADER_VERTEX) {
2463 if (is_r500)
2464 NIR_PASS_V(s, r300_nir_lower_fcsel_r500);
2465 else
2466 NIR_PASS_V(s, r300_nir_lower_fcsel_r300);
2467 NIR_PASS_V(s, r300_nir_lower_flrp);
2468 } else {
2469 NIR_PASS_V(s, r300_nir_lower_comparison_fs);
2470 }
2471 NIR_PASS_V(s, r300_nir_opt_algebraic_late);
2472 NIR_PASS_V(s, nir_opt_dce);
2473
2474 nir_move_options move_all =
2475 nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
2476 nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
2477
2478 NIR_PASS_V(s, nir_opt_move, move_all);
2479 NIR_PASS_V(s, nir_move_vec_src_uses_to_dest, true);
2480 /* Late vectorizing after nir_move_vec_src_uses_to_dest helps instructions but
2481 * increases register usage. Testing shows this is beneficial only in VS.
2482 */
2483 if (s->info.stage == MESA_SHADER_VERTEX)
2484 NIR_PASS_V(s, nir_opt_vectorize, ntr_should_vectorize_instr, NULL);
2485
2486 NIR_PASS_V(s, nir_convert_from_ssa, true);
2487 NIR_PASS_V(s, nir_lower_vec_to_regs, NULL, NULL);
2488
2489 /* nir_lower_locals_to_regs will leave dead derefs that are good to clean up.
2490 */
2491 NIR_PASS_V(s, nir_lower_locals_to_regs, 32);
2492 NIR_PASS_V(s, nir_opt_dce);
2493
2494 /* See comment in ntr_get_alu_src for supported modifiers */
2495 NIR_PASS_V(s, nir_legacy_trivialize, !options->lower_fabs);
2496
2497 if (NIR_DEBUG(TGSI)) {
2498 fprintf(stderr, "NIR before translation to TGSI:\n");
2499 nir_print_shader(s, stderr);
2500 }
2501
2502 c = rzalloc(NULL, struct ntr_compile);
2503 c->screen = screen;
2504 c->options = options;
2505
2506 c->s = s;
2507 c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
2508 ureg_setup_shader_info(c->ureg, &s->info);
2509 if (s->info.use_legacy_math_rules && screen->get_param(screen, PIPE_CAP_LEGACY_MATH_RULES))
2510 ureg_property(c->ureg, TGSI_PROPERTY_LEGACY_MATH_RULES, 1);
2511
2512 if (s->info.stage == MESA_SHADER_FRAGMENT) {
2513 /* The draw module's polygon stipple layer doesn't respect the chosen
2514 * coordinate mode, so leave it as unspecified unless we're actually
2515 * reading the position in the shader already. See
2516 * gl-2.1-polygon-stipple-fs on softpipe.
2517 */
2518 if ((s->info.inputs_read & VARYING_BIT_POS) ||
2519 BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
2520 ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
2521 s->info.fs.origin_upper_left ?
2522 TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
2523 TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
2524
2525 ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
2526 s->info.fs.pixel_center_integer ?
2527 TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
2528 TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
2529 }
2530 }
2531 /* Emit the main function */
2532 nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
2533 ntr_emit_impl(c, impl);
2534 ureg_END(c->ureg);
2535
2536 tgsi_tokens = ureg_get_tokens(c->ureg, NULL);
2537
2538 if (NIR_DEBUG(TGSI)) {
2539 fprintf(stderr, "TGSI after translation from NIR:\n");
2540 tgsi_dump(tgsi_tokens, 0);
2541 }
2542
2543 ureg_destroy(c->ureg);
2544
2545 ralloc_free(c);
2546 ralloc_free(s);
2547
2548 return tgsi_tokens;
2549 }
2550