1 /*
2  * Copyright © 2014-2015 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "compiler/nir/nir.h"
25 #include "compiler/nir/nir_deref.h"
26 #include "compiler/nir/nir_legacy.h"
27 #include "compiler/nir/nir_worklist.h"
28 #include "nir_to_rc.h"
29 #include "r300_nir.h"
30 #include "r300_screen.h"
31 #include "pipe/p_screen.h"
32 #include "pipe/p_state.h"
33 #include "tgsi/tgsi_dump.h"
34 #include "tgsi/tgsi_from_mesa.h"
35 #include "tgsi/tgsi_info.h"
36 #include "tgsi/tgsi_parse.h"
37 #include "tgsi/tgsi_ureg.h"
38 #include "tgsi/tgsi_util.h"
39 #include "util/u_debug.h"
40 #include "util/u_math.h"
41 #include "util/u_memory.h"
42 #include "util/u_dynarray.h"
43 
44 struct ntr_insn {
45    enum tgsi_opcode opcode;
46    struct ureg_dst dst[2];
47    struct ureg_src src[4];
48    enum tgsi_texture_type tex_target;
49    enum tgsi_return_type tex_return_type;
50    struct tgsi_texture_offset tex_offset[4];
51 
52    unsigned mem_qualifier;
53    enum pipe_format mem_format;
54 
55    bool is_tex : 1;
56    bool precise : 1;
57 };
58 
59 struct ntr_block {
60    /* Array of struct ntr_insn */
61    struct util_dynarray insns;
62    int start_ip;
63    int end_ip;
64 };
65 
66 struct ntr_reg_interval {
67    uint32_t start, end;
68 };
69 
70 struct ntr_compile {
71    nir_shader *s;
72    nir_function_impl *impl;
73    const struct nir_to_rc_options *options;
74    struct pipe_screen *screen;
75    struct ureg_program *ureg;
76 
77    bool addr_declared[3];
78    struct ureg_dst addr_reg[3];
79 
80    /* if condition set up at the end of a block, for ntr_emit_if(). */
81    struct ureg_src if_cond;
82 
83    /* TGSI temps for our NIR SSA and register values. */
84    struct ureg_dst *reg_temp;
85    struct ureg_src *ssa_temp;
86 
87    struct ntr_reg_interval *liveness;
88 
89    /* Map from nir_block to ntr_block */
90    struct hash_table *blocks;
91    struct ntr_block *cur_block;
92    unsigned current_if_else;
93    unsigned cf_label;
94 
95    /* Whether we're currently emitting instructions for a precise NIR instruction. */
96    bool precise;
97 
98    unsigned num_temps;
99    unsigned first_non_array_temp;
100 
101    /* Mappings from driver_location to TGSI input/output number.
102     *
103     * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
104     * their numbers assigned incrementally, unlike inputs or constants.
105     */
106    struct ureg_src *input_index_map;
107    uint64_t centroid_inputs;
108 
109    uint32_t first_ubo;
110 };
111 
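/* Allocates a new virtual temporary in the compiler's own numbering; these are
 * mapped onto real ureg temporaries later, during register allocation.
 */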
112 static struct ureg_dst
113 ntr_temp(struct ntr_compile *c)
114 {
115    return ureg_dst_register(TGSI_FILE_TEMPORARY, c->num_temps++);
116 }
117 
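/* Looks up the ntr_block recorded for a nir_block in c->blocks. */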
118 static struct ntr_block *
119 ntr_block_from_nir(struct ntr_compile *c, struct nir_block *block)
120 {
121    struct hash_entry *entry = _mesa_hash_table_search(c->blocks, block);
122    return entry->data;
123 }
124 
125 static void ntr_emit_cf_list(struct ntr_compile *c, struct exec_list *list);
126 static void ntr_emit_cf_list_ureg(struct ntr_compile *c, struct exec_list *list);
127 
128 static struct ntr_insn *
129 ntr_insn(struct ntr_compile *c, enum tgsi_opcode opcode,
130          struct ureg_dst dst,
131          struct ureg_src src0, struct ureg_src src1,
132          struct ureg_src src2, struct ureg_src src3)
133 {
134    struct ntr_insn insn = {
135       .opcode = opcode,
136       .dst = { dst, ureg_dst_undef() },
137       .src = { src0, src1, src2, src3 },
138       .precise = c->precise,
139    };
140    util_dynarray_append(&c->cur_block->insns, struct ntr_insn, insn);
141    return util_dynarray_top_ptr(&c->cur_block->insns, struct ntr_insn);
142 }
143 
144 #define OP00( op )                                                                     \
145 static inline void ntr_##op(struct ntr_compile *c)                                     \
146 {                                                                                      \
147    ntr_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
148 }
149 
150 #define OP01( op )                                                                     \
151 static inline void ntr_##op(struct ntr_compile *c,                                     \
152                      struct ureg_src src0)                                             \
153 {                                                                                      \
154    ntr_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
155 }
156 
157 
158 #define OP10( op )                                                                     \
159 static inline void ntr_##op(struct ntr_compile *c,                                     \
160                      struct ureg_dst dst)                                              \
161 {                                                                                      \
162    ntr_insn(c, TGSI_OPCODE_##op, dst, ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
163 }
164 
165 #define OP11( op )                                                                     \
166 static inline void ntr_##op(struct ntr_compile *c,                                     \
167                      struct ureg_dst dst,                                              \
168                      struct ureg_src src0)                                             \
169 {                                                                                      \
170    ntr_insn(c, TGSI_OPCODE_##op, dst, src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
171 }
172 
173 #define OP12( op )                                                                     \
174 static inline void ntr_##op(struct ntr_compile *c,                                     \
175                      struct ureg_dst dst,                                              \
176                      struct ureg_src src0,                                             \
177                      struct ureg_src src1)                                             \
178 {                                                                                      \
179    ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, ureg_src_undef(), ureg_src_undef()); \
180 }
181 
182 #define OP13( op )                                                                     \
183 static inline void ntr_##op(struct ntr_compile *c,                                     \
184                      struct ureg_dst dst,                                              \
185                      struct ureg_src src0,                                             \
186                      struct ureg_src src1,                                             \
187                      struct ureg_src src2)                                             \
188 {                                                                                      \
189    ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, ureg_src_undef());             \
190 }
191 
192 #define OP14( op )                                                                     \
193 static inline void ntr_##op(struct ntr_compile *c,                                     \
194                      struct ureg_dst dst,                                              \
195                      struct ureg_src src0,                                             \
196                      struct ureg_src src1,                                             \
197                      struct ureg_src src2,                                             \
198                      struct ureg_src src3)                                             \
199 {                                                                                      \
200    ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, src3);                         \
201 }
202 
203 /* We hand-craft our tex instructions */
204 #define OP12_TEX(op)
205 #define OP14_TEX(op)
206 
207 /* Use a template include to generate a correctly-typed ntr_OP()
208  * function for each TGSI opcode:
209  */
210 #include "gallium/auxiliary/tgsi/tgsi_opcode_tmp.h"
211 
212 /**
213  * Interprets a nir_load_const used as a NIR src as a uint.
214  *
215  * For non-native-integers drivers, nir_load_const_instrs used by an integer ALU
216  * instruction (or in a phi-web used by an integer ALU instruction) were
217  * converted to floats and the ALU instruction swapped to the float equivalent.
218  * However, this means that integer load_consts used by intrinsics (which don't
219  * normally get that conversion) may have been reformatted to be floats.  Given
220  * that all of our intrinsic nir_src_as_uint() calls are expected to be small,
221  * we can just look and see if they look like floats and convert them back to
222  * ints.
223  */
224 static uint32_t
225 ntr_src_as_uint(struct ntr_compile *c, nir_src src)
226 {
227    uint32_t val = nir_src_as_uint(src);
228    if (val >= fui(1.0))
229       val = (uint32_t)uif(val);
230    return val;
231 }
232 
233 /* Per-channel masks of def/use within the block, and the per-channel
234  * livein/liveout for the block as a whole.
235  */
236 struct ntr_live_reg_block_state {
237    uint8_t *def, *use, *livein, *liveout, *defin, *defout;
238 };
239 
240 struct ntr_live_reg_state {
241    unsigned bitset_words;
242 
243    struct ntr_reg_interval *regs;
244 
245    /* Used in propagate_across_edge() */
246    BITSET_WORD *tmp_live;
247 
248    struct ntr_live_reg_block_state *blocks;
249 
250    nir_block_worklist worklist;
251 };
252 
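/* Records a read of a temp: channels not already screened off by a def earlier
 * in the block become uses, and the temp's live interval is extended to this ip.
 */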
253 static void
254 ntr_live_reg_mark_use(struct ntr_compile *c, struct ntr_live_reg_block_state *bs,
255                       int ip, unsigned index, unsigned used_mask)
256 {
257    bs->use[index] |= used_mask & ~bs->def[index];
258 
259    c->liveness[index].start = MIN2(c->liveness[index].start, ip);
260    c->liveness[index].end = MAX2(c->liveness[index].end, ip);
261 
262 }
263 static void
264 ntr_live_reg_setup_def_use(struct ntr_compile *c, nir_function_impl *impl, struct ntr_live_reg_state *state)
265 {
266    for (int i = 0; i < impl->num_blocks; i++) {
267       state->blocks[i].def = rzalloc_array(state->blocks, uint8_t, c->num_temps);
268       state->blocks[i].defin = rzalloc_array(state->blocks, uint8_t, c->num_temps);
269       state->blocks[i].defout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
270       state->blocks[i].use = rzalloc_array(state->blocks, uint8_t, c->num_temps);
271       state->blocks[i].livein = rzalloc_array(state->blocks, uint8_t, c->num_temps);
272       state->blocks[i].liveout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
273    }
274 
275    int ip = 0;
276    nir_foreach_block(block, impl) {
277       struct ntr_live_reg_block_state *bs = &state->blocks[block->index];
278       struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
279 
280       ntr_block->start_ip = ip;
281 
282       util_dynarray_foreach(&ntr_block->insns, struct ntr_insn, insn) {
283          const struct tgsi_opcode_info *opcode_info =
284             tgsi_get_opcode_info(insn->opcode);
285 
286          /* Set up use[] for the srcs.
287           *
288           * Uses are the channels of the reg read in the block that don't have a
289           * preceding def to screen them off.  Note that we don't do per-element
290           * tracking of array regs, so they're never screened off.
291           */
292          for (int i = 0; i < opcode_info->num_src; i++) {
293             if (insn->src[i].File != TGSI_FILE_TEMPORARY)
294                continue;
295             int index = insn->src[i].Index;
296 
297             uint32_t used_mask = tgsi_util_get_src_usage_mask(insn->opcode, i,
298                                                               insn->dst->WriteMask,
299                                                               insn->src[i].SwizzleX,
300                                                               insn->src[i].SwizzleY,
301                                                               insn->src[i].SwizzleZ,
302                                                               insn->src[i].SwizzleW,
303                                                               insn->tex_target,
304                                                               insn->tex_target);
305 
306             assert(!insn->src[i].Indirect || index < c->first_non_array_temp);
307             ntr_live_reg_mark_use(c, bs, ip, index, used_mask);
308          }
309 
310          if (insn->is_tex) {
311             for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
312                if (insn->tex_offset[i].File == TGSI_FILE_TEMPORARY)
313                   ntr_live_reg_mark_use(c, bs, ip, insn->tex_offset[i].Index, 0xf);
314             }
315          }
316 
317          /* Set up def[] for the dsts.
318           *
319           * Defs are the unconditionally-written (not R/M/W) channels of the reg in
320           * the block that don't have a preceding use.
321           */
322          for (int i = 0; i < opcode_info->num_dst; i++) {
323             if (insn->dst[i].File != TGSI_FILE_TEMPORARY)
324                continue;
325             int index = insn->dst[i].Index;
326             uint32_t writemask = insn->dst[i].WriteMask;
327 
328             bs->def[index] |= writemask & ~bs->use[index];
329             bs->defout[index] |= writemask;
330 
331             assert(!insn->dst[i].Indirect || index < c->first_non_array_temp);
332             c->liveness[index].start = MIN2(c->liveness[index].start, ip);
333             c->liveness[index].end = MAX2(c->liveness[index].end, ip);
334          }
335          ip++;
336       }
337 
338       ntr_block->end_ip = ip;
339    }
340 }
341 
342 static void
343 ntr_live_regs(struct ntr_compile *c, nir_function_impl *impl)
344 {
345    nir_metadata_require(impl, nir_metadata_block_index);
346 
347    c->liveness = rzalloc_array(c, struct ntr_reg_interval, c->num_temps);
348 
349    struct ntr_live_reg_state state = {
350        .blocks = rzalloc_array(impl, struct ntr_live_reg_block_state, impl->num_blocks),
351    };
352 
353    /* The intervals start out with start > end (indicating unused) */
354    for (int i = 0; i < c->num_temps; i++)
355       c->liveness[i].start = ~0;
356 
357    ntr_live_reg_setup_def_use(c, impl, &state);
358 
359    /* Make a forward-order worklist of all the blocks. */
360    nir_block_worklist_init(&state.worklist, impl->num_blocks, NULL);
361    nir_foreach_block(block, impl) {
362       nir_block_worklist_push_tail(&state.worklist, block);
363    }
364 
365    /* Propagate defin/defout down the CFG to calculate the live variables
366     * potentially defined along any possible control flow path.  We'll use this
367     * to keep things like conditional defs of the reg (or array regs where we
368     * don't track defs!) from making the reg's live range extend back to the
369     * start of the program.
370     */
371    while (!nir_block_worklist_is_empty(&state.worklist)) {
372       nir_block *block = nir_block_worklist_pop_head(&state.worklist);
373       for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
374          nir_block *succ = block->successors[j];
375          if (!succ || succ->index == impl->num_blocks)
376             continue;
377 
378          for (int i = 0; i < c->num_temps; i++) {
379             uint8_t new_def = state.blocks[block->index].defout[i] & ~state.blocks[succ->index].defin[i];
380 
381             if (new_def) {
382                state.blocks[succ->index].defin[i] |= new_def;
383                state.blocks[succ->index].defout[i] |= new_def;
384                nir_block_worklist_push_tail(&state.worklist, succ);
385             }
386          }
387       }
388    }
389 
390    /* Make a reverse-order worklist of all the blocks. */
391    nir_foreach_block(block, impl) {
392       nir_block_worklist_push_head(&state.worklist, block);
393    }
394 
395    /* We're now ready to work through the worklist and update the liveness sets
396     * of each of the blocks.  As long as we keep the worklist up-to-date as we
397     * go, everything will get covered.
398     */
399    while (!nir_block_worklist_is_empty(&state.worklist)) {
400       /* We pop them off in the reverse order we pushed them on.  This way
401        * the first walk of the instructions is backwards so we only walk
402        * once in the case of no control flow.
403        */
404       nir_block *block = nir_block_worklist_pop_head(&state.worklist);
405       struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
406       struct ntr_live_reg_block_state *bs = &state.blocks[block->index];
407 
408       for (int i = 0; i < c->num_temps; i++) {
409          /* Collect livein from our successors to include in our liveout. */
410          for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
411             nir_block *succ = block->successors[j];
412             if (!succ || succ->index == impl->num_blocks)
413                continue;
414             struct ntr_live_reg_block_state *sbs = &state.blocks[succ->index];
415 
416             uint8_t new_liveout = sbs->livein[i] & ~bs->liveout[i];
417             if (new_liveout) {
418                if (state.blocks[block->index].defout[i])
419                   c->liveness[i].end = MAX2(c->liveness[i].end, ntr_block->end_ip);
420                bs->liveout[i] |= sbs->livein[i];
421             }
422          }
423 
424          /* Propagate use requests from either our block's uses or our
425           * non-screened-off liveout up to our predecessors.
426           */
427          uint8_t new_livein = ((bs->use[i] | (bs->liveout[i] & ~bs->def[i])) &
428                                ~bs->livein[i]);
429          if (new_livein) {
430             bs->livein[i] |= new_livein;
431             set_foreach(block->predecessors, entry) {
432                nir_block *pred = (void *)entry->key;
433                nir_block_worklist_push_tail(&state.worklist, pred);
434             }
435 
436             if (new_livein & state.blocks[block->index].defin[i])
437                c->liveness[i].start = MIN2(c->liveness[i].start, ntr_block->start_ip);
438          }
439       }
440    }
441 
442    ralloc_free(state.blocks);
443    nir_block_worklist_fini(&state.worklist);
444 }
445 
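/* At the start of a temp's live interval, allocate a real ureg temporary for it;
 * at the end, release that temporary so later temps can reuse it.  Array temps
 * (indices below first_non_array_temp) are left as-is.
 */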
446 static void
447 ntr_ra_check(struct ntr_compile *c, unsigned *ra_map, BITSET_WORD *released, int ip, unsigned index)
448 {
449    if (index < c->first_non_array_temp)
450       return;
451 
452    if (c->liveness[index].start == ip && ra_map[index] == ~0)
453       ra_map[index] = ureg_DECL_temporary(c->ureg).Index;
454 
455    if (c->liveness[index].end == ip && !BITSET_TEST(released, index)) {
456       ureg_release_temporary(c->ureg, ureg_dst_register(TGSI_FILE_TEMPORARY, ra_map[index]));
457       BITSET_SET(released, index);
458    }
459 }
460 
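/* Walks the instructions in ip order and rewrites every TEMPORARY src/dst index
 * from the virtual temp numbering to the ureg temporaries handed out by
 * ntr_ra_check() based on the computed live intervals.
 */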
461 static void
462 ntr_allocate_regs(struct ntr_compile *c, nir_function_impl *impl)
463 {
464    ntr_live_regs(c, impl);
465 
466    unsigned *ra_map = ralloc_array(c, unsigned, c->num_temps);
467    unsigned *released = rzalloc_array(c, BITSET_WORD, BITSET_WORDS(c->num_temps));
468 
469    /* No RA on NIR array regs */
470    for (int i = 0; i < c->first_non_array_temp; i++)
471       ra_map[i] = i;
472 
473    for (int i = c->first_non_array_temp; i < c->num_temps; i++)
474       ra_map[i] = ~0;
475 
476    int ip = 0;
477    nir_foreach_block(block, impl) {
478       struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
479 
480       for (int i = 0; i < c->num_temps; i++)
481          ntr_ra_check(c, ra_map, released, ip, i);
482 
483       util_dynarray_foreach(&ntr_block->insns, struct ntr_insn, insn) {
484          const struct tgsi_opcode_info *opcode_info =
485             tgsi_get_opcode_info(insn->opcode);
486 
487          for (int i = 0; i < opcode_info->num_src; i++) {
488             if (insn->src[i].File == TGSI_FILE_TEMPORARY) {
489                ntr_ra_check(c, ra_map, released, ip, insn->src[i].Index);
490                insn->src[i].Index = ra_map[insn->src[i].Index];
491             }
492          }
493 
494          if (insn->is_tex) {
495             for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
496                if (insn->tex_offset[i].File == TGSI_FILE_TEMPORARY) {
497                   ntr_ra_check(c, ra_map, released, ip, insn->tex_offset[i].Index);
498                   insn->tex_offset[i].Index = ra_map[insn->tex_offset[i].Index];
499                }
500             }
501          }
502 
503          for (int i = 0; i < opcode_info->num_dst; i++) {
504             if (insn->dst[i].File == TGSI_FILE_TEMPORARY) {
505                ntr_ra_check(c, ra_map, released, ip, insn->dst[i].Index);
506                insn->dst[i].Index = ra_map[insn->dst[i].Index];
507             }
508          }
509          ip++;
510       }
511 
512       for (int i = 0; i < c->num_temps; i++)
513          ntr_ra_check(c, ra_map, released, ip, i);
514    }
515 }
516 
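/* Unoptimized fallback: declare one ureg temporary per virtual temp, with no
 * liveness-based reuse.
 */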
517 static void
518 ntr_allocate_regs_unoptimized(struct ntr_compile *c, nir_function_impl *impl)
519 {
520    for (int i = c->first_non_array_temp; i < c->num_temps; i++)
521       ureg_DECL_temporary(c->ureg);
522 }
523 
524 /* TGSI varying declarations have a component usage mask associated (used by
525  * r600 and svga).
526  */
527 static uint32_t
528 ntr_tgsi_var_usage_mask(const struct nir_variable *var)
529 {
530    const struct glsl_type *type_without_array =
531       glsl_without_array(var->type);
532    unsigned num_components = glsl_get_vector_elements(type_without_array);
533    if (num_components == 0) /* structs */
534       num_components = 4;
535 
536    return u_bit_consecutive(var->data.location_frac, num_components);
537 }
538 
539 static struct ureg_dst
540 ntr_output_decl(struct ntr_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
541 {
542    nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
543    int base = nir_intrinsic_base(instr);
544    *frac = nir_intrinsic_component(instr);
545 
546    struct ureg_dst out;
547    if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
548       unsigned semantic_name, semantic_index;
549       tgsi_get_gl_frag_result_semantic(semantics.location,
550                                        &semantic_name, &semantic_index);
551       semantic_index += semantics.dual_source_blend_index;
552 
553       switch (semantics.location) {
554       case FRAG_RESULT_DEPTH:
555          *frac = 2; /* z write is to the .z channel in TGSI */
556          break;
557       case FRAG_RESULT_STENCIL:
558          *frac = 1;
559          break;
560       default:
561          break;
562       }
563 
564       out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
565    } else {
566       unsigned semantic_name, semantic_index;
567 
568       tgsi_get_gl_varying_semantic(semantics.location, true,
569                                    &semantic_name, &semantic_index);
570 
571       uint32_t usage_mask = u_bit_consecutive(*frac, instr->num_components);
572       uint32_t gs_streams = semantics.gs_streams;
573       for (int i = 0; i < 4; i++) {
574          if (!(usage_mask & (1 << i)))
575             gs_streams &= ~(0x3 << 2 * i);
576       }
577 
578       /* No driver appears to use array_id of outputs. */
579       unsigned array_id = 0;
580 
581       /* This bit is lost in the i/o semantics, but it's unused in in-tree
582        * drivers.
583        */
584       bool invariant = semantics.invariant;
585 
586       out = ureg_DECL_output_layout(c->ureg,
587                                     semantic_name, semantic_index,
588                                     gs_streams,
589                                     base,
590                                     usage_mask,
591                                     array_id,
592                                     semantics.num_slots,
593                                     invariant);
594    }
595 
596    unsigned write_mask;
597    if (nir_intrinsic_has_write_mask(instr))
598       write_mask = nir_intrinsic_write_mask(instr);
599    else
600       write_mask = (1 << instr->num_components) - 1;
601 
602    write_mask = write_mask << *frac;
603    return ureg_writemask(out, write_mask);
604 }
605 
606 static bool
607 ntr_try_store_in_tgsi_output_with_use(struct ntr_compile *c,
608                                       struct ureg_dst *dst,
609                                       nir_src *src)
610 {
611    *dst = ureg_dst_undef();
612 
613    if (nir_src_is_if(src))
614       return false;
615 
616    if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
617       return false;
618 
619    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(nir_src_parent_instr(src));
620    if (intr->intrinsic != nir_intrinsic_store_output ||
621        !nir_src_is_const(intr->src[1])) {
622       return false;
623    }
624 
625    uint32_t frac;
626    *dst = ntr_output_decl(c, intr, &frac);
627    dst->Index += ntr_src_as_uint(c, intr->src[1]);
628 
629    return frac == 0;
630 }
631 
632 /* If this reg is used only for storing an output, then in the simple
633  * cases we can write directly to the TGSI output instead of having
634  * store_output emit its own MOV.
635  */
636 static bool
637 ntr_try_store_reg_in_tgsi_output(struct ntr_compile *c, struct ureg_dst *dst,
638                                  nir_intrinsic_instr *reg_decl)
639 {
640    assert(reg_decl->intrinsic == nir_intrinsic_decl_reg);
641 
642    *dst = ureg_dst_undef();
643 
644    /* Look for a single use for try_store_in_tgsi_output */
645    nir_src *use = NULL;
646    nir_foreach_reg_load(src, reg_decl) {
647       nir_intrinsic_instr *load = nir_instr_as_intrinsic(nir_src_parent_instr(src));
648       nir_foreach_use_including_if(load_use, &load->def) {
649          /* We can only have one use */
650          if (use != NULL)
651             return false;
652 
653          use = load_use;
654       }
655    }
656 
657    if (use == NULL)
658       return false;
659 
660    return ntr_try_store_in_tgsi_output_with_use(c, dst, use);
661 }
662 
663 /* If this SSA def is used only for storing an output, then in the simple
664  * cases we can write directly to the TGSI output instead of having
665  * store_output emit its own MOV.
666  */
667 static bool
668 ntr_try_store_ssa_in_tgsi_output(struct ntr_compile *c, struct ureg_dst *dst,
669                                  nir_def *def)
670 {
671    *dst = ureg_dst_undef();
672 
673    if (!list_is_singular(&def->uses))
674       return false;
675 
676    nir_foreach_use_including_if(use, def) {
677       return ntr_try_store_in_tgsi_output_with_use(c, dst, use);
678    }
679    unreachable("We have one use");
680 }
681 
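/* Declares the TGSI fragment shader inputs (interpolation mode, sample location,
 * usage mask) for each NIR input variable and records them in input_index_map.
 */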
682 static void
683 ntr_setup_inputs(struct ntr_compile *c)
684 {
685    if (c->s->info.stage != MESA_SHADER_FRAGMENT)
686       return;
687 
688    unsigned num_inputs = 0;
689    int num_input_arrays = 0;
690 
691    nir_foreach_shader_in_variable(var, c->s) {
692       const struct glsl_type *type = var->type;
693       unsigned array_len =
694          glsl_count_attribute_slots(type, false);
695 
696       num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
697    }
698 
699    c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);
700 
701    nir_foreach_shader_in_variable(var, c->s) {
702       const struct glsl_type *type = var->type;
703       unsigned array_len =
704          glsl_count_attribute_slots(type, false);
705 
706       unsigned interpolation = TGSI_INTERPOLATE_CONSTANT;
707       unsigned sample_loc;
708       struct ureg_src decl;
709 
710       if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
711          interpolation =
712             tgsi_get_interp_mode(var->data.interpolation,
713                                  var->data.location == VARYING_SLOT_COL0 ||
714                                  var->data.location == VARYING_SLOT_COL1);
715 
716          if (var->data.location == VARYING_SLOT_POS)
717             interpolation = TGSI_INTERPOLATE_LINEAR;
718       }
719 
720       unsigned semantic_name, semantic_index;
721       tgsi_get_gl_varying_semantic(var->data.location, true,
722                                    &semantic_name, &semantic_index);
723 
724       if (var->data.sample) {
725          sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
726       } else if (var->data.centroid) {
727          sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
728          c->centroid_inputs |= (BITSET_MASK(array_len) <<
729                                 var->data.driver_location);
730       } else {
731          sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
732       }
733 
734       unsigned array_id = 0;
735       if (glsl_type_is_array(type))
736          array_id = ++num_input_arrays;
737 
738       uint32_t usage_mask = ntr_tgsi_var_usage_mask(var);
739 
740       decl = ureg_DECL_fs_input_centroid_layout(c->ureg,
741                                                 semantic_name,
742                                                 semantic_index,
743                                                 interpolation,
744                                                 sample_loc,
745                                                 var->data.driver_location,
746                                                 usage_mask,
747                                                 array_id, array_len);
748 
749       if (semantic_name == TGSI_SEMANTIC_FACE) {
750          struct ureg_dst temp = ntr_temp(c);
751          /* tgsi docs say that floating point FACE will be positive for
752           * frontface and negative for backface, but realistically
753           * GLSL-to-TGSI had been doing MOV_SAT to turn it into 0.0 vs 1.0.
754           * Copy that behavior, since some drivers (r300) have been doing a
755           * 0.0 vs 1.0 backface (and I don't think anybody has a non-1.0
756           * front face).
757           */
758          temp.Saturate = true;
759          ntr_MOV(c, temp, decl);
760          decl = ureg_src(temp);
761       }
762 
763       for (unsigned i = 0; i < array_len; i++) {
764          c->input_index_map[var->data.driver_location + i] = decl;
765          c->input_index_map[var->data.driver_location + i].Index += i;
766       }
767    }
768 }
769 
770 static int
771 ntr_sort_by_location(const nir_variable *a, const nir_variable *b)
772 {
773    return a->data.location - b->data.location;
774 }
775 
776 /**
777  * Workaround for virglrenderer requiring that TGSI FS output color variables
778  * are declared in order.  Besides, it's a lot nicer to read the TGSI this way.
779  */
780 static void
781 ntr_setup_outputs(struct ntr_compile *c)
782 {
783    if (c->s->info.stage != MESA_SHADER_FRAGMENT)
784       return;
785 
786    nir_sort_variables_with_modes(c->s, ntr_sort_by_location, nir_var_shader_out);
787 
788    nir_foreach_shader_out_variable(var, c->s) {
789       if (var->data.location == FRAG_RESULT_COLOR)
790          ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);
791 
792       unsigned semantic_name, semantic_index;
793       tgsi_get_gl_frag_result_semantic(var->data.location,
794                                        &semantic_name, &semantic_index);
795 
796       (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index);
797    }
798 }
799 
800 static enum tgsi_texture_type
801 tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array, bool is_shadow)
802 {
803    switch (dim) {
804    case GLSL_SAMPLER_DIM_1D:
805       if (is_shadow)
806          return is_array ? TGSI_TEXTURE_SHADOW1D_ARRAY : TGSI_TEXTURE_SHADOW1D;
807       else
808          return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
809    case GLSL_SAMPLER_DIM_2D:
810    case GLSL_SAMPLER_DIM_EXTERNAL:
811       if (is_shadow)
812          return is_array ? TGSI_TEXTURE_SHADOW2D_ARRAY : TGSI_TEXTURE_SHADOW2D;
813       else
814          return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
815    case GLSL_SAMPLER_DIM_3D:
816       return TGSI_TEXTURE_3D;
817    case GLSL_SAMPLER_DIM_CUBE:
818       if (is_shadow)
819          return is_array ? TGSI_TEXTURE_SHADOWCUBE_ARRAY : TGSI_TEXTURE_SHADOWCUBE;
820       else
821          return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
822    case GLSL_SAMPLER_DIM_RECT:
823       if (is_shadow)
824          return TGSI_TEXTURE_SHADOWRECT;
825       else
826          return TGSI_TEXTURE_RECT;
827    case GLSL_SAMPLER_DIM_MS:
828       return is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
829    case GLSL_SAMPLER_DIM_BUF:
830       return TGSI_TEXTURE_BUFFER;
831    default:
832       unreachable("unknown sampler dim");
833    }
834 }
835 
836 static enum tgsi_return_type
837 tgsi_return_type_from_base_type(enum glsl_base_type type)
838 {
839    switch (type) {
840    case GLSL_TYPE_INT:
841       return TGSI_RETURN_TYPE_SINT;
842    case GLSL_TYPE_UINT:
843       return TGSI_RETURN_TYPE_UINT;
844    case GLSL_TYPE_FLOAT:
845      return TGSI_RETURN_TYPE_FLOAT;
846    default:
847       unreachable("unexpected texture type");
848    }
849 }
850 
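/* Declares sampler views and samplers for sampler/texture uniforms, then the
 * constant buffers backing UBOs (other uniforms were already lowered to UBOs).
 */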
851 static void
852 ntr_setup_uniforms(struct ntr_compile *c)
853 {
854    nir_foreach_uniform_variable(var, c->s) {
855       if (glsl_type_is_sampler(glsl_without_array(var->type)) ||
856           glsl_type_is_texture(glsl_without_array(var->type))) {
857          /* Don't use this size for the check for samplers -- arrays of structs
858           * containing samplers should be ignored, and just the separate lowered
859           * sampler uniform decl used.
860           */
861          int size = glsl_type_get_sampler_count(var->type) +
862                     glsl_type_get_texture_count(var->type);
863 
864          const struct glsl_type *stype = glsl_without_array(var->type);
865          enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(stype),
866                                                                             glsl_sampler_type_is_array(stype),
867                                                                             glsl_sampler_type_is_shadow(stype));
868          enum tgsi_return_type ret_type = tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype));
869          for (int i = 0; i < size; i++) {
870             ureg_DECL_sampler_view(c->ureg, var->data.binding + i,
871                target, ret_type, ret_type, ret_type, ret_type);
872             ureg_DECL_sampler(c->ureg, var->data.binding + i);
873          }
874 
875       /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0
876        * size declaration happens with other UBOs below.
877        */
878       }
879    }
880 
881    c->first_ubo = ~0;
882 
883    unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0};
884    nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) {
885       int ubo = var->data.driver_location;
886       if (ubo == -1)
887          continue;
888 
889       if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo))
890          c->first_ubo = MIN2(c->first_ubo, ubo);
891 
892       unsigned size = glsl_get_explicit_size(var->interface_type, false);
893       ubo_sizes[ubo] = size;
894    }
895 
896    for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) {
897       if (ubo_sizes[i])
898          ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i);
899    }
900 }
901 
902 static void
903 ntr_setup_registers(struct ntr_compile *c)
904 {
905    assert(c->num_temps == 0);
906 
907    nir_foreach_reg_decl_safe(nir_reg, nir_shader_get_entrypoint(c->s)) {
908       /* Permanently allocate all the array regs at the start. */
909       unsigned num_array_elems = nir_intrinsic_num_array_elems(nir_reg);
910       unsigned index = nir_reg->def.index;
911 
912       if (num_array_elems != 0) {
913          struct ureg_dst decl = ureg_DECL_array_temporary(c->ureg, num_array_elems, true);
914          c->reg_temp[index] = decl;
915          assert(c->num_temps == decl.Index);
916          c->num_temps += num_array_elems;
917       }
918    }
919    c->first_non_array_temp = c->num_temps;
920 
921    /* After that, allocate non-array regs in our virtual space that we'll
922     * register-allocate before ureg emit.
923     */
924    nir_foreach_reg_decl_safe(nir_reg, nir_shader_get_entrypoint(c->s)) {
925       unsigned num_array_elems = nir_intrinsic_num_array_elems(nir_reg);
926       unsigned num_components = nir_intrinsic_num_components(nir_reg);
927       unsigned index = nir_reg->def.index;
928 
929       /* We already handled arrays */
930       if (num_array_elems == 0) {
931          struct ureg_dst decl;
932          uint32_t write_mask = BITFIELD_MASK(num_components);
933 
934          if (!ntr_try_store_reg_in_tgsi_output(c, &decl, nir_reg)) {
935             decl = ureg_writemask(ntr_temp(c), write_mask);
936          }
937          c->reg_temp[index] = decl;
938       }
939    }
940 }
941 
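/* Emits a 32-bit nir_load_const as a TGSI immediate, reinterpreting the bits as
 * floats for ureg_DECL_immediate().
 */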
942 static struct ureg_src
943 ntr_get_load_const_src(struct ntr_compile *c, nir_load_const_instr *instr)
944 {
945    int num_components = instr->def.num_components;
946 
947    float values[4];
948    assert(instr->def.bit_size == 32);
949    for (int i = 0; i < num_components; i++)
950       values[i] = uif(instr->value[i].u32);
951 
952    return ureg_DECL_immediate(c->ureg, values, num_components);
953 }
954 
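/* Loads an indirect-addressing value into one of the ADDR registers with ARL and
 * returns a scalar source usable for relative addressing.
 */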
955 static struct ureg_src
956 ntr_reladdr(struct ntr_compile *c, struct ureg_src addr, int addr_index)
957 {
958    assert(addr_index < ARRAY_SIZE(c->addr_reg));
959 
960    for (int i = 0; i <= addr_index; i++) {
961       if (!c->addr_declared[i]) {
962          c->addr_reg[i] = ureg_writemask(ureg_DECL_address(c->ureg),
963                                              TGSI_WRITEMASK_X);
964          c->addr_declared[i] = true;
965       }
966    }
967 
968    ntr_ARL(c, c->addr_reg[addr_index], addr);
969    return ureg_scalar(ureg_src(c->addr_reg[addr_index]), 0);
970 }
971 
972 /* Forward declare for recursion with indirects */
973 static struct ureg_src
974 ntr_get_src(struct ntr_compile *c, nir_src src);
975 
976 static struct ureg_src
977 ntr_get_chased_src(struct ntr_compile *c, nir_legacy_src *src)
978 {
979    if (src->is_ssa) {
980       if (src->ssa->parent_instr->type == nir_instr_type_load_const)
981          return ntr_get_load_const_src(c, nir_instr_as_load_const(src->ssa->parent_instr));
982 
983       return c->ssa_temp[src->ssa->index];
984    } else {
985       struct ureg_dst reg_temp = c->reg_temp[src->reg.handle->index];
986       reg_temp.Index += src->reg.base_offset;
987 
988       if (src->reg.indirect) {
989          struct ureg_src offset = ntr_get_src(c, nir_src_for_ssa(src->reg.indirect));
990          return ureg_src_indirect(ureg_src(reg_temp),
991                                   ntr_reladdr(c, offset, 0));
992       } else {
993          return ureg_src(reg_temp);
994       }
995    }
996 }
997 
998 static struct ureg_src
999 ntr_get_src(struct ntr_compile *c, nir_src src)
1000 {
1001    nir_legacy_src chased = nir_legacy_chase_src(&src);
1002    return ntr_get_chased_src(c, &chased);
1003 }
1004 
1005 static struct ureg_src
1006 ntr_get_alu_src(struct ntr_compile *c, nir_alu_instr *instr, int i)
1007 {
1008    /* We only support 32-bit float modifiers.  The only other modifier type
1009     * officially supported by TGSI is 32-bit integer negates, but even those are
1010     * broken on virglrenderer, so skip lowering all integer and f64 float mods.
1011     *
1012     * The options->lower_fabs requests that we not have native source modifiers
1013     * for fabs, and instead emit MAX(a,-a) for nir_op_fabs.
1014     */
1015    nir_legacy_alu_src src =
1016       nir_legacy_chase_alu_src(&instr->src[i], !c->options->lower_fabs);
1017    struct ureg_src usrc = ntr_get_chased_src(c, &src.src);
1018 
1019    usrc = ureg_swizzle(usrc,
1020                        src.swizzle[0],
1021                        src.swizzle[1],
1022                        src.swizzle[2],
1023                        src.swizzle[3]);
1024 
1025    if (src.fabs)
1026       usrc = ureg_abs(usrc);
1027    if (src.fneg)
1028       usrc = ureg_negate(usrc);
1029 
1030    return usrc;
1031 }
1032 
1033 /* Reswizzles a source so that the unset channels in the write mask still refer
1034  * to one of the channels present in the write mask.
1035  */
1036 static struct ureg_src
1037 ntr_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
1038 {
1039    assert(write_mask);
1040    int first_chan = ffs(write_mask) - 1;
1041    return ureg_swizzle(src,
1042                        (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : first_chan,
1043                        (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : first_chan,
1044                        (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : first_chan,
1045                        (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : first_chan);
1046 }
1047 
1048 static struct ureg_dst
1049 ntr_get_ssa_def_decl(struct ntr_compile *c, nir_def *ssa)
1050 {
1051    uint32_t writemask;
1052    /* Fix writemask for nir_intrinsic_load_ubo_vec4 according to uses. */
1053    if (ssa->parent_instr->type == nir_instr_type_intrinsic &&
1054        nir_instr_as_intrinsic(ssa->parent_instr)->intrinsic == nir_intrinsic_load_ubo_vec4)
1055       writemask = nir_def_components_read(ssa);
1056    else
1057       writemask = BITSET_MASK(ssa->num_components);
1058 
1059    struct ureg_dst dst;
1060    if (!ntr_try_store_ssa_in_tgsi_output(c, &dst, ssa))
1061       dst = ntr_temp(c);
1062 
1063    c->ssa_temp[ssa->index] = ntr_swizzle_for_write_mask(ureg_src(dst), writemask);
1064 
1065    return ureg_writemask(dst, writemask);
1066 }
1067 
1068 static struct ureg_dst
1069 ntr_get_chased_dest_decl(struct ntr_compile *c, nir_legacy_dest *dest)
1070 {
1071    if (dest->is_ssa)
1072       return ntr_get_ssa_def_decl(c, dest->ssa);
1073    else
1074       return c->reg_temp[dest->reg.handle->index];
1075 }
1076 
1077 static struct ureg_dst
1078 ntr_get_chased_dest(struct ntr_compile *c, nir_legacy_dest *dest)
1079 {
1080    struct ureg_dst dst = ntr_get_chased_dest_decl(c, dest);
1081 
1082    if (!dest->is_ssa) {
1083       dst.Index += dest->reg.base_offset;
1084 
1085       if (dest->reg.indirect) {
1086          struct ureg_src offset = ntr_get_src(c, nir_src_for_ssa(dest->reg.indirect));
1087          dst = ureg_dst_indirect(dst, ntr_reladdr(c, offset, 0));
1088       }
1089    }
1090 
1091    return dst;
1092 }
1093 
1094 static struct ureg_dst
1095 ntr_get_dest(struct ntr_compile *c, nir_def *def)
1096 {
1097    nir_legacy_dest chased = nir_legacy_chase_dest(def);
1098    return ntr_get_chased_dest(c, &chased);
1099 }
1100 
1101 static struct ureg_dst
1102 ntr_get_alu_dest(struct ntr_compile *c, nir_def *def)
1103 {
1104    nir_legacy_alu_dest chased = nir_legacy_chase_alu_dest(def);
1105    struct ureg_dst dst = ntr_get_chased_dest(c, &chased.dest);
1106 
1107    if (chased.fsat)
1108       dst.Saturate = true;
1109 
1110    /* Only registers get write masks */
1111    if (chased.dest.is_ssa)
1112       return dst;
1113 
1114    return ureg_writemask(dst, chased.write_mask);
1115 }
1116 
1117 /* For an SSA dest being populated by a constant src, replace the storage with
1118  * a copy of the ureg_src.
1119  */
1120 static void
1121 ntr_store_def(struct ntr_compile *c, nir_def *def, struct ureg_src src)
1122 {
1123    if (!src.Indirect && !src.DimIndirect) {
1124       switch (src.File) {
1125       case TGSI_FILE_IMMEDIATE:
1126       case TGSI_FILE_INPUT:
1127       case TGSI_FILE_CONSTANT:
1128       case TGSI_FILE_SYSTEM_VALUE:
1129          c->ssa_temp[def->index] = src;
1130          return;
1131       }
1132    }
1133 
1134    ntr_MOV(c, ntr_get_ssa_def_decl(c, def), src);
1135 }
1136 
1137 static void
1138 ntr_store(struct ntr_compile *c, nir_def *def, struct ureg_src src)
1139 {
1140    nir_legacy_dest chased = nir_legacy_chase_dest(def);
1141 
1142    if (chased.is_ssa)
1143       ntr_store_def(c, chased.ssa, src);
1144    else {
1145       struct ureg_dst dst = ntr_get_chased_dest(c, &chased);
1146       ntr_MOV(c, dst, src);
1147    }
1148 }
1149 
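/* Emits a TGSI scalar op (RCP, RSQ, EX2, ..., POW) once per channel enabled in
 * the writemask, since these opcodes replicate a single source channel.
 */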
1150 static void
1151 ntr_emit_scalar(struct ntr_compile *c, unsigned tgsi_op,
1152                 struct ureg_dst dst,
1153                 struct ureg_src src0,
1154                 struct ureg_src src1)
1155 {
1156    unsigned i;
1157 
1158    /* POW is the only 2-operand scalar op. */
1159    if (tgsi_op != TGSI_OPCODE_POW)
1160       src1 = src0;
1161 
1162    for (i = 0; i < 4; i++) {
1163       if (dst.WriteMask & (1 << i)) {
1164          ntr_insn(c, tgsi_op,
1165                   ureg_writemask(dst, 1 << i),
1166                   ureg_scalar(src0, i),
1167                   ureg_scalar(src1, i),
1168                   ureg_src_undef(), ureg_src_undef());
1169       }
1170    }
1171 }
1172 
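/* Translates a NIR ALU instruction: the table below covers ops with a direct
 * TGSI equivalent, and the switch afterwards handles modifiers, the
 * scalar-replicating ops, and CMP-based selects.
 */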
1173 static void
1174 ntr_emit_alu(struct ntr_compile *c, nir_alu_instr *instr)
1175 {
1176    struct ureg_src src[4];
1177    struct ureg_dst dst;
1178    unsigned i;
1179    int num_srcs = nir_op_infos[instr->op].num_inputs;
1180 
1181    /* Don't try to translate a folded fsat since its source won't be valid */
1182    if (instr->op == nir_op_fsat && nir_legacy_fsat_folds(instr))
1183       return;
1184 
1185    c->precise = instr->exact;
1186 
1187    assert(num_srcs <= ARRAY_SIZE(src));
1188    for (i = 0; i < num_srcs; i++)
1189       src[i] = ntr_get_alu_src(c, instr, i);
1190    for (; i < ARRAY_SIZE(src); i++)
1191       src[i] = ureg_src_undef();
1192 
1193    dst = ntr_get_alu_dest(c, &instr->def);
1194 
1195    static enum tgsi_opcode op_map[] = {
1196       [nir_op_mov] = TGSI_OPCODE_MOV,
1197 
1198       [nir_op_fdot2_replicated] = TGSI_OPCODE_DP2,
1199       [nir_op_fdot3_replicated] = TGSI_OPCODE_DP3,
1200       [nir_op_fdot4_replicated] = TGSI_OPCODE_DP4,
1201       [nir_op_ffloor] = TGSI_OPCODE_FLR,
1202       [nir_op_ffract] = TGSI_OPCODE_FRC,
1203       [nir_op_fceil] = TGSI_OPCODE_CEIL,
1204       [nir_op_fround_even] = TGSI_OPCODE_ROUND,
1205 
1206       [nir_op_slt] = TGSI_OPCODE_SLT,
1207       [nir_op_sge] = TGSI_OPCODE_SGE,
1208       [nir_op_seq] = TGSI_OPCODE_SEQ,
1209       [nir_op_sne] = TGSI_OPCODE_SNE,
1210 
1211       [nir_op_ftrunc] = TGSI_OPCODE_TRUNC,
1212       [nir_op_fddx] = TGSI_OPCODE_DDX,
1213       [nir_op_fddy] = TGSI_OPCODE_DDY,
1214       [nir_op_fddx_coarse] = TGSI_OPCODE_DDX,
1215       [nir_op_fddy_coarse] = TGSI_OPCODE_DDY,
1216       [nir_op_fadd] = TGSI_OPCODE_ADD,
1217       [nir_op_fmul] = TGSI_OPCODE_MUL,
1218 
1219       [nir_op_fmin] = TGSI_OPCODE_MIN,
1220       [nir_op_fmax] = TGSI_OPCODE_MAX,
1221       [nir_op_ffma] = TGSI_OPCODE_MAD,
1222    };
1223 
1224    if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op] > 0) {
1225       /* The normal path for NIR to TGSI ALU op translation */
1226       ntr_insn(c, op_map[instr->op],
1227                 dst, src[0], src[1], src[2], src[3]);
1228    } else {
1229       /* Special cases for NIR to TGSI ALU op translation. */
1230 
1231       /* TODO: Use something like the ntr_store() path for the MOV calls so we
1232        * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
1233        */
1234 
1235       switch (instr->op) {
1236       case nir_op_fabs:
1237          /* Try to eliminate */
1238          if (!c->options->lower_fabs && nir_legacy_float_mod_folds(instr))
1239             break;
1240 
1241          if (c->options->lower_fabs)
1242             ntr_MAX(c, dst, src[0], ureg_negate(src[0]));
1243          else
1244             ntr_MOV(c, dst, ureg_abs(src[0]));
1245          break;
1246 
1247       case nir_op_fsat:
1248          ntr_MOV(c, ureg_saturate(dst), src[0]);
1249          break;
1250 
1251       case nir_op_fneg:
1252          /* Try to eliminate */
1253          if (nir_legacy_float_mod_folds(instr))
1254             break;
1255 
1256          ntr_MOV(c, dst, ureg_negate(src[0]));
1257          break;
1258 
1259          /* NOTE: TGSI 32-bit math ops have the old "one source channel
1260           * replicated to all dst channels" behavior, while 64 is normal mapping
1261           * of src channels to dst.
1262           */
1263       case nir_op_frcp:
1264          ntr_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], ureg_src_undef());
1265          break;
1266 
1267       case nir_op_frsq:
1268          ntr_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], ureg_src_undef());
1269          break;
1270 
1271       case nir_op_fexp2:
1272          ntr_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], ureg_src_undef());
1273          break;
1274 
1275       case nir_op_flog2:
1276          ntr_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], ureg_src_undef());
1277          break;
1278 
1279       case nir_op_fsin:
1280          ntr_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], ureg_src_undef());
1281          break;
1282 
1283       case nir_op_fcos:
1284          ntr_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], ureg_src_undef());
1285          break;
1286 
1287       case nir_op_fsub:
1288          ntr_ADD(c, dst, src[0], ureg_negate(src[1]));
1289          break;
1290 
1291       case nir_op_fmod:
1292          unreachable("should be handled by .lower_fmod = true");
1293          break;
1294 
1295       case nir_op_fpow:
1296          ntr_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
1297          break;
1298 
1299       case nir_op_flrp:
1300          ntr_LRP(c, dst, src[2], src[1], src[0]);
1301          break;
1302 
1303       case nir_op_fcsel:
1304          /* If CMP isn't supported, then the flags that enable NIR to generate
1305           * this opcode should also not be set.
1306           */
1307          assert(!c->options->lower_cmp);
1308 
1309          /* Implement this as CMP(-abs(src0), src1, src2). */
1310          ntr_CMP(c, dst, ureg_negate(ureg_abs(src[0])), src[1], src[2]);
1311          break;
1312 
1313       case nir_op_fcsel_gt:
1314          /* If CMP isn't supported, then the flags that enable NIR to generate
1315           * these opcodes should also not be set.
1316           */
1317          assert(!c->options->lower_cmp);
1318 
1319          ntr_CMP(c, dst, ureg_negate(src[0]), src[1], src[2]);
1320          break;
1321 
1322       case nir_op_fcsel_ge:
1323          /* If CMP isn't supported, then the flags that enable NIR to generate
1324           * these opcodes should also not be set.
1325           */
1326          assert(!c->options->lower_cmp);
1327 
1328          /* Implement this as if !(src0 < 0.0) was identical to src0 >= 0.0. */
1329          ntr_CMP(c, dst, src[0], src[2], src[1]);
1330          break;
1331 
1332       case nir_op_vec4:
1333       case nir_op_vec3:
1334       case nir_op_vec2:
1335          unreachable("covered by nir_lower_vec_to_movs()");
1336 
1337       default:
1338          fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
1339          unreachable("Unknown NIR opcode");
1340       }
1341    }
1342 
1343    c->precise = false;
1344 }
1345 
1346 static struct ureg_src
1347 ntr_ureg_src_indirect(struct ntr_compile *c, struct ureg_src usrc,
1348                       nir_src src, int addr_reg)
1349 {
1350    if (nir_src_is_const(src)) {
1351       usrc.Index += ntr_src_as_uint(c, src);
1352       return usrc;
1353    } else {
1354       return ureg_src_indirect(usrc, ntr_reladdr(c, ntr_get_src(c, src), addr_reg));
1355    }
1356 }
1357 
1358 static struct ureg_dst
1359 ntr_ureg_dst_indirect(struct ntr_compile *c, struct ureg_dst dst,
1360                       nir_src src)
1361 {
1362    if (nir_src_is_const(src)) {
1363       dst.Index += ntr_src_as_uint(c, src);
1364       return dst;
1365    } else {
1366       return ureg_dst_indirect(dst, ntr_reladdr(c, ntr_get_src(c, src), 0));
1367    }
1368 }
1369 
1370 static struct ureg_dst
1371 ntr_ureg_dst_dimension_indirect(struct ntr_compile *c, struct ureg_dst udst,
1372                                 nir_src src)
1373 {
1374    if (nir_src_is_const(src)) {
1375       return ureg_dst_dimension(udst, ntr_src_as_uint(c, src));
1376    } else {
1377       return ureg_dst_dimension_indirect(udst,
1378                                          ntr_reladdr(c, ntr_get_src(c, src), 1),
1379                                          0);
1380    }
1381 }
1382 /* Some load operations in NIR will have a fractional offset that we need to
1383  * swizzle down before storing to the result register.
1384  */
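/* For example, loading 2 components starting at component 2 (frac = 2) of a
 * vec4 source yields the swizzle .zwww, so the data lands in .xy of the
 * destination.
 */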
1385 static struct ureg_src
1386 ntr_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
1387 {
1388    return ureg_swizzle(src,
1389                        frac,
1390                        frac + MIN2(num_components - 1, 1),
1391                        frac + MIN2(num_components - 1, 2),
1392                        frac + MIN2(num_components - 1, 3));
1393 }
1394 
1395 
1396 static void
1397 ntr_emit_load_ubo(struct ntr_compile *c, nir_intrinsic_instr *instr)
1398 {
1399    struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);
1400 
1401    struct ureg_dst addr_temp = ureg_dst_undef();
1402 
1403    if (nir_src_is_const(instr->src[0])) {
1404       src = ureg_src_dimension(src, ntr_src_as_uint(c, instr->src[0]));
1405    } else {
1406       /* virglrenderer requires that indirect UBO references have the UBO
1407        * array's base index in the Index field, not added to the indirect
1408        * address.
1409        *
1410        * Many nir intrinsics have a base address const value for the start of
1411        * their array indirection, but load_ubo doesn't.  We fake it by
1412        * subtracting it off here.
1413        */
1414       addr_temp = ntr_temp(c);
1415       ntr_UADD(c, addr_temp, ntr_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo));
1416       src = ureg_src_dimension_indirect(src,
1417                                          ntr_reladdr(c, ureg_src(addr_temp), 1),
1418                                          c->first_ubo);
1419    }
1420 
1421    /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const
1422     * file.
1423     */
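   /* With a constant block index and vec4 offset this ends up as a plain
    * two-dimensional constant reference in the TGSI dump (roughly
    * CONST[block][vec4_offset]); dynamic offsets go through an ADDR register
    * instead.
    */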
1424    src.Index = nir_intrinsic_base(instr);
1425 
1426    if (nir_src_is_const(instr->src[1])) {
1427       src.Index += ntr_src_as_uint(c, instr->src[1]);
1428    } else {
1429       src = ureg_src_indirect(src, ntr_reladdr(c, ntr_get_src(c, instr->src[1]), 0));
1430    }
1431 
1432    int start_component = nir_intrinsic_component(instr);
1433 
1434    src = ntr_shift_by_frac(src, start_component, instr->num_components);
1435 
1436    ntr_store(c, &instr->def, src);
1437 }
1438 
1439 static void
1440 ntr_emit_load_input(struct ntr_compile *c, nir_intrinsic_instr *instr)
1441 {
1442    uint32_t frac = nir_intrinsic_component(instr);
1443    uint32_t num_components = instr->num_components;
1444    unsigned base = nir_intrinsic_base(instr);
1445    struct ureg_src input;
1446    nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
1447 
1448    if (c->s->info.stage == MESA_SHADER_VERTEX) {
1449       input = ureg_DECL_vs_input(c->ureg, base);
1450       for (int i = 1; i < semantics.num_slots; i++)
1451          ureg_DECL_vs_input(c->ureg, base + i);
1452    } else {
1453       input = c->input_index_map[base];
1454    }
1455 
1456    input = ntr_shift_by_frac(input, frac, num_components);
1457 
1458    switch (instr->intrinsic) {
1459    case nir_intrinsic_load_input:
1460       input = ntr_ureg_src_indirect(c, input, instr->src[0], 0);
1461       ntr_store(c, &instr->def, input);
1462       break;
1463 
1464    case nir_intrinsic_load_interpolated_input: {
1465       input = ntr_ureg_src_indirect(c, input, instr->src[1], 0);
1466 
1467       nir_intrinsic_instr *bary_instr =
1468          nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
1469 
1470       switch (bary_instr->intrinsic) {
1471       case nir_intrinsic_load_barycentric_pixel:
1472       case nir_intrinsic_load_barycentric_sample:
1473          /* For these, we know that the barycentric load matches the
1474           * interpolation on the input declaration, so we can use it directly.
1475           */
1476          ntr_store(c, &instr->def, input);
1477          break;
1478 
1479       case nir_intrinsic_load_barycentric_centroid:
1480          /* If the input was declared centroid, then there's no need to
1481           * emit the extra TGSI interp instruction, we can just read the
1482           * input.
1483           */
1484          if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) {
1485             ntr_store(c, &instr->def, input);
1486          } else {
1487             ntr_INTERP_CENTROID(c, ntr_get_dest(c, &instr->def), input);
1488          }
1489          break;
1490 
1491       case nir_intrinsic_load_barycentric_at_sample:
1492          /* We stored the sample in the fake "bary" dest. */
1493          ntr_INTERP_SAMPLE(c, ntr_get_dest(c, &instr->def), input,
1494                             ntr_get_src(c, instr->src[0]));
1495          break;
1496 
1497       case nir_intrinsic_load_barycentric_at_offset:
1498          /* We stored the offset in the fake "bary" dest. */
1499          ntr_INTERP_OFFSET(c, ntr_get_dest(c, &instr->def), input,
1500                             ntr_get_src(c, instr->src[0]));
1501          break;
1502 
1503       default:
1504          unreachable("bad barycentric interp intrinsic\n");
1505       }
1506       break;
1507    }
1508 
1509    default:
1510       unreachable("bad load input intrinsic\n");
1511    }
1512 }
1513 
1514 static void
1515 ntr_emit_store_output(struct ntr_compile *c, nir_intrinsic_instr *instr)
1516 {
1517    struct ureg_src src = ntr_get_src(c, instr->src[0]);
1518 
1519    if (src.File == TGSI_FILE_OUTPUT) {
1520       /* If our src is the output file, that's an indication that we were able
1521        * to emit the output stores in the generating instructions and we have
1522        * nothing to do here.
1523        */
1524       return;
1525    }
1526 
1527    uint32_t frac;
1528    struct ureg_dst out = ntr_output_decl(c, instr, &frac);
1529 
1530    if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
1531       out = ntr_ureg_dst_indirect(c, out, instr->src[2]);
1532       out = ntr_ureg_dst_dimension_indirect(c, out, instr->src[1]);
1533    } else {
1534       out = ntr_ureg_dst_indirect(c, out, instr->src[1]);
1535    }
1536 
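   /* Build a swizzle that lines the source components up with the frac-based
    * output channels: e.g. for frac = 2 and a .zw writemask this produces
    * .xxxy, so src.x and src.y feed out.z and out.w.
    */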
1537    uint8_t swizzle[4] = { 0, 0, 0, 0 };
1538    for (int i = frac; i < 4; i++) {
1539       if (out.WriteMask & (1 << i))
1540          swizzle[i] = i - frac;
1541    }
1542 
1543    src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1544 
1545    ntr_MOV(c, out, src);
1546 }
1547 
1548 static void
1549 ntr_emit_load_output(struct ntr_compile *c, nir_intrinsic_instr *instr)
1550 {
1551    nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
1552 
1553    /* ntr_try_store_in_tgsi_output() optimization is not valid if normal
1554     * load_output is present.
1555     */
1556    assert(c->s->info.stage != MESA_SHADER_VERTEX &&
1557           (c->s->info.stage != MESA_SHADER_FRAGMENT || semantics.fb_fetch_output));
1558 
1559    uint32_t frac;
1560    struct ureg_dst out = ntr_output_decl(c, instr, &frac);
1561 
1562    if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) {
1563       out = ntr_ureg_dst_indirect(c, out, instr->src[1]);
1564       out = ntr_ureg_dst_dimension_indirect(c, out, instr->src[0]);
1565    } else {
1566       out = ntr_ureg_dst_indirect(c, out, instr->src[0]);
1567    }
1568 
1569    struct ureg_dst dst = ntr_get_dest(c, &instr->def);
1570    struct ureg_src out_src = ureg_src(out);
1571 
1572    /* Don't swizzle in unavailable channels of the output for the writemasked-out
1573     * components. Avoids compile failures in virglrenderer with
1574     * TESS_LEVEL_INNER.
1575     */
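   /* For example, a single-channel .x destination gives fill_channel = 0 and a
    * .xxxx swizzle, so no other output channel is ever referenced.
    */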
1576    int fill_channel = ffs(dst.WriteMask) - 1;
1577    uint8_t swizzles[4] = { 0, 1, 2, 3 };
1578    for (int i = 0; i < 4; i++)
1579       if (!(dst.WriteMask & (1 << i)))
1580          swizzles[i] = fill_channel;
1581    out_src = ureg_swizzle(out_src, swizzles[0], swizzles[1], swizzles[2], swizzles[3]);
1582 
1583    if (semantics.fb_fetch_output)
1584       ntr_FBFETCH(c, dst, out_src);
1585    else
1586       ntr_MOV(c, dst, out_src);
1587 }
1588 
1589 static void
1590 ntr_emit_load_sysval(struct ntr_compile *c, nir_intrinsic_instr *instr)
1591 {
1592    gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
1593    enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
1594    struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0);
1595 
1596    /* virglrenderer doesn't like references to channels of the sysval that
1597     * aren't defined, even if they aren't really read.  (GLSL compile fails on
1598     * gl_NumWorkGroups.w, for example).
1599     */
1600    uint32_t write_mask = BITSET_MASK(instr->def.num_components);
1601    sv = ntr_swizzle_for_write_mask(sv, write_mask);
1602 
1603    /* TGSI and NIR define these intrinsics as always loading ints, but they can
1604     * still appear on hardware with non-native-integers fragment shaders using
1605     * the draw path (i915g).  In that case, having called nir_lower_int_to_float
1606     * means that we actually want floats instead.
1607     */
1608    switch (instr->intrinsic) {
1609    case nir_intrinsic_load_vertex_id:
1610    case nir_intrinsic_load_instance_id:
1611       ntr_U2F(c, ntr_get_dest(c, &instr->def), sv);
1612       return;
1613 
1614    default:
1615       break;
1616    }
1617 
1618    ntr_store(c, &instr->def, sv);
1619 }
1620 
1621 static void
1622 ntr_emit_intrinsic(struct ntr_compile *c, nir_intrinsic_instr *instr)
1623 {
1624    switch (instr->intrinsic) {
1625    case nir_intrinsic_load_ubo:
1626    case nir_intrinsic_load_ubo_vec4:
1627       ntr_emit_load_ubo(c, instr);
1628       break;
1629 
1630       /* System values */
1631    case nir_intrinsic_load_draw_id:
1632    case nir_intrinsic_load_invocation_id:
1633    case nir_intrinsic_load_frag_coord:
1634    case nir_intrinsic_load_point_coord:
1635    case nir_intrinsic_load_front_face:
1636       ntr_emit_load_sysval(c, instr);
1637       break;
1638 
1639    case nir_intrinsic_load_input:
1640    case nir_intrinsic_load_per_vertex_input:
1641    case nir_intrinsic_load_interpolated_input:
1642       ntr_emit_load_input(c, instr);
1643       break;
1644 
1645    case nir_intrinsic_store_output:
1646    case nir_intrinsic_store_per_vertex_output:
1647       ntr_emit_store_output(c, instr);
1648       break;
1649 
1650    case nir_intrinsic_load_output:
1651    case nir_intrinsic_load_per_vertex_output:
1652       ntr_emit_load_output(c, instr);
1653       break;
1654 
1655    case nir_intrinsic_discard:
1656       ntr_KILL(c);
1657       break;
1658 
1659    case nir_intrinsic_discard_if: {
1660       struct ureg_src cond = ureg_scalar(ntr_get_src(c, instr->src[0]), 0);
1661       /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
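      /* TGSI KILL_IF discards when any source channel is negative, so negating
       * the 1.0/0.0 condition kills exactly when the original bool was true.
       */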
1662       ntr_KILL_IF(c, ureg_negate(cond));
1663       break;
1664    }
1665       /* In TGSI we don't actually generate the barycentric coords, and emit
1666        * interp intrinsics later.  However, we do need to store the
1667        * load_barycentric_at_* argument so that we can use it at that point.
1668        */
1669    case nir_intrinsic_load_barycentric_pixel:
1670    case nir_intrinsic_load_barycentric_centroid:
1671    case nir_intrinsic_load_barycentric_sample:
1672       break;
1673    case nir_intrinsic_load_barycentric_at_sample:
1674    case nir_intrinsic_load_barycentric_at_offset:
1675       ntr_store(c, &instr->def, ntr_get_src(c, instr->src[0]));
1676       break;
1677 
1678    case nir_intrinsic_decl_reg:
1679    case nir_intrinsic_load_reg:
1680    case nir_intrinsic_load_reg_indirect:
1681    case nir_intrinsic_store_reg:
1682    case nir_intrinsic_store_reg_indirect:
1683       /* fully consumed */
1684       break;
1685 
1686    default:
1687       fprintf(stderr, "Unknown intrinsic: ");
1688       nir_print_instr(&instr->instr, stderr);
1689       fprintf(stderr, "\n");
1690       break;
1691    }
1692 }
1693 
1694 struct ntr_tex_operand_state {
1695    struct ureg_src srcs[4];
1696    unsigned i;
1697 };
1698 
1699 static void
1700 ntr_push_tex_arg(struct ntr_compile *c,
1701                  nir_tex_instr *instr,
1702                  nir_tex_src_type tex_src_type,
1703                  struct ntr_tex_operand_state *s)
1704 {
1705    int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
1706    if (tex_src < 0)
1707       return;
1708 
1709    nir_src *src = &instr->src[tex_src].src;
1710    s->srcs[s->i++] = ntr_get_src(c, *src);
1711 }
1712 
1713 static void
1714 ntr_emit_texture(struct ntr_compile *c, nir_tex_instr *instr)
1715 {
1716    struct ureg_dst dst = ntr_get_dest(c, &instr->def);
1717    enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array, instr->is_shadow);
1718    unsigned tex_opcode;
1719 
1720    int tex_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_texture_handle);
1721    int sampler_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_handle);
1722 
1723    struct ureg_src sampler;
1724    if (tex_handle_src >= 0 && sampler_handle_src >= 0) {
1725       /* It seems we can't get separate tex/sampler on GL, just use one of the handles */
1726       sampler = ntr_get_src(c, instr->src[tex_handle_src].src);
1727       assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
1728    } else {
1729       assert(tex_handle_src == -1 && sampler_handle_src == -1);
1730       sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
1731       int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
1732       if (sampler_src >= 0) {
1733          struct ureg_src reladdr = ntr_get_src(c, instr->src[sampler_src].src);
1734          sampler = ureg_src_indirect(sampler, ntr_reladdr(c, reladdr, 2));
1735       }
1736    }
1737 
1738    switch (instr->op) {
1739    case nir_texop_tex:
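      /* nir_to_rc_lower_tex_instr() packed the coordinate (and any shadow
       * comparator/projector) into the backend1 source; if it holds more
       * components than the plain coordinate plus comparator needs, a
       * projector was packed in, so use TXP.
       */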
1740       if (nir_tex_instr_src_size(instr, nir_tex_instr_src_index(instr, nir_tex_src_backend1)) >
1741          MAX2(instr->coord_components, 2) + instr->is_shadow)
1742          tex_opcode = TGSI_OPCODE_TXP;
1743       else
1744          tex_opcode = TGSI_OPCODE_TEX;
1745       break;
1746    case nir_texop_txl:
1747       tex_opcode = TGSI_OPCODE_TXL;
1748       break;
1749    case nir_texop_txb:
1750       tex_opcode = TGSI_OPCODE_TXB;
1751       break;
1752    case nir_texop_txd:
1753       tex_opcode = TGSI_OPCODE_TXD;
1754       break;
1755    case nir_texop_txs:
1756       tex_opcode = TGSI_OPCODE_TXQ;
1757       break;
1758    case nir_texop_tg4:
1759       tex_opcode = TGSI_OPCODE_TG4;
1760       break;
1761    case nir_texop_query_levels:
1762       tex_opcode = TGSI_OPCODE_TXQ;
1763       break;
1764    case nir_texop_lod:
1765       tex_opcode = TGSI_OPCODE_LODQ;
1766       break;
1767    case nir_texop_texture_samples:
1768       tex_opcode = TGSI_OPCODE_TXQS;
1769       break;
1770    default:
1771       unreachable("unsupported tex op");
1772    }
1773 
1774    struct ntr_tex_operand_state s = { .i = 0 };
1775    ntr_push_tex_arg(c, instr, nir_tex_src_backend1, &s);
1776    ntr_push_tex_arg(c, instr, nir_tex_src_backend2, &s);
1777 
1778    /* non-coord arg for TXQ */
1779    if (tex_opcode == TGSI_OPCODE_TXQ) {
1780       ntr_push_tex_arg(c, instr, nir_tex_src_lod, &s);
1781       /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's
1782        * scalar
1783        */
1784       s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0);
1785    }
1786 
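   /* A second packed source means the operands spilled into backend2, which
    * requires the two-source forms of these opcodes.
    */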
1787    if (s.i > 1) {
1788       if (tex_opcode == TGSI_OPCODE_TEX)
1789          tex_opcode = TGSI_OPCODE_TEX2;
1790       if (tex_opcode == TGSI_OPCODE_TXB)
1791          tex_opcode = TGSI_OPCODE_TXB2;
1792       if (tex_opcode == TGSI_OPCODE_TXL)
1793          tex_opcode = TGSI_OPCODE_TXL2;
1794    }
1795 
1796    if (instr->op == nir_texop_txd) {
1797       /* Derivs appear in their own src args */
1798       int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
1799       int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
1800       s.srcs[s.i++] = ntr_get_src(c, instr->src[ddx].src);
1801       s.srcs[s.i++] = ntr_get_src(c, instr->src[ddy].src);
1802    }
1803 
1804    if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
1805       if (c->screen->get_param(c->screen,
1806                                PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) {
1807          sampler = ureg_scalar(sampler, instr->component);
1808          s.srcs[s.i++] = ureg_src_undef();
1809       } else {
1810          s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
1811       }
1812    }
1813 
1814    s.srcs[s.i++] = sampler;
1815 
1816    enum tgsi_return_type tex_type;
1817    switch (instr->dest_type) {
1818    case nir_type_float32:
1819       tex_type = TGSI_RETURN_TYPE_FLOAT;
1820       break;
1821    case nir_type_int32:
1822       tex_type = TGSI_RETURN_TYPE_SINT;
1823       break;
1824    case nir_type_uint32:
1825       tex_type = TGSI_RETURN_TYPE_UINT;
1826       break;
1827    default:
1828       unreachable("unknown texture type");
1829    }
1830 
1831    struct ureg_dst tex_dst;
1832    if (instr->op == nir_texop_query_levels)
1833       tex_dst = ureg_writemask(ntr_temp(c), TGSI_WRITEMASK_W);
1834    else
1835       tex_dst = dst;
1836 
1837    while (s.i < 4)
1838       s.srcs[s.i++] = ureg_src_undef();
1839 
1840    struct ntr_insn *insn = ntr_insn(c, tex_opcode, tex_dst, s.srcs[0], s.srcs[1], s.srcs[2], s.srcs[3]);
1841    insn->tex_target = target;
1842    insn->tex_return_type = tex_type;
1843    insn->is_tex = true;
1844 
1845    int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
1846    if (tex_offset_src >= 0) {
1847       struct ureg_src offset = ntr_get_src(c, instr->src[tex_offset_src].src);
1848 
1849       insn->tex_offset[0].File = offset.File;
1850       insn->tex_offset[0].Index = offset.Index;
1851       insn->tex_offset[0].SwizzleX = offset.SwizzleX;
1852       insn->tex_offset[0].SwizzleY = offset.SwizzleY;
1853       insn->tex_offset[0].SwizzleZ = offset.SwizzleZ;
1854       insn->tex_offset[0].Padding = 0;
1855    }
1856 
1857    if (nir_tex_instr_has_explicit_tg4_offsets(instr)) {
1858       for (uint8_t i = 0; i < 4; ++i) {
1859          struct ureg_src imm = ureg_imm2i(c->ureg, instr->tg4_offsets[i][0], instr->tg4_offsets[i][1]);
1860          insn->tex_offset[i].File = imm.File;
1861          insn->tex_offset[i].Index = imm.Index;
1862          insn->tex_offset[i].SwizzleX = imm.SwizzleX;
1863          insn->tex_offset[i].SwizzleY = imm.SwizzleY;
1864          insn->tex_offset[i].SwizzleZ = imm.SwizzleZ;
1865       }
1866    }
1867 
1868    if (instr->op == nir_texop_query_levels)
1869       ntr_MOV(c, dst, ureg_scalar(ureg_src(tex_dst), 3));
1870 }
1871 
1872 static void
1873 ntr_emit_jump(struct ntr_compile *c, nir_jump_instr *jump)
1874 {
1875    switch (jump->type) {
1876    case nir_jump_break:
1877       ntr_BRK(c);
1878       break;
1879 
1880    case nir_jump_continue:
1881       ntr_CONT(c);
1882       break;
1883 
1884    default:
1885       fprintf(stderr, "Unknown jump instruction: ");
1886       nir_print_instr(&jump->instr, stderr);
1887       fprintf(stderr, "\n");
1888       abort();
1889    }
1890 }
1891 
1892 static void
1893 ntr_emit_ssa_undef(struct ntr_compile *c, nir_undef_instr *instr)
1894 {
1895    /* Nothing to do but make sure that we have some storage to deref. */
1896    (void)ntr_get_ssa_def_decl(c, &instr->def);
1897 }
1898 
1899 static void
1900 ntr_emit_instr(struct ntr_compile *c, nir_instr *instr)
1901 {
1902    switch (instr->type) {
1903    case nir_instr_type_deref:
1904       /* ignored, will be walked by nir_intrinsic_image_*_deref. */
1905       break;
1906 
1907    case nir_instr_type_alu:
1908       ntr_emit_alu(c, nir_instr_as_alu(instr));
1909       break;
1910 
1911    case nir_instr_type_intrinsic:
1912       ntr_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
1913       break;
1914 
1915    case nir_instr_type_load_const:
1916       /* Nothing to do here, as load consts are done directly from
1917        * ntr_get_src() (since many constant NIR srcs will often get folded
1918        * directly into a register file index instead of as a TGSI src).
1919        */
1920       break;
1921 
1922    case nir_instr_type_tex:
1923       ntr_emit_texture(c, nir_instr_as_tex(instr));
1924       break;
1925 
1926    case nir_instr_type_jump:
1927       ntr_emit_jump(c, nir_instr_as_jump(instr));
1928       break;
1929 
1930    case nir_instr_type_undef:
1931       ntr_emit_ssa_undef(c, nir_instr_as_undef(instr));
1932       break;
1933 
1934    default:
1935       fprintf(stderr, "Unknown NIR instr type: ");
1936       nir_print_instr(instr, stderr);
1937       fprintf(stderr, "\n");
1938       abort();
1939    }
1940 }
1941 
1942 static void
1943 ntr_emit_if(struct ntr_compile *c, nir_if *if_stmt)
1944 {
1945    ntr_IF(c, c->if_cond);
1946 
1947    ntr_emit_cf_list(c, &if_stmt->then_list);
1948 
1949    if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
1950       ntr_ELSE(c);
1951       ntr_emit_cf_list(c, &if_stmt->else_list);
1952    }
1953 
1954    ntr_ENDIF(c);
1955 }
1956 
1957 static void
1958 ntr_emit_loop(struct ntr_compile *c, nir_loop *loop)
1959 {
1960    assert(!nir_loop_has_continue_construct(loop));
1961    ntr_BGNLOOP(c);
1962    ntr_emit_cf_list(c, &loop->body);
1963    ntr_ENDLOOP(c);
1964 }
1965 
1966 static void
1967 ntr_emit_block(struct ntr_compile *c, nir_block *block)
1968 {
1969    struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
1970    c->cur_block = ntr_block;
1971 
1972    nir_foreach_instr(instr, block) {
1973       ntr_emit_instr(c, instr);
1974 
1975       /* Sanity check that we didn't accidentally ureg_OPCODE() instead of ntr_OPCODE(). */
1976       if (ureg_get_instruction_number(c->ureg) != 0) {
1977          fprintf(stderr, "Emitted ureg insn during: ");
1978          nir_print_instr(instr, stderr);
1979          fprintf(stderr, "\n");
1980          unreachable("emitted ureg insn");
1981       }
1982    }
1983 
1984    /* Set up the if condition for ntr_emit_if(), which we have to do before
1985     * freeing up the temps (the "if" is treated as inside the block for liveness
1986     * purposes, despite not being an instruction)
1987     *
1988     * Note that, while IF and UIF are supposed to look at only .x, virglrenderer
1989     * looks at all of .xyzw.  No harm in working around the bug.
1990     */
1991    nir_if *nif = nir_block_get_following_if(block);
1992    if (nif)
1993       c->if_cond = ureg_scalar(ntr_get_src(c, nif->condition), TGSI_SWIZZLE_X);
1994 }
1995 
1996 static void
1997 ntr_emit_cf_list(struct ntr_compile *c, struct exec_list *list)
1998 {
1999    foreach_list_typed(nir_cf_node, node, node, list) {
2000       switch (node->type) {
2001       case nir_cf_node_block:
2002          ntr_emit_block(c, nir_cf_node_as_block(node));
2003          break;
2004 
2005       case nir_cf_node_if:
2006          ntr_emit_if(c, nir_cf_node_as_if(node));
2007          break;
2008 
2009       case nir_cf_node_loop:
2010          ntr_emit_loop(c, nir_cf_node_as_loop(node));
2011          break;
2012 
2013       default:
2014          unreachable("unknown CF type");
2015       }
2016    }
2017 }
2018 
2019 static void
2020 ntr_emit_block_ureg(struct ntr_compile *c, struct nir_block *block)
2021 {
2022    struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
2023 
2024    /* Emit the ntr insns to tgsi_ureg. */
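   /* This second pass runs after register allocation, once ureg instruction
    * numbers are known, which is what lets the IF/ELSE label fixups below
    * point at the right instructions.
    */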
2025    util_dynarray_foreach(&ntr_block->insns, struct ntr_insn, insn) {
2026       const struct tgsi_opcode_info *opcode_info =
2027          tgsi_get_opcode_info(insn->opcode);
2028 
2029       switch (insn->opcode) {
2030       case TGSI_OPCODE_IF:
2031          ureg_IF(c->ureg, insn->src[0], &c->cf_label);
2032          break;
2033 
2034       case TGSI_OPCODE_ELSE:
2035          ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
2036          ureg_ELSE(c->ureg, &c->cf_label);
2037          c->current_if_else = c->cf_label;
2038          break;
2039 
2040       case TGSI_OPCODE_ENDIF:
2041          ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
2042          ureg_ENDIF(c->ureg);
2043          break;
2044 
2045       case TGSI_OPCODE_BGNLOOP:
2046          /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
2047           * does reference BGNLOOP's.  Follow the former behavior unless something comes up
2048           * with a need.
2049           */
2050          ureg_BGNLOOP(c->ureg, &c->cf_label);
2051          break;
2052 
2053       case TGSI_OPCODE_ENDLOOP:
2054          ureg_ENDLOOP(c->ureg, &c->cf_label);
2055          break;
2056 
2057       default:
2058          if (insn->is_tex) {
2059             int num_offsets = 0;
2060             for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
2061                if (insn->tex_offset[i].File != TGSI_FILE_NULL)
2062                   num_offsets = i + 1;
2063             }
2064             ureg_tex_insn(c->ureg, insn->opcode,
2065                           insn->dst, opcode_info->num_dst,
2066                           insn->tex_target, insn->tex_return_type,
2067                           insn->tex_offset,
2068                           num_offsets,
2069                           insn->src, opcode_info->num_src);
2070          } else {
2071             ureg_insn(c->ureg, insn->opcode,
2072                      insn->dst, opcode_info->num_dst,
2073                      insn->src, opcode_info->num_src,
2074                      insn->precise);
2075          }
2076       }
2077    }
2078 }
2079 
2080 static void
2081 ntr_emit_if_ureg(struct ntr_compile *c, nir_if *if_stmt)
2082 {
2083    /* Note: the last block emitted our IF opcode. */
2084 
2085    int if_stack = c->current_if_else;
2086    c->current_if_else = c->cf_label;
2087 
2088    /* Either the then or else block includes the ENDIF, which will fix up the
2089     * IF(/ELSE)'s label for jumping
2090     */
2091    ntr_emit_cf_list_ureg(c, &if_stmt->then_list);
2092    ntr_emit_cf_list_ureg(c, &if_stmt->else_list);
2093 
2094    c->current_if_else = if_stack;
2095 }
2096 
2097 static void
2098 ntr_emit_cf_list_ureg(struct ntr_compile *c, struct exec_list *list)
2099 {
2100    foreach_list_typed(nir_cf_node, node, node, list) {
2101       switch (node->type) {
2102       case nir_cf_node_block:
2103          ntr_emit_block_ureg(c, nir_cf_node_as_block(node));
2104          break;
2105 
2106       case nir_cf_node_if:
2107          ntr_emit_if_ureg(c, nir_cf_node_as_if(node));
2108          break;
2109 
2110       case nir_cf_node_loop:
2111          /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
2112           * does reference BGNLOOP's.  Follow the former behavior unless something comes up
2113           * with a need.
2114           */
2115          ntr_emit_cf_list_ureg(c, &nir_cf_node_as_loop(node)->body);
2116          break;
2117 
2118       default:
2119          unreachable("unknown CF type");
2120       }
2121    }
2122 }
2123 
2124 static void
2125 ntr_emit_impl(struct ntr_compile *c, nir_function_impl *impl)
2126 {
2127    c->impl = impl;
2128 
2129    c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc);
2130    c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->ssa_alloc);
2131 
2132    /* Set up the struct ntr_blocks to put insns in */
2133    c->blocks = _mesa_pointer_hash_table_create(c);
2134    nir_foreach_block(block, impl) {
2135       struct ntr_block *ntr_block = rzalloc(c->blocks, struct ntr_block);
2136       util_dynarray_init(&ntr_block->insns, ntr_block);
2137       _mesa_hash_table_insert(c->blocks, block, ntr_block);
2138    }
2139 
2140 
2141    ntr_setup_registers(c);
2142 
2143    c->cur_block = ntr_block_from_nir(c, nir_start_block(impl));
2144    ntr_setup_inputs(c);
2145    ntr_setup_outputs(c);
2146    ntr_setup_uniforms(c);
2147 
2148    /* Emit the ntr insns */
2149    ntr_emit_cf_list(c, &impl->body);
2150 
2151    /* Don't do optimized RA if the driver requests to skip it, unless the
2152     * number of temps is too large to fit in the 16-bit signed int that TGSI
2153     * allocates for the register index. */
2154    if (!c->options->unoptimized_ra || c->num_temps > 0x7fff)
2155       ntr_allocate_regs(c, impl);
2156    else
2157       ntr_allocate_regs_unoptimized(c, impl);
2158 
2159    /* Turn the ntr insns into actual TGSI tokens */
2160    ntr_emit_cf_list_ureg(c, &impl->body);
2161 
2162    ralloc_free(c->liveness);
2163    c->liveness = NULL;
2164 
2165 }
2166 
2167 static int
2168 type_size(const struct glsl_type *type, bool bindless)
2169 {
2170    return glsl_count_attribute_slots(type, false);
2171 }
2172 
2173 /* Allow vectorizing of ALU instructions.
2174  */
2175 static uint8_t
2176 ntr_should_vectorize_instr(const nir_instr *instr, const void *data)
2177 {
2178    if (instr->type != nir_instr_type_alu)
2179       return 0;
2180 
2181    return 4;
2182 }
2183 
2184 static bool
2185 ntr_should_vectorize_io(unsigned align, unsigned bit_size,
2186                         unsigned num_components, unsigned high_offset,
2187                         nir_intrinsic_instr *low, nir_intrinsic_instr *high,
2188                         void *data)
2189 {
2190    if (bit_size != 32)
2191       return false;
2192 
2193    /* Our offset alignment should always be at least 4 bytes */
2194    if (align < 4)
2195       return false;
2196 
2197    /* No wrapping off the end of a TGSI reg.  We could do a bit better by
2198     * looking at low's actual offset.  XXX: With LOAD_CONSTBUF maybe we don't
2199     * need this restriction.
2200     */
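   /* e.g. align == 4 means the load could start at any component (worst case
    * .w), so only a single component is safe; align == 8 means it starts at .x
    * or .z, so up to two components fit.
    */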
2201    unsigned worst_start_component = align == 4 ? 3 : align / 4;
2202    if (worst_start_component + num_components > 4)
2203       return false;
2204 
2205    return true;
2206 }
2207 
2208 static nir_variable_mode
2209 ntr_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
2210 {
2211    unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
2212    unsigned indirect_mask = 0;
2213 
2214    if (!screen->get_shader_param(screen, pipe_stage,
2215                                  PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) {
2216       indirect_mask |= nir_var_shader_in;
2217    }
2218 
2219    if (!screen->get_shader_param(screen, pipe_stage,
2220                                  PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) {
2221       indirect_mask |= nir_var_shader_out;
2222    }
2223 
2224    if (!screen->get_shader_param(screen, pipe_stage,
2225                                  PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
2226       indirect_mask |= nir_var_function_temp;
2227    }
2228 
2229    return indirect_mask;
2230 }
2231 
2232 struct ntr_lower_tex_state {
2233    nir_scalar channels[8];
2234    unsigned i;
2235 };
2236 
2237 static void
2238 nir_to_rc_lower_tex_instr_arg(nir_builder *b,
2239                                 nir_tex_instr *instr,
2240                                 nir_tex_src_type tex_src_type,
2241                                 struct ntr_lower_tex_state *s)
2242 {
2243    int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
2244    if (tex_src < 0)
2245       return;
2246 
2247    nir_def *def = instr->src[tex_src].src.ssa;
2248    for (int i = 0; i < def->num_components; i++) {
2249       s->channels[s->i++] = nir_get_scalar(def, i);
2250    }
2251 
2252    nir_tex_instr_remove_src(instr, tex_src);
2253 }
2254 
2255 /**
2256  * Merges together a vec4 of tex coordinate/compare/bias/lod into a backend tex
2257  * src.  This lets NIR handle the coalescing of the vec4 rather than trying to
2258  * manage it on our own, and may lead to more vectorization.
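 *
 * For example, a 2D shadow sample with a bias ends up with
 * (coord.x, coord.y, comparator, bias) packed into nir_tex_src_backend1.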
2259  */
2260 static bool
2261 nir_to_rc_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data)
2262 {
2263    if (instr->type != nir_instr_type_tex)
2264       return false;
2265 
2266    nir_tex_instr *tex = nir_instr_as_tex(instr);
2267 
2268    if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0)
2269       return false;
2270 
2271    b->cursor = nir_before_instr(instr);
2272 
2273    struct ntr_lower_tex_state s = {0};
2274 
2275    nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_coord, &s);
2276    /* We always have at least two slots for the coordinate, even on 1D. */
2277    s.i = MAX2(s.i, 2);
2278 
2279    nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_comparator, &s);
2280    s.i = MAX2(s.i, 3);
2281 
2282    nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_bias, &s);
2283 
2284    /* XXX: LZ */
2285    nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s);
2286    nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s);
2287    nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s);
2288 
2289    /* No need to pack undefs in unused channels of the tex instr */
2290    while (!s.channels[s.i - 1].def)
2291       s.i--;
2292 
2293    /* Instead of putting undefs in the unused slots of the vecs, just put in
2294     * another used channel.  Otherwise, we'll get unnecessary moves into
2295     * registers.
2296     */
2297    assert(s.channels[0].def != NULL);
2298    for (int i = 1; i < s.i; i++) {
2299       if (!s.channels[i].def)
2300          s.channels[i] = s.channels[0];
2301    }
2302 
2303    nir_tex_instr_add_src(tex, nir_tex_src_backend1,
2304                          nir_vec_scalars(b, s.channels, MIN2(s.i, 4)));
2305    if (s.i > 4)
2306       nir_tex_instr_add_src(tex, nir_tex_src_backend2,
2307                             nir_vec_scalars(b, &s.channels[4], s.i - 4));
2308 
2309    return true;
2310 }
2311 
2312 static bool
2313 nir_to_rc_lower_tex(nir_shader *s)
2314 {
2315    return nir_shader_instructions_pass(s,
2316                                        nir_to_rc_lower_tex_instr,
2317                                        nir_metadata_block_index |
2318                                        nir_metadata_dominance,
2319                                        NULL);
2320 }
2321 
2322 /* Lowers texture projectors if we can't do them as TGSI_OPCODE_TXP. */
2323 static void
2324 nir_to_rc_lower_txp(nir_shader *s)
2325 {
2326    nir_lower_tex_options lower_tex_options = {
2327        .lower_txp = 0,
2328    };
2329 
2330    nir_foreach_block(block, nir_shader_get_entrypoint(s)) {
2331       nir_foreach_instr(instr, block) {
2332          if (instr->type != nir_instr_type_tex)
2333             continue;
2334          nir_tex_instr *tex = nir_instr_as_tex(instr);
2335 
2336          if (nir_tex_instr_src_index(tex, nir_tex_src_projector) < 0)
2337             continue;
2338 
2339          bool has_compare = nir_tex_instr_src_index(tex, nir_tex_src_comparator) >= 0;
2340          bool has_lod = nir_tex_instr_src_index(tex, nir_tex_src_lod) >= 0 || s->info.stage != MESA_SHADER_FRAGMENT;
2341          bool has_offset = nir_tex_instr_src_index(tex, nir_tex_src_offset) >= 0;
2342 
2343          /* We can do TXP for any tex (not txg) where we can fit all the
2344           * coordinates and comparator and projector in one vec4 without any
2345           * other modifiers to add on.
2346           *
2347           * nir_lower_tex() only handles the lowering on a sampler-dim basis, so
2348           * if we get any funny projectors then we just blow them all away.
2349           */
2350          if (tex->op != nir_texop_tex || has_lod || has_offset || (tex->coord_components >= 3 && has_compare))
2351             lower_tex_options.lower_txp |= 1 << tex->sampler_dim;
2352       }
2353    }
2354 
2355    /* nir_lower_tex must be run even if no options are set, because we need the
2356     * LOD to be set for query_levels and for non-fragment shaders.
2357     */
2358    NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
2359 }
2360 
2361 const void *
2362 nir_to_rc(struct nir_shader *s,
2363             struct pipe_screen *screen)
2364 {
2365    static const struct nir_to_rc_options default_ntr_options = {0};
2366    return nir_to_rc_options(s, screen, &default_ntr_options);
2367 }
2368 
2369 /**
2370  * Translates the NIR shader to TGSI.
2371  *
2372  * This requires some lowering of the NIR shader to prepare it for translation.
2373  * We take ownership of the NIR shader passed, returning a reference to the new
2374  * TGSI tokens instead.  If you need to keep the NIR, then pass us a clone.
2375  */
2376 const void *nir_to_rc_options(struct nir_shader *s,
2377                                 struct pipe_screen *screen,
2378                                 const struct nir_to_rc_options *options)
2379 {
2380    struct ntr_compile *c;
2381    const void *tgsi_tokens;
2382    bool is_r500 = r300_screen(screen)->caps.is_r500;
2383    nir_variable_mode no_indirects_mask = ntr_no_indirects_mask(s, screen);
2384 
2385    /* Lower array indexing on FS inputs.  Since we don't set
2386     * ureg->supports_any_inout_decl_range, the TGSI input decls will be split to
2387     * elements by ureg, and so dynamically indexing them would be invalid.
2388     * Ideally we would set that ureg flag based on
2389     * PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE, but can't due to mesa/st
2390     * splitting NIR VS outputs to elements even if the FS doesn't get the
2391     * corresponding splitting, and virgl depends on TGSI across link boundaries
2392     * having matching declarations.
2393     */
2394    if (s->info.stage == MESA_SHADER_FRAGMENT) {
2395       NIR_PASS_V(s, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX);
2396       NIR_PASS_V(s, nir_remove_dead_variables, nir_var_shader_in, NULL);
2397    }
2398 
2399    NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
2400               type_size, (nir_lower_io_options)0);
2401 
2402    nir_to_rc_lower_txp(s);
2403    NIR_PASS_V(s, nir_to_rc_lower_tex);
2404 
2405    if (!s->options->lower_uniforms_to_ubo) {
2406       NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
2407                  screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
2408                  true);
2409    }
2410 
2411    if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
2412       NIR_PASS_V(s, nir_lower_ubo_vec4);
2413 
2414    bool progress;
2415    NIR_PASS_V(s, nir_opt_constant_folding);
2416 
2417    /* Clean up after trigonometric input normalization. */
2418    NIR_PASS_V(s, nir_opt_vectorize, ntr_should_vectorize_instr, NULL);
2419    do {
2420       progress = false;
2421       NIR_PASS(progress, s, nir_opt_shrink_vectors);
2422    } while (progress);
2423    NIR_PASS_V(s, nir_copy_prop);
2424    NIR_PASS_V(s, nir_opt_cse);
2425    NIR_PASS_V(s, nir_opt_dce);
2426    NIR_PASS_V(s, nir_opt_shrink_stores, true);
2427 
2428    NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);
2429 
2430    /* Lower demote_if to if (cond) { demote } because TGSI doesn't have a DEMOTE_IF. */
2431    NIR_PASS_V(s, nir_lower_discard_if, nir_lower_demote_if_to_cf);
2432 
2433    NIR_PASS_V(s, nir_lower_frexp);
2434 
2435    do {
2436       progress = false;
2437       NIR_PASS(progress, s, nir_opt_algebraic_late);
2438       if (progress) {
2439          NIR_PASS_V(s, nir_copy_prop);
2440          NIR_PASS_V(s, nir_opt_dce);
2441          NIR_PASS_V(s, nir_opt_cse);
2442       }
2443    } while (progress);
2444 
2445    if (s->info.stage == MESA_SHADER_FRAGMENT) {
2446       NIR_PASS_V(s, r300_nir_prepare_presubtract);
2447    }
2448 
2449    NIR_PASS_V(s, nir_lower_int_to_float);
2450    NIR_PASS_V(s, nir_copy_prop);
2451    NIR_PASS_V(s, r300_nir_post_integer_lowering);
2452    NIR_PASS_V(s, nir_lower_bool_to_float,
2453               !options->lower_cmp && !options->lower_fabs);
2454    /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
2455    NIR_PASS_V(s, nir_copy_prop);
2456    /* CSE cleanup after late ftrunc lowering. */
2457    NIR_PASS_V(s, nir_opt_cse);
2458    /* At this point we need to clean:
2459     *  a) fcsel_gt that come from the ftrunc lowering on R300,
2460     *  b) all flavours of fcsels that read three different temp sources on R500.
2461     */
2462    if (s->info.stage == MESA_SHADER_VERTEX) {
2463       if (is_r500)
2464          NIR_PASS_V(s, r300_nir_lower_fcsel_r500);
2465       else
2466          NIR_PASS_V(s, r300_nir_lower_fcsel_r300);
2467       NIR_PASS_V(s, r300_nir_lower_flrp);
2468    } else {
2469       NIR_PASS_V(s, r300_nir_lower_comparison_fs);
2470    }
2471    NIR_PASS_V(s, r300_nir_opt_algebraic_late);
2472    NIR_PASS_V(s, nir_opt_dce);
2473 
2474    nir_move_options move_all =
2475        nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
2476        nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
2477 
2478    NIR_PASS_V(s, nir_opt_move, move_all);
2479    NIR_PASS_V(s, nir_move_vec_src_uses_to_dest, true);
2480    /* Late vectorizing after nir_move_vec_src_uses_to_dest helps instructions but
2481     * increases register usage. Testing shows this is beneficial only in VS.
2482     */
2483    if (s->info.stage == MESA_SHADER_VERTEX)
2484       NIR_PASS_V(s, nir_opt_vectorize, ntr_should_vectorize_instr, NULL);
2485 
2486    NIR_PASS_V(s, nir_convert_from_ssa, true);
2487    NIR_PASS_V(s, nir_lower_vec_to_regs, NULL, NULL);
2488 
2489    /* locals_to_reg_intrinsics will leave dead derefs that are good to clean up.
2490     */
2491    NIR_PASS_V(s, nir_lower_locals_to_regs, 32);
2492    NIR_PASS_V(s, nir_opt_dce);
2493 
2494    /* See comment in ntr_get_alu_src for supported modifiers */
2495    NIR_PASS_V(s, nir_legacy_trivialize, !options->lower_fabs);
2496 
2497    if (NIR_DEBUG(TGSI)) {
2498       fprintf(stderr, "NIR before translation to TGSI:\n");
2499       nir_print_shader(s, stderr);
2500    }
2501 
2502    c = rzalloc(NULL, struct ntr_compile);
2503    c->screen = screen;
2504    c->options = options;
2505 
2506    c->s = s;
2507    c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
2508    ureg_setup_shader_info(c->ureg, &s->info);
2509    if (s->info.use_legacy_math_rules && screen->get_param(screen, PIPE_CAP_LEGACY_MATH_RULES))
2510       ureg_property(c->ureg, TGSI_PROPERTY_LEGACY_MATH_RULES, 1);
2511 
2512    if (s->info.stage == MESA_SHADER_FRAGMENT) {
2513       /* The draw module's polygon stipple layer doesn't respect the chosen
2514        * coordinate mode, so leave it as unspecified unless we're actually
2515        * reading the position in the shader already.  See
2516        * gl-2.1-polygon-stipple-fs on softpipe.
2517        */
2518       if ((s->info.inputs_read & VARYING_BIT_POS) ||
2519           BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
2520          ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
2521                        s->info.fs.origin_upper_left ?
2522                        TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
2523                        TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
2524 
2525          ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
2526                        s->info.fs.pixel_center_integer ?
2527                        TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
2528                        TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
2529       }
2530    }
2531    /* Emit the main function */
2532    nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
2533    ntr_emit_impl(c, impl);
2534    ureg_END(c->ureg);
2535 
2536    tgsi_tokens = ureg_get_tokens(c->ureg, NULL);
2537 
2538    if (NIR_DEBUG(TGSI)) {
2539       fprintf(stderr, "TGSI after translation from NIR:\n");
2540       tgsi_dump(tgsi_tokens, 0);
2541    }
2542 
2543    ureg_destroy(c->ureg);
2544 
2545    ralloc_free(c);
2546    ralloc_free(s);
2547 
2548    return tgsi_tokens;
2549 }
2550