1 /*
2  * Copyright © 2014-2015 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "compiler/nir/nir.h"
25 #include "compiler/nir/nir_deref.h"
26 #include "compiler/nir/nir_legacy.h"
27 #include "compiler/nir/nir_worklist.h"
28 #include "nir/nir_to_tgsi.h"
29 #include "pipe/p_screen.h"
30 #include "pipe/p_state.h"
31 #include "tgsi/tgsi_dump.h"
32 #include "tgsi/tgsi_from_mesa.h"
33 #include "tgsi/tgsi_info.h"
34 #include "tgsi/tgsi_parse.h"
35 #include "tgsi/tgsi_ureg.h"
36 #include "tgsi/tgsi_util.h"
37 #include "util/u_debug.h"
38 #include "util/u_math.h"
39 #include "util/u_memory.h"
40 #include "util/u_dynarray.h"
41 
42 struct ntt_insn {
43    enum tgsi_opcode opcode;
44    struct ureg_dst dst[2];
45    struct ureg_src src[4];
46    enum tgsi_texture_type tex_target;
47    enum tgsi_return_type tex_return_type;
48    struct tgsi_texture_offset tex_offset[4];
49 
50    unsigned mem_qualifier;
51    enum pipe_format mem_format;
52 
53    bool is_tex : 1;
54    bool is_mem : 1;
55    bool precise : 1;
56 };
57 
58 struct ntt_block {
59    /* Array of struct ntt_insn */
60    struct util_dynarray insns;
61    int start_ip;
62    int end_ip;
63 };
64 
65 struct ntt_reg_interval {
66    uint32_t start, end;
67 };
68 
69 struct ntt_compile {
70    nir_shader *s;
71    nir_function_impl *impl;
72    const struct nir_to_tgsi_options *options;
73    struct pipe_screen *screen;
74    struct ureg_program *ureg;
75 
76    bool needs_texcoord_semantic;
77    bool native_integers;
78    bool has_txf_lz;
79 
80    bool addr_declared[3];
81    struct ureg_dst addr_reg[3];
82 
83    /* if condition set up at the end of a block, for ntt_emit_if(). */
84    struct ureg_src if_cond;
85 
89    /* TGSI temps for our NIR SSA and register values. */
90    struct ureg_dst *reg_temp;
91    struct ureg_src *ssa_temp;
92 
93    struct ntt_reg_interval *liveness;
94 
95    /* Map from nir_block to ntt_block */
96    struct hash_table *blocks;
97    struct ntt_block *cur_block;
98    unsigned current_if_else;
99    unsigned cf_label;
100 
101    /* Whether we're currently emitting instructions for a precise NIR instruction. */
102    bool precise;
103 
104    unsigned num_temps;
105    unsigned first_non_array_temp;
106 
107    /* Mappings from driver_location to TGSI input/output number.
108     *
109     * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
110     * their numbers assigned incrementally, unlike inputs or constants.
111     */
112    struct ureg_src *input_index_map;
113    uint64_t centroid_inputs;
114 
115    uint32_t first_ubo;
116    uint32_t first_ssbo;
117 
118    struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
119 };
120 
121 static struct ureg_dst
122 ntt_temp(struct ntt_compile *c)
123 {
124    return ureg_dst_register(TGSI_FILE_TEMPORARY, c->num_temps++);
125 }
126 
127 static struct ntt_block *
128 ntt_block_from_nir(struct ntt_compile *c, struct nir_block *block)
129 {
130    struct hash_entry *entry = _mesa_hash_table_search(c->blocks, block);
131    return entry->data;
132 }
133 
134 static void ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list);
135 static void ntt_emit_cf_list_ureg(struct ntt_compile *c, struct exec_list *list);
136 
137 static struct ntt_insn *
138 ntt_insn(struct ntt_compile *c, enum tgsi_opcode opcode,
139          struct ureg_dst dst,
140          struct ureg_src src0, struct ureg_src src1,
141          struct ureg_src src2, struct ureg_src src3)
142 {
143    struct ntt_insn insn = {
144       .opcode = opcode,
145       .dst = { dst, ureg_dst_undef() },
146       .src = { src0, src1, src2, src3 },
147       .precise = c->precise,
148    };
149    util_dynarray_append(&c->cur_block->insns, struct ntt_insn, insn);
150    return util_dynarray_top_ptr(&c->cur_block->insns, struct ntt_insn);
151 }
152 
153 #define OP00( op )                                                                     \
154 static inline void ntt_##op(struct ntt_compile *c)                                     \
155 {                                                                                      \
156    ntt_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
157 }
158 
159 #define OP01( op )                                                                     \
160 static inline void ntt_##op(struct ntt_compile *c,                                     \
161                      struct ureg_src src0)                                             \
162 {                                                                                      \
163    ntt_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
164 }
165 
166 
167 #define OP10( op )                                                                     \
168 static inline void ntt_##op(struct ntt_compile *c,                                     \
169                      struct ureg_dst dst)                                              \
170 {                                                                                      \
171    ntt_insn(c, TGSI_OPCODE_##op, dst, ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
172 }
173 
174 #define OP11( op )                                                                     \
175 static inline void ntt_##op(struct ntt_compile *c,                                     \
176                      struct ureg_dst dst,                                              \
177                      struct ureg_src src0)                                             \
178 {                                                                                      \
179    ntt_insn(c, TGSI_OPCODE_##op, dst, src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
180 }
181 
182 #define OP12( op )                                                                     \
183 static inline void ntt_##op(struct ntt_compile *c,                                     \
184                      struct ureg_dst dst,                                              \
185                      struct ureg_src src0,                                             \
186                      struct ureg_src src1)                                             \
187 {                                                                                      \
188    ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, ureg_src_undef(), ureg_src_undef()); \
189 }
190 
191 #define OP13( op )                                                                     \
192 static inline void ntt_##op(struct ntt_compile *c,                                     \
193                      struct ureg_dst dst,                                              \
194                      struct ureg_src src0,                                             \
195                      struct ureg_src src1,                                             \
196                      struct ureg_src src2)                                             \
197 {                                                                                      \
198    ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, ureg_src_undef());             \
199 }
200 
201 #define OP14( op )                                                                     \
202 static inline void ntt_##op(struct ntt_compile *c,                                     \
203                      struct ureg_dst dst,                                              \
204                      struct ureg_src src0,                                             \
205                      struct ureg_src src1,                                             \
206                      struct ureg_src src2,                                             \
207                      struct ureg_src src3)                                             \
208 {                                                                                      \
209    ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, src3);                         \
210 }
211 
212 /* We hand-craft our tex instructions */
213 #define OP12_TEX(op)
214 #define OP14_TEX(op)
215 
216 /* Use a template include to generate a correctly-typed ntt_OP()
217  * function for each TGSI opcode:
218  */
219 #include "gallium/auxiliary/tgsi/tgsi_opcode_tmp.h"
220 
221 /**
222  * Interprets a nir_load_const used as a NIR src as a uint.
223  *
224  * For non-native-integers drivers, nir_load_const_instrs used by an integer ALU
225  * instruction (or in a phi-web used by an integer ALU instruction) were
226  * converted to floats and the ALU instruction swapped to the float equivalent.
227  * However, this means that integer load_consts used by intrinsics (which don't
228  * normally get that conversion) may have been reformatted to be floats.  Given
229  * that all of our intrinsic nir_src_as_uint() calls are expected to be small,
230  * we can just look and see if they look like floats and convert them back to
231  * ints.
232  */
233 static uint32_t
234 ntt_src_as_uint(struct ntt_compile *c, nir_src src)
235 {
236    uint32_t val = nir_src_as_uint(src);
237    if (!c->native_integers && val >= fui(1.0))
238       val = (uint32_t)uif(val);
239    return val;
240 }
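/* For example, on a !native_integers driver a constant 3 used by an intrinsic
 * may have been rewritten as 3.0f by the float-lowering passes; its bit
 * pattern 0x40400000 is >= fui(1.0), so it is converted back through uif()
 * and 3 is returned.  Genuine small integers like 0 or 1 stay below fui(1.0)
 * and pass through unchanged.
 */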
241 
242 static unsigned
243 ntt_64bit_write_mask(unsigned write_mask)
244 {
245    return ((write_mask & 1) ? 0x3 : 0) | ((write_mask & 2) ? 0xc : 0);
246 }
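/* For example, a NIR writemask of 0x1 (the first double) becomes the TGSI
 * writemask 0x3 (.xy) and 0x3 (both halves of a dvec2) becomes 0xf (.xyzw),
 * since each 64-bit component occupies a pair of 32-bit TGSI channels.
 */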
247 
248 static struct ureg_src
249 ntt_64bit_1f(struct ntt_compile *c)
250 {
251    return ureg_imm4u(c->ureg,
252                      0x00000000, 0x3ff00000,
253                      0x00000000, 0x3ff00000);
254 }
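/* The immediate above is the IEEE-754 double 1.0 (0x3ff0000000000000) stored
 * low word first in each of the two 64-bit channels of the vec4.
 */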
255 
256 /* Per-channel masks of def/use within the block, and the per-channel
257  * livein/liveout for the block as a whole.
258  */
259 struct ntt_live_reg_block_state {
260    uint8_t *def, *use, *livein, *liveout, *defin, *defout;
261 };
262 
263 struct ntt_live_reg_state {
264    unsigned bitset_words;
265 
266    struct ntt_reg_interval *regs;
267 
268    /* Used in propagate_across_edge() */
269    BITSET_WORD *tmp_live;
270 
271    struct ntt_live_reg_block_state *blocks;
272 
273    nir_block_worklist worklist;
274 };
275 
276 static void
277 ntt_live_reg_mark_use(struct ntt_compile *c, struct ntt_live_reg_block_state *bs,
278                       int ip, unsigned index, unsigned used_mask)
279 {
280    bs->use[index] |= used_mask & ~bs->def[index];
281 
282    c->liveness[index].start = MIN2(c->liveness[index].start, ip);
283    c->liveness[index].end = MAX2(c->liveness[index].end, ip);
284 
285 }
286 static void
287 ntt_live_reg_setup_def_use(struct ntt_compile *c, nir_function_impl *impl, struct ntt_live_reg_state *state)
288 {
289    for (int i = 0; i < impl->num_blocks; i++) {
290       state->blocks[i].def = rzalloc_array(state->blocks, uint8_t, c->num_temps);
291       state->blocks[i].defin = rzalloc_array(state->blocks, uint8_t, c->num_temps);
292       state->blocks[i].defout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
293       state->blocks[i].use = rzalloc_array(state->blocks, uint8_t, c->num_temps);
294       state->blocks[i].livein = rzalloc_array(state->blocks, uint8_t, c->num_temps);
295       state->blocks[i].liveout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
296    }
297 
298    int ip = 0;
299    nir_foreach_block(block, impl) {
300       struct ntt_live_reg_block_state *bs = &state->blocks[block->index];
301       struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
302 
303       ntt_block->start_ip = ip;
304 
305       util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
306          const struct tgsi_opcode_info *opcode_info =
307             tgsi_get_opcode_info(insn->opcode);
308 
309          /* Set up use[] for the srcs.
310           *
311           * Uses are the channels of the reg read in the block that don't have a
312           * preceding def to screen them off.  Note that we don't do per-element
313           * tracking of array regs, so they're never screened off.
314           */
315          for (int i = 0; i < opcode_info->num_src; i++) {
316             if (insn->src[i].File != TGSI_FILE_TEMPORARY)
317                continue;
318             int index = insn->src[i].Index;
319 
320             uint32_t used_mask = tgsi_util_get_src_usage_mask(insn->opcode, i,
321                                                               insn->dst->WriteMask,
322                                                               insn->src[i].SwizzleX,
323                                                               insn->src[i].SwizzleY,
324                                                               insn->src[i].SwizzleZ,
325                                                               insn->src[i].SwizzleW,
326                                                               insn->tex_target,
327                                                               insn->tex_target);
328 
329             assert(!insn->src[i].Indirect || index < c->first_non_array_temp);
330             ntt_live_reg_mark_use(c, bs, ip, index, used_mask);
331          }
332 
333          if (insn->is_tex) {
334             for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
335                if (insn->tex_offset[i].File == TGSI_FILE_TEMPORARY)
336                   ntt_live_reg_mark_use(c, bs, ip, insn->tex_offset[i].Index, 0xf);
337             }
338          }
339 
340          /* Set up def[] for the dsts.
341           *
342           * Defs are the unconditionally-written (not R/M/W) channels of the reg in
343           * the block that don't have a preceding use.
344           */
345          for (int i = 0; i < opcode_info->num_dst; i++) {
346             if (insn->dst[i].File != TGSI_FILE_TEMPORARY)
347                continue;
348             int index = insn->dst[i].Index;
349             uint32_t writemask = insn->dst[i].WriteMask;
350 
351             bs->def[index] |= writemask & ~bs->use[index];
352             bs->defout[index] |= writemask;
353 
354             assert(!insn->dst[i].Indirect || index < c->first_non_array_temp);
355             c->liveness[index].start = MIN2(c->liveness[index].start, ip);
356             c->liveness[index].end = MAX2(c->liveness[index].end, ip);
357          }
358          ip++;
359       }
360 
361       ntt_block->end_ip = ip;
362    }
363 }
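/* Illustrative run of the def/use screening above, for a block like:
 *
 *    MOV TEMP[0].xy, IN[0]
 *    ADD TEMP[1].x, TEMP[0].xxxx, TEMP[0].yyyy
 *    MOV TEMP[0].z, TEMP[1].xxxx
 *
 * TEMP[0] ends up with def = .xyz and use = 0 (the reads of .xy are screened
 * off by the earlier write), and TEMP[1] gets def = .x, use = 0, so neither
 * temp is live into this block.
 */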
364 
365 static void
366 ntt_live_regs(struct ntt_compile *c, nir_function_impl *impl)
367 {
368    nir_metadata_require(impl, nir_metadata_block_index);
369 
370    c->liveness = rzalloc_array(c, struct ntt_reg_interval, c->num_temps);
371 
372    struct ntt_live_reg_state state = {
373        .blocks = rzalloc_array(impl, struct ntt_live_reg_block_state, impl->num_blocks),
374    };
375 
376    /* The intervals start out with start > end (indicating unused) */
377    for (int i = 0; i < c->num_temps; i++)
378       c->liveness[i].start = ~0;
379 
380    ntt_live_reg_setup_def_use(c, impl, &state);
381 
382    /* Make a forward-order worklist of all the blocks. */
383    nir_block_worklist_init(&state.worklist, impl->num_blocks, NULL);
384    nir_foreach_block(block, impl) {
385       nir_block_worklist_push_tail(&state.worklist, block);
386    }
387 
388    /* Propagate defin/defout down the CFG to calculate the live variables
389     * potentially defined along any possible control flow path.  We'll use this
390     * to keep things like conditional defs of the reg (or array regs where we
391     * don't track defs!) from making the reg's live range extend back to the
392     * start of the program.
393     */
394    while (!nir_block_worklist_is_empty(&state.worklist)) {
395       nir_block *block = nir_block_worklist_pop_head(&state.worklist);
396       for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
397          nir_block *succ = block->successors[j];
398          if (!succ || succ->index == impl->num_blocks)
399             continue;
400 
401          for (int i = 0; i < c->num_temps; i++) {
402             uint8_t new_def = state.blocks[block->index].defout[i] & ~state.blocks[succ->index].defin[i];
403 
404             if (new_def) {
405                state.blocks[succ->index].defin[i] |= new_def;
406                state.blocks[succ->index].defout[i] |= new_def;
407                nir_block_worklist_push_tail(&state.worklist, succ);
408             }
409          }
410       }
411    }
412 
413    /* Make a reverse-order worklist of all the blocks. */
414    nir_foreach_block(block, impl) {
415       nir_block_worklist_push_head(&state.worklist, block);
416    }
417 
418    /* We're now ready to work through the worklist and update the liveness sets
419     * of each of the blocks.  As long as we keep the worklist up-to-date as we
420     * go, everything will get covered.
421     */
422    while (!nir_block_worklist_is_empty(&state.worklist)) {
423       /* We pop them off in the reverse order we pushed them on.  This way
424        * the first walk of the instructions is backwards so we only walk
425        * once in the case of no control flow.
426        */
427       nir_block *block = nir_block_worklist_pop_head(&state.worklist);
428       struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
429       struct ntt_live_reg_block_state *bs = &state.blocks[block->index];
430 
431       for (int i = 0; i < c->num_temps; i++) {
432          /* Collect livein from our successors to include in our liveout. */
433          for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
434             nir_block *succ = block->successors[j];
435             if (!succ || succ->index == impl->num_blocks)
436                continue;
437             struct ntt_live_reg_block_state *sbs = &state.blocks[succ->index];
438 
439             uint8_t new_liveout = sbs->livein[i] & ~bs->liveout[i];
440             if (new_liveout) {
441                if (state.blocks[block->index].defout[i])
442                   c->liveness[i].end = MAX2(c->liveness[i].end, ntt_block->end_ip);
443                bs->liveout[i] |= sbs->livein[i];
444             }
445          }
446 
447          /* Propagate use requests from either our block's uses or our
448           * non-screened-off liveout up to our predecessors.
449           */
450          uint8_t new_livein = ((bs->use[i] | (bs->liveout[i] & ~bs->def[i])) &
451                                ~bs->livein[i]);
452          if (new_livein) {
453             bs->livein[i] |= new_livein;
454             set_foreach(block->predecessors, entry) {
455                nir_block *pred = (void *)entry->key;
456                nir_block_worklist_push_tail(&state.worklist, pred);
457             }
458 
459             if (new_livein & state.blocks[block->index].defin[i])
460                c->liveness[i].start = MIN2(c->liveness[i].start, ntt_block->start_ip);
461          }
462       }
463    }
464 
465    ralloc_free(state.blocks);
466    nir_block_worklist_fini(&state.worklist);
467 }
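/* Illustrative consequence of the defin/defout pre-pass: a temp written only
 * inside an "if" and read afterwards has defin clear in the blocks above the
 * write, so the backwards livein propagation only pulls liveness[i].start up
 * to blocks where a def could have happened, not all the way back to ip 0.
 */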
468 
469 static void
470 ntt_ra_check(struct ntt_compile *c, unsigned *ra_map, BITSET_WORD *released, int ip, unsigned index)
471 {
472    if (index < c->first_non_array_temp)
473       return;
474 
475    if (c->liveness[index].start == ip && ra_map[index] == ~0)
476       ra_map[index] = ureg_DECL_temporary(c->ureg).Index;
477 
478    if (c->liveness[index].end == ip && !BITSET_TEST(released, index)) {
479       ureg_release_temporary(c->ureg, ureg_dst_register(TGSI_FILE_TEMPORARY, ra_map[index]));
480       BITSET_SET(released, index);
481    }
482 }
483 
484 static void
485 ntt_allocate_regs(struct ntt_compile *c, nir_function_impl *impl)
486 {
487    ntt_live_regs(c, impl);
488 
489    unsigned *ra_map = ralloc_array(c, unsigned, c->num_temps);
490    unsigned *released = rzalloc_array(c, BITSET_WORD, BITSET_WORDS(c->num_temps));
491 
492    /* No RA on NIR array regs */
493    for (int i = 0; i < c->first_non_array_temp; i++)
494       ra_map[i] = i;
495 
496    for (int i = c->first_non_array_temp; i < c->num_temps; i++)
497       ra_map[i] = ~0;
498 
499    int ip = 0;
500    nir_foreach_block(block, impl) {
501       struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
502 
503       for (int i = 0; i < c->num_temps; i++)
504          ntt_ra_check(c, ra_map, released, ip, i);
505 
506       util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
507          const struct tgsi_opcode_info *opcode_info =
508             tgsi_get_opcode_info(insn->opcode);
509 
510          for (int i = 0; i < opcode_info->num_src; i++) {
511             if (insn->src[i].File == TGSI_FILE_TEMPORARY) {
512                ntt_ra_check(c, ra_map, released, ip, insn->src[i].Index);
513                insn->src[i].Index = ra_map[insn->src[i].Index];
514             }
515          }
516 
517          if (insn->is_tex) {
518             for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
519                if (insn->tex_offset[i].File == TGSI_FILE_TEMPORARY) {
520                   ntt_ra_check(c, ra_map, released, ip, insn->tex_offset[i].Index);
521                   insn->tex_offset[i].Index = ra_map[insn->tex_offset[i].Index];
522                }
523             }
524          }
525 
526          for (int i = 0; i < opcode_info->num_dst; i++) {
527             if (insn->dst[i].File == TGSI_FILE_TEMPORARY) {
528                ntt_ra_check(c, ra_map, released, ip, insn->dst[i].Index);
529                insn->dst[i].Index = ra_map[insn->dst[i].Index];
530             }
531          }
532          ip++;
533       }
534 
535       for (int i = 0; i < c->num_temps; i++)
536          ntt_ra_check(c, ra_map, released, ip, i);
537    }
538 }
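/* A minimal sketch of the effect of the allocation above: if virtual temps t1
 * and t2 (both >= first_non_array_temp) have disjoint liveness intervals
 * [0,3] and [4,7], t1's ureg temporary is released at ip 3, so
 * ureg_DECL_temporary() can hand the same TGSI TEMP index back out for t2 at
 * ip 4, shrinking the shader's declared temporary count.
 */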
539 
540 static void
541 ntt_allocate_regs_unoptimized(struct ntt_compile *c, nir_function_impl *impl)
542 {
543    for (int i = c->first_non_array_temp; i < c->num_temps; i++)
544       ureg_DECL_temporary(c->ureg);
545 }
546 
547 
548 /**
549  * Try to find an iadd of a constant value with a non-constant value in the
550  * nir_src's first component, returning the constant offset and replacing *src
551  * with the non-constant component.
552  */
553 static const uint32_t
554 ntt_extract_const_src_offset(nir_src *src)
555 {
556    nir_scalar s = nir_get_scalar(src->ssa, 0);
557 
558    while (nir_scalar_is_alu(s)) {
559       nir_alu_instr *alu = nir_instr_as_alu(s.def->parent_instr);
560 
561       if (alu->op == nir_op_iadd) {
562          for (int i = 0; i < 2; i++) {
563             nir_const_value *v = nir_src_as_const_value(alu->src[i].src);
564             if (v != NULL) {
565                *src = alu->src[1 - i].src;
566                return v[alu->src[i].swizzle[s.comp]].u32;
567             }
568          }
569 
570          return 0;
571       }
572 
573       /* We'd like to reuse nir_scalar_chase_movs(), but it assumes SSA and that
574        * seems reasonable for something used in inner loops of the compiler.
575        */
576       if (alu->op == nir_op_mov) {
577          s.def = alu->src[0].src.ssa;
578          s.comp = alu->src[0].swizzle[s.comp];
579       } else if (nir_op_is_vec(alu->op)) {
580          s.def = alu->src[s.comp].src.ssa;
581          s.comp = alu->src[s.comp].swizzle[0];
582       } else {
583          return 0;
584       }
585    }
586 
587    return 0;
588 }
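/* For example, for NIR along the lines of
 *
 *    %addr = iadd %base, 16
 *    intrinsic ... (%addr)
 *
 * this returns 16 and rewrites *src to refer to %base, so the caller can fold
 * the constant into the instruction's own offset field.
 */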
589 
590 static const struct glsl_type *
591 ntt_shader_input_type(struct ntt_compile *c,
592                       struct nir_variable *var)
593 {
594    switch (c->s->info.stage) {
595    case MESA_SHADER_GEOMETRY:
596    case MESA_SHADER_TESS_EVAL:
597    case MESA_SHADER_TESS_CTRL:
598       if (glsl_type_is_array(var->type))
599          return glsl_get_array_element(var->type);
600       else
601          return var->type;
602    default:
603       return var->type;
604    }
605 }
606 
607 static void
608 ntt_get_gl_varying_semantic(struct ntt_compile *c, unsigned location,
609                             unsigned *semantic_name, unsigned *semantic_index)
610 {
611    /* We want to use most of tgsi_get_gl_varying_semantic(), but the
612     * !texcoord shifting has already been applied, so avoid that.
613     */
614    if (!c->needs_texcoord_semantic &&
615        (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_PATCH0)) {
616       *semantic_name = TGSI_SEMANTIC_GENERIC;
617       *semantic_index = location - VARYING_SLOT_VAR0;
618       return;
619    }
620 
621    tgsi_get_gl_varying_semantic(location, true,
622                                 semantic_name, semantic_index);
623 }
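/* For example, with needs_texcoord_semantic unset, VARYING_SLOT_VAR3 maps
 * straight to TGSI_SEMANTIC_GENERIC index 3 here; the shared helper is only
 * used for the remaining slots, since its !texcoord renumbering has already
 * been applied to the NIR locations (per the comment above).
 */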
624 
625 /* TGSI varying declarations have a component usage mask associated (used by
626  * r600 and svga).
627  */
628 static uint32_t
629 ntt_tgsi_usage_mask(unsigned start_component, unsigned num_components,
630                     bool is_64)
631 {
632    uint32_t usage_mask =
633       u_bit_consecutive(start_component, num_components);
634 
635    if (is_64) {
636       if (start_component >= 2)
637          usage_mask >>= 2;
638 
639       uint32_t tgsi_usage_mask = 0;
640 
641       if (usage_mask & TGSI_WRITEMASK_X)
642          tgsi_usage_mask |= TGSI_WRITEMASK_XY;
643       if (usage_mask & TGSI_WRITEMASK_Y)
644          tgsi_usage_mask |= TGSI_WRITEMASK_ZW;
645 
646       return tgsi_usage_mask;
647    } else {
648       return usage_mask;
649    }
650 }
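/* For example, a dvec2 starting at component 0 yields usage_mask 0x3, which
 * widens to TGSI_WRITEMASK_XYZW since each double covers two 32-bit channels,
 * while a plain vec2 at component 1 simply returns 0x6 (.yz).
 */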
651 
652 /* TGSI varying declarations have a component usage mask associated (used by
653  * r600 and svga).
654  */
655 static uint32_t
656 ntt_tgsi_var_usage_mask(const struct nir_variable *var)
657 {
658    const struct glsl_type *type_without_array =
659       glsl_without_array(var->type);
660    unsigned num_components = glsl_get_vector_elements(type_without_array);
661    if (num_components == 0) /* structs */
662       num_components = 4;
663 
664    return ntt_tgsi_usage_mask(var->data.location_frac, num_components,
665                               glsl_type_is_64bit(type_without_array));
666 }
667 
668 static struct ureg_dst
669 ntt_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
670 {
671    nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
672    int base = nir_intrinsic_base(instr);
673    *frac = nir_intrinsic_component(instr);
674    bool is_64 = nir_src_bit_size(instr->src[0]) == 64;
675 
676    struct ureg_dst out;
677    if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
678       unsigned semantic_name, semantic_index;
679       tgsi_get_gl_frag_result_semantic(semantics.location,
680                                        &semantic_name, &semantic_index);
681       semantic_index += semantics.dual_source_blend_index;
682 
683       switch (semantics.location) {
684       case FRAG_RESULT_DEPTH:
685          *frac = 2; /* z write is to the .z channel in TGSI */
686          break;
687       case FRAG_RESULT_STENCIL:
688          *frac = 1;
689          break;
690       default:
691          break;
692       }
693 
694       out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
695    } else {
696       unsigned semantic_name, semantic_index;
697 
698       ntt_get_gl_varying_semantic(c, semantics.location,
699                                   &semantic_name, &semantic_index);
700 
701       uint32_t usage_mask = ntt_tgsi_usage_mask(*frac,
702                                                 instr->num_components,
703                                                 is_64);
704       uint32_t gs_streams = semantics.gs_streams;
705       for (int i = 0; i < 4; i++) {
706          if (!(usage_mask & (1 << i)))
707             gs_streams &= ~(0x3 << 2 * i);
708       }
709 
710       /* No driver appears to use array_id of outputs. */
711       unsigned array_id = 0;
712 
713       /* This bit is lost in the i/o semantics, but it's unused in in-tree
714        * drivers.
715        */
716       bool invariant = semantics.invariant;
717 
718       unsigned num_slots = semantics.num_slots;
719       if (semantics.location == VARYING_SLOT_TESS_LEVEL_INNER ||
720           semantics.location == VARYING_SLOT_TESS_LEVEL_OUTER) {
721          /* Compact vars get a num_slots in NIR as number of components, but we
722           * want the number of vec4 slots here.
723           */
724          num_slots = 1;
725       }
726 
727       out = ureg_DECL_output_layout(c->ureg,
728                                     semantic_name, semantic_index,
729                                     gs_streams,
730                                     base,
731                                     usage_mask,
732                                     array_id,
733                                     num_slots,
734                                     invariant);
735    }
736 
737    unsigned write_mask;
738    if (nir_intrinsic_has_write_mask(instr))
739       write_mask = nir_intrinsic_write_mask(instr);
740    else
741       write_mask = ((1 << instr->num_components) - 1) << *frac;
742 
743    if (is_64) {
744       write_mask = ntt_64bit_write_mask(write_mask);
745       if (*frac >= 2)
746          write_mask = write_mask << 2;
747    } else {
748       write_mask = write_mask << *frac;
749    }
750    return ureg_writemask(out, write_mask);
751 }
752 
753 static bool
754 ntt_try_store_in_tgsi_output_with_use(struct ntt_compile *c,
755                                       struct ureg_dst *dst,
756                                       nir_src *src)
757 {
758    *dst = ureg_dst_undef();
759 
760    switch (c->s->info.stage) {
761    case MESA_SHADER_FRAGMENT:
762    case MESA_SHADER_VERTEX:
763       break;
764    default:
765       /* tgsi_exec (at least) requires that output stores happen per vertex
766        * emitted, you don't get to reuse a previous output value for the next
767        * vertex.
768        */
769       return false;
770    }
771 
772    if (nir_src_is_if(src))
773       return false;
774 
775    if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
776       return false;
777 
778    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(nir_src_parent_instr(src));
779    if (intr->intrinsic != nir_intrinsic_store_output ||
780        !nir_src_is_const(intr->src[1])) {
781       return false;
782    }
783 
784    uint32_t frac;
785    *dst = ntt_output_decl(c, intr, &frac);
786    dst->Index += ntt_src_as_uint(c, intr->src[1]);
787 
788    return frac == 0;
789 }
790 
791 /* If this reg is used only for storing an output, then in the simple
792  * cases we can write directly to the TGSI output instead of having
793  * store_output emit its own MOV.
794  */
795 static bool
796 ntt_try_store_reg_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
797                                  nir_intrinsic_instr *reg_decl)
798 {
799    assert(reg_decl->intrinsic == nir_intrinsic_decl_reg);
800 
801    *dst = ureg_dst_undef();
802 
803    /* Look for a single use for try_store_in_tgsi_output */
804    nir_src *use = NULL;
805    nir_foreach_reg_load(src, reg_decl) {
806       nir_intrinsic_instr *load = nir_instr_as_intrinsic(nir_src_parent_instr(src));
807       nir_foreach_use_including_if(load_use, &load->def) {
808          /* We can only have one use */
809          if (use != NULL)
810             return false;
811 
812          use = load_use;
813       }
814    }
815 
816    if (use == NULL)
817       return false;
818 
819    return ntt_try_store_in_tgsi_output_with_use(c, dst, use);
820 }
821 
822 /* If this SSA def is used only for storing an output, then in the simple
823  * cases we can write directly to the TGSI output instead of having
824  * store_output emit its own MOV.
825  */
826 static bool
827 ntt_try_store_ssa_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
828                                  nir_def *def)
829 {
830    *dst = ureg_dst_undef();
831 
832    if (!list_is_singular(&def->uses))
833       return false;
834 
835    nir_foreach_use_including_if(use, def) {
836       return ntt_try_store_in_tgsi_output_with_use(c, dst, use);
837    }
838    unreachable("We have one use");
839 }
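/* Illustrative before/after for the two helpers above: instead of emitting
 *
 *    ADD TEMP[2], ...
 *    MOV OUT[1], TEMP[2]
 *
 * the defining instruction writes OUT[1] directly when the value's only use
 * is a constant-offset store_output in a VS or FS.
 */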
840 
841 static void
842 ntt_setup_inputs(struct ntt_compile *c)
843 {
844    if (c->s->info.stage != MESA_SHADER_FRAGMENT)
845       return;
846 
847    unsigned num_inputs = 0;
848    int num_input_arrays = 0;
849 
850    nir_foreach_shader_in_variable(var, c->s) {
851       const struct glsl_type *type = ntt_shader_input_type(c, var);
852       unsigned array_len =
853          glsl_count_attribute_slots(type, false);
854 
855       num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
856    }
857 
858    c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);
859 
860    nir_foreach_shader_in_variable(var, c->s) {
861       const struct glsl_type *type = ntt_shader_input_type(c, var);
862       unsigned array_len =
863          glsl_count_attribute_slots(type, false);
864 
865       unsigned interpolation = TGSI_INTERPOLATE_CONSTANT;
866       unsigned sample_loc;
867       struct ureg_src decl;
868 
869       if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
870          interpolation =
871             tgsi_get_interp_mode(var->data.interpolation,
872                                  var->data.location == VARYING_SLOT_COL0 ||
873                                  var->data.location == VARYING_SLOT_COL1);
874 
875          if (var->data.location == VARYING_SLOT_POS)
876             interpolation = TGSI_INTERPOLATE_LINEAR;
877       }
878 
879       unsigned semantic_name, semantic_index;
880       ntt_get_gl_varying_semantic(c, var->data.location,
881                                   &semantic_name, &semantic_index);
882 
883       if (var->data.sample) {
884          sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
885       } else if (var->data.centroid) {
886          sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
887          c->centroid_inputs |= (BITSET_MASK(array_len) <<
888                                 var->data.driver_location);
889       } else {
890          sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
891       }
892 
893       unsigned array_id = 0;
894       if (glsl_type_is_array(type))
895          array_id = ++num_input_arrays;
896 
897       uint32_t usage_mask = ntt_tgsi_var_usage_mask(var);
898 
899       decl = ureg_DECL_fs_input_centroid_layout(c->ureg,
900                                                 semantic_name,
901                                                 semantic_index,
902                                                 interpolation,
903                                                 sample_loc,
904                                                 var->data.driver_location,
905                                                 usage_mask,
906                                                 array_id, array_len);
907 
908       if (semantic_name == TGSI_SEMANTIC_FACE) {
909          struct ureg_dst temp = ntt_temp(c);
910          if (c->native_integers) {
911             /* NIR is ~0 front and 0 back, while TGSI is +1 front */
912             ntt_SGE(c, temp, decl, ureg_imm1f(c->ureg, 0));
913          } else {
914             /* tgsi docs say that floating point FACE will be positive for
915              * frontface and negative for backface, but realistically
916              * GLSL-to-TGSI had been doing MOV_SAT to turn it into 0.0 vs 1.0.
917              * Copy that behavior, since some drivers (r300) have been doing a
918              * 0.0 vs 1.0 backface (and I don't think anybody has a non-1.0
919              * front face).
920              */
921             temp.Saturate = true;
922             ntt_MOV(c, temp, decl);
923 
924          }
925          decl = ureg_src(temp);
926       }
927 
928       for (unsigned i = 0; i < array_len; i++) {
929          c->input_index_map[var->data.driver_location + i] = decl;
930          c->input_index_map[var->data.driver_location + i].Index += i;
931       }
932    }
933 }
934 
935 static int
936 ntt_sort_by_location(const nir_variable *a, const nir_variable *b)
937 {
938    return a->data.location - b->data.location;
939 }
940 
941 /**
942  * Workaround for virglrenderer requiring that TGSI FS output color variables
943  * are declared in order.  Besides, it's a lot nicer to read the TGSI this way.
944  */
945 static void
946 ntt_setup_outputs(struct ntt_compile *c)
947 {
948    if (c->s->info.stage != MESA_SHADER_FRAGMENT)
949       return;
950 
951    nir_sort_variables_with_modes(c->s, ntt_sort_by_location, nir_var_shader_out);
952 
953    nir_foreach_shader_out_variable(var, c->s) {
954       if (var->data.location == FRAG_RESULT_COLOR)
955          ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);
956 
957       unsigned semantic_name, semantic_index;
958       tgsi_get_gl_frag_result_semantic(var->data.location,
959                                        &semantic_name, &semantic_index);
960 
961       (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index);
962    }
963 }
964 
965 static enum tgsi_texture_type
966 tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array, bool is_shadow)
967 {
968    switch (dim) {
969    case GLSL_SAMPLER_DIM_1D:
970       if (is_shadow)
971          return is_array ? TGSI_TEXTURE_SHADOW1D_ARRAY : TGSI_TEXTURE_SHADOW1D;
972       else
973          return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
974    case GLSL_SAMPLER_DIM_2D:
975    case GLSL_SAMPLER_DIM_EXTERNAL:
976       if (is_shadow)
977          return is_array ? TGSI_TEXTURE_SHADOW2D_ARRAY : TGSI_TEXTURE_SHADOW2D;
978       else
979          return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
980    case GLSL_SAMPLER_DIM_3D:
981       return TGSI_TEXTURE_3D;
982    case GLSL_SAMPLER_DIM_CUBE:
983       if (is_shadow)
984          return is_array ? TGSI_TEXTURE_SHADOWCUBE_ARRAY : TGSI_TEXTURE_SHADOWCUBE;
985       else
986          return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
987    case GLSL_SAMPLER_DIM_RECT:
988       if (is_shadow)
989          return TGSI_TEXTURE_SHADOWRECT;
990       else
991          return TGSI_TEXTURE_RECT;
992    case GLSL_SAMPLER_DIM_MS:
993       return is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
994    case GLSL_SAMPLER_DIM_BUF:
995       return TGSI_TEXTURE_BUFFER;
996    default:
997       unreachable("unknown sampler dim");
998    }
999 }
1000 
1001 static enum tgsi_return_type
1002 tgsi_return_type_from_base_type(enum glsl_base_type type)
1003 {
1004    switch (type) {
1005    case GLSL_TYPE_INT:
1006       return TGSI_RETURN_TYPE_SINT;
1007    case GLSL_TYPE_UINT:
1008       return TGSI_RETURN_TYPE_UINT;
1009    case GLSL_TYPE_FLOAT:
1010      return TGSI_RETURN_TYPE_FLOAT;
1011    default:
1012       unreachable("unexpected texture type");
1013    }
1014 }
1015 
1016 static void
1017 ntt_setup_uniforms(struct ntt_compile *c)
1018 {
1019    nir_foreach_uniform_variable(var, c->s) {
1020       if (glsl_type_is_sampler(glsl_without_array(var->type)) ||
1021           glsl_type_is_texture(glsl_without_array(var->type))) {
1022          /* Don't use this size for the check for samplers -- arrays of structs
1023           * containing samplers should be ignored, and just the separate lowered
1024           * sampler uniform decl used.
1025           */
1026          int size = glsl_type_get_sampler_count(var->type) +
1027                     glsl_type_get_texture_count(var->type);
1028 
1029          const struct glsl_type *stype = glsl_without_array(var->type);
1030          enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(stype),
1031                                                                             glsl_sampler_type_is_array(stype),
1032                                                                             glsl_sampler_type_is_shadow(stype));
1033          enum tgsi_return_type ret_type = tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype));
1034          for (int i = 0; i < size; i++) {
1035             ureg_DECL_sampler_view(c->ureg, var->data.binding + i,
1036                target, ret_type, ret_type, ret_type, ret_type);
1037             ureg_DECL_sampler(c->ureg, var->data.binding + i);
1038          }
1039       } else if (glsl_contains_atomic(var->type)) {
1040          uint32_t offset = var->data.offset / 4;
1041          uint32_t size = glsl_atomic_size(var->type) / 4;
1042          ureg_DECL_hw_atomic(c->ureg, offset, offset + size - 1, var->data.binding, 0);
1043       }
1044 
1045       /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0
1046        * size declaration happens with other UBOs below.
1047        */
1048    }
1049 
1050    nir_foreach_image_variable(var, c->s) {
1051       int image_count = glsl_type_get_image_count(var->type);
1052       const struct glsl_type *itype = glsl_without_array(var->type);
1053       enum tgsi_texture_type tex_type =
1054             tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(itype),
1055                                                glsl_sampler_type_is_array(itype), false);
1056 
1057       for (int i = 0; i < image_count; i++) {
1058          c->images[var->data.binding + i] = ureg_DECL_image(c->ureg,
1059                                                         var->data.binding + i,
1060                                                         tex_type,
1061                                                         var->data.image.format,
1062                                                         !(var->data.access & ACCESS_NON_WRITEABLE),
1063                                                         false);
1064       }
1065    }
1066 
1067    c->first_ubo = ~0;
1068 
1069    unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0};
1070    nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) {
1071       int ubo = var->data.driver_location;
1072       if (ubo == -1)
1073          continue;
1074 
1075       if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo))
1076          c->first_ubo = MIN2(c->first_ubo, ubo);
1077 
1078       unsigned size = glsl_get_explicit_size(var->interface_type, false);
1079 
1080       int array_size = 1;
1081       if (glsl_type_is_interface(glsl_without_array(var->type)))
1082          array_size = MAX2(1, glsl_get_aoa_size(var->type));
1083 
1084       for (int i = 0; i < array_size; i++) {
1085          /* Even if multiple NIR variables are in the same uniform block, their
1086           * explicit size is the size of the block.
1087           */
1088          if (ubo_sizes[ubo + i])
1089             assert(ubo_sizes[ubo + i] == size);
1090 
1091          ubo_sizes[ubo + i] = size;
1092       }
1093    }
1094 
1095    for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) {
1096       if (ubo_sizes[i])
1097          ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i);
1098    }
1099 
1100    if (c->options->lower_ssbo_bindings) {
1101       c->first_ssbo = 255;
1102       nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ssbo) {
1103          if (c->first_ssbo > var->data.binding)
1104             c->first_ssbo = var->data.binding;
1105       }
1106    } else
1107       c->first_ssbo = 0;
1108 
1109    /* XXX: nv50 uses the atomic flag to set caching for (lowered) atomic
1110     * counters
1111     */
1112    bool atomic = false;
1113    for (int i = 0; i < c->s->info.num_ssbos; ++i)
1114       ureg_DECL_buffer(c->ureg, c->first_ssbo + i, atomic);
1115 
1116 }
1117 
1118 static void
1119 ntt_setup_registers(struct ntt_compile *c)
1120 {
1121    assert(c->num_temps == 0);
1122 
1123    nir_foreach_reg_decl_safe(nir_reg, nir_shader_get_entrypoint(c->s)) {
1124       /* Permanently allocate all the array regs at the start. */
1125       unsigned num_array_elems = nir_intrinsic_num_array_elems(nir_reg);
1126       unsigned index = nir_reg->def.index;
1127 
1128       if (num_array_elems != 0) {
1129          struct ureg_dst decl = ureg_DECL_array_temporary(c->ureg, num_array_elems, true);
1130          c->reg_temp[index] = decl;
1131          assert(c->num_temps == decl.Index);
1132          c->num_temps += num_array_elems;
1133       }
1134    }
1135    c->first_non_array_temp = c->num_temps;
1136 
1137    /* After that, allocate non-array regs in our virtual space that we'll
1138     * register-allocate before ureg emit.
1139     */
1140    nir_foreach_reg_decl_safe(nir_reg, nir_shader_get_entrypoint(c->s)) {
1141       unsigned num_array_elems = nir_intrinsic_num_array_elems(nir_reg);
1142       unsigned num_components = nir_intrinsic_num_components(nir_reg);
1143       unsigned bit_size = nir_intrinsic_bit_size(nir_reg);
1144       unsigned index = nir_reg->def.index;
1145 
1146       /* We already handled arrays */
1147       if (num_array_elems == 0) {
1148          struct ureg_dst decl;
1149          uint32_t write_mask = BITFIELD_MASK(num_components);
1150 
1151          if (!ntt_try_store_reg_in_tgsi_output(c, &decl, nir_reg)) {
1152             if (bit_size == 64) {
1153                if (num_components > 2) {
1154                   fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n",
1155                           num_components, index);
1156                }
1157 
1158                write_mask = ntt_64bit_write_mask(write_mask);
1159             }
1160 
1161             decl = ureg_writemask(ntt_temp(c), write_mask);
1162          }
1163          c->reg_temp[index] = decl;
1164       }
1165    }
1166 }
1167 
1168 static struct ureg_src
1169 ntt_get_load_const_src(struct ntt_compile *c, nir_load_const_instr *instr)
1170 {
1171    int num_components = instr->def.num_components;
1172 
1173    if (!c->native_integers) {
1174       float values[4];
1175       assert(instr->def.bit_size == 32);
1176       for (int i = 0; i < num_components; i++)
1177          values[i] = uif(instr->value[i].u32);
1178 
1179       return ureg_DECL_immediate(c->ureg, values, num_components);
1180    } else {
1181       uint32_t values[4];
1182 
1183       if (instr->def.bit_size == 32) {
1184          for (int i = 0; i < num_components; i++)
1185             values[i] = instr->value[i].u32;
1186       } else {
1187          assert(num_components <= 2);
1188          for (int i = 0; i < num_components; i++) {
1189             values[i * 2 + 0] = instr->value[i].u64 & 0xffffffff;
1190             values[i * 2 + 1] = instr->value[i].u64 >> 32;
1191          }
1192          num_components *= 2;
1193       }
1194 
1195       return ureg_DECL_immediate_uint(c->ureg, values, num_components);
1196    }
1197 }
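/* For example, a 64-bit load_const of the double 2.0 becomes the uint
 * immediate pair { 0x00000000, 0x40000000 } (low word first), and a dvec2
 * constant fills all four 32-bit immediate channels.
 */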
1198 
1199 static struct ureg_src
1200 ntt_reladdr(struct ntt_compile *c, struct ureg_src addr, int addr_index)
1201 {
1202    assert(addr_index < ARRAY_SIZE(c->addr_reg));
1203 
1204    for (int i = 0; i <= addr_index; i++) {
1205       if (!c->addr_declared[i]) {
1206          c->addr_reg[i] = ureg_writemask(ureg_DECL_address(c->ureg),
1207                                              TGSI_WRITEMASK_X);
1208          c->addr_declared[i] = true;
1209       }
1210    }
1211 
1212    if (c->native_integers)
1213       ntt_UARL(c, c->addr_reg[addr_index], addr);
1214    else
1215       ntt_ARL(c, c->addr_reg[addr_index], addr);
1216    return ureg_scalar(ureg_src(c->addr_reg[addr_index]), 0);
1217 }
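/* Both paths load the index into ADDR[n].x: UARL copies an integer source
 * directly, while ARL takes the floor of a float source on !native_integers
 * drivers.  The returned scalar is what callers pass as the Indirect operand
 * of ureg_src_indirect()/ureg_dst_indirect().
 */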
1218 
1219 /* Forward declare for recursion with indirects */
1220 static struct ureg_src
1221 ntt_get_src(struct ntt_compile *c, nir_src src);
1222 
1223 static struct ureg_src
1224 ntt_get_chased_src(struct ntt_compile *c, nir_legacy_src *src)
1225 {
1226    if (src->is_ssa) {
1227       if (src->ssa->parent_instr->type == nir_instr_type_load_const)
1228          return ntt_get_load_const_src(c, nir_instr_as_load_const(src->ssa->parent_instr));
1229 
1230       return c->ssa_temp[src->ssa->index];
1231    } else {
1232       struct ureg_dst reg_temp = c->reg_temp[src->reg.handle->index];
1233       reg_temp.Index += src->reg.base_offset;
1234 
1235       if (src->reg.indirect) {
1236          struct ureg_src offset = ntt_get_src(c, nir_src_for_ssa(src->reg.indirect));
1237          return ureg_src_indirect(ureg_src(reg_temp),
1238                                   ntt_reladdr(c, offset, 0));
1239       } else {
1240          return ureg_src(reg_temp);
1241       }
1242    }
1243 }
1244 
1245 static struct ureg_src
1246 ntt_get_src(struct ntt_compile *c, nir_src src)
1247 {
1248    nir_legacy_src chased = nir_legacy_chase_src(&src);
1249    return ntt_get_chased_src(c, &chased);
1250 }
1251 
1252 static struct ureg_src
1253 ntt_get_alu_src(struct ntt_compile *c, nir_alu_instr *instr, int i)
1254 {
1255    /* We only support 32-bit float modifiers.  The only other modifier type
1256     * officially supported by TGSI is 32-bit integer negates, but even those are
1257     * broken on virglrenderer, so skip lowering all integer and f64 float mods.
1258     *
1259     * The options->lower_fabs requests that we not have native source modifiers
1260     * for fabs, and instead emit MAX(a,-a) for nir_op_fabs.
1261     */
1262    nir_legacy_alu_src src =
1263       nir_legacy_chase_alu_src(&instr->src[i], !c->options->lower_fabs);
1264    struct ureg_src usrc = ntt_get_chased_src(c, &src.src);
1265 
1266    /* Expand double/dvec2 src references to TGSI swizzles using a pair of 32-bit
1267     * channels.  We skip this for undefs, as those don't get split to vec2s (but
1268     * the specific swizzles from an undef don't matter)
1269     */
1270    if (nir_src_bit_size(instr->src[i].src) == 64 &&
1271       !(src.src.is_ssa && src.src.ssa->parent_instr->type == nir_instr_type_undef)) {
1272       int chan1 = 1;
1273       if (nir_op_infos[instr->op].input_sizes[i] == 0) {
1274          chan1 = instr->def.num_components > 1 ? 1 : 0;
1275       }
1276       usrc = ureg_swizzle(usrc,
1277                           src.swizzle[0] * 2,
1278                           src.swizzle[0] * 2 + 1,
1279                           src.swizzle[chan1] * 2,
1280                           src.swizzle[chan1] * 2 + 1);
1281    } else {
1282       usrc = ureg_swizzle(usrc,
1283                           src.swizzle[0],
1284                           src.swizzle[1],
1285                           src.swizzle[2],
1286                           src.swizzle[3]);
1287    }
1288 
1289    if (src.fabs)
1290       usrc = ureg_abs(usrc);
1291    if (src.fneg)
1292       usrc = ureg_negate(usrc);
1293 
1294    return usrc;
1295 }
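/* For example, a 64-bit source whose NIR swizzle selects .yx expands to the
 * 32-bit pair swizzle .zwxy (2*1, 2*1+1, 2*0, 2*0+1), since each double lives
 * in two consecutive 32-bit TGSI channels.
 */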
1296 
1297 /* Reswizzles a source so that the unset channels in the write mask still refer
1298  * to one of the channels present in the write mask.
1299  */
1300 static struct ureg_src
1301 ntt_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
1302 {
1303    assert(write_mask);
1304    int first_chan = ffs(write_mask) - 1;
1305    return ureg_swizzle(src,
1306                        (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : first_chan,
1307                        (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : first_chan,
1308                        (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : first_chan,
1309                        (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : first_chan);
1310 }
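/* For example, with write_mask = 0x6 (.yz) the returned swizzle is .yyzy:
 * the unwritten X and W channels are redirected to Y, the first written
 * channel, so later reads never reference a channel the producing
 * instruction left unwritten.
 */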
1311 
1312 static struct ureg_dst
1313 ntt_get_ssa_def_decl(struct ntt_compile *c, nir_def *ssa)
1314 {
1315    uint32_t writemask = BITSET_MASK(ssa->num_components);
1316    if (ssa->bit_size == 64)
1317       writemask = ntt_64bit_write_mask(writemask);
1318 
1319    struct ureg_dst dst;
1320    if (!ntt_try_store_ssa_in_tgsi_output(c, &dst, ssa))
1321       dst = ntt_temp(c);
1322 
1323    c->ssa_temp[ssa->index] = ntt_swizzle_for_write_mask(ureg_src(dst), writemask);
1324 
1325    return ureg_writemask(dst, writemask);
1326 }
1327 
1328 static struct ureg_dst
1329 ntt_get_chased_dest_decl(struct ntt_compile *c, nir_legacy_dest *dest)
1330 {
1331    if (dest->is_ssa)
1332       return ntt_get_ssa_def_decl(c, dest->ssa);
1333    else
1334       return c->reg_temp[dest->reg.handle->index];
1335 }
1336 
1337 static struct ureg_dst
1338 ntt_get_chased_dest(struct ntt_compile *c, nir_legacy_dest *dest)
1339 {
1340    struct ureg_dst dst = ntt_get_chased_dest_decl(c, dest);
1341 
1342    if (!dest->is_ssa) {
1343       dst.Index += dest->reg.base_offset;
1344 
1345       if (dest->reg.indirect) {
1346          struct ureg_src offset = ntt_get_src(c, nir_src_for_ssa(dest->reg.indirect));
1347          dst = ureg_dst_indirect(dst, ntt_reladdr(c, offset, 0));
1348       }
1349    }
1350 
1351    return dst;
1352 }
1353 
1354 static struct ureg_dst
1355 ntt_get_dest(struct ntt_compile *c, nir_def *def)
1356 {
1357    nir_legacy_dest chased = nir_legacy_chase_dest(def);
1358    return ntt_get_chased_dest(c, &chased);
1359 }
1360 
1361 static struct ureg_dst
1362 ntt_get_alu_dest(struct ntt_compile *c, nir_def *def)
1363 {
1364    nir_legacy_alu_dest chased = nir_legacy_chase_alu_dest(def);
1365    struct ureg_dst dst = ntt_get_chased_dest(c, &chased.dest);
1366 
1367    if (chased.fsat)
1368       dst.Saturate = true;
1369 
1370    /* Only registers get write masks */
1371    if (chased.dest.is_ssa)
1372       return dst;
1373 
1374    int dst_64 = def->bit_size == 64;
1375    unsigned write_mask = chased.write_mask;
1376 
1377    if (dst_64)
1378       return ureg_writemask(dst, ntt_64bit_write_mask(write_mask));
1379    else
1380       return ureg_writemask(dst, write_mask);
1381 }
1382 
1383 /* For an SSA dest being populated by a constant src, replace the storage with
1384  * a copy of the ureg_src.
1385  */
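/* (Illustrative example: a non-indirect NIR load_const or uniform read never
 * gets copied into a TEMP here; later users of the def pick up the
 * IMMEDIATE/CONSTANT/INPUT/SYSTEM_VALUE source directly via c->ssa_temp[].)
 */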
1386 static void
1387 ntt_store_def(struct ntt_compile *c, nir_def *def, struct ureg_src src)
1388 {
1389    if (!src.Indirect && !src.DimIndirect) {
1390       switch (src.File) {
1391       case TGSI_FILE_IMMEDIATE:
1392       case TGSI_FILE_INPUT:
1393       case TGSI_FILE_CONSTANT:
1394       case TGSI_FILE_SYSTEM_VALUE:
1395          c->ssa_temp[def->index] = src;
1396          return;
1397       }
1398    }
1399 
1400    ntt_MOV(c, ntt_get_ssa_def_decl(c, def), src);
1401 }
1402 
1403 static void
1404 ntt_store(struct ntt_compile *c, nir_def *def, struct ureg_src src)
1405 {
1406    nir_legacy_dest chased = nir_legacy_chase_dest(def);
1407 
1408    if (chased.is_ssa)
1409       ntt_store_def(c, chased.ssa, src);
1410    else {
1411       struct ureg_dst dst = ntt_get_chased_dest(c, &chased);
1412       ntt_MOV(c, dst, src);
1413    }
1414 }
1415 
1416 static void
1417 ntt_emit_scalar(struct ntt_compile *c, unsigned tgsi_op,
1418                 struct ureg_dst dst,
1419                 struct ureg_src src0,
1420                 struct ureg_src src1)
1421 {
1422    unsigned i;
1423 
1424    /* POW is the only 2-operand scalar op. */
1425    if (tgsi_op != TGSI_OPCODE_POW)
1426       src1 = src0;
1427 
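   /* (Illustrative: the loop below splats one source channel per enabled dst
    * channel, e.g. RCP with dst.WriteMask == xy emits two instructions:
    * RCP dst.x, src0.xxxx and RCP dst.y, src0.yyyy.)
    */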
1428    for (i = 0; i < 4; i++) {
1429       if (dst.WriteMask & (1 << i)) {
1430          ntt_insn(c, tgsi_op,
1431                   ureg_writemask(dst, 1 << i),
1432                   ureg_scalar(src0, i),
1433                   ureg_scalar(src1, i),
1434                   ureg_src_undef(), ureg_src_undef());
1435       }
1436    }
1437 }
1438 
1439 static void
1440 ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
1441 {
1442    struct ureg_src src[4];
1443    struct ureg_dst dst;
1444    unsigned i;
1445    int dst_64 = instr->def.bit_size == 64;
1446    int src_64 = nir_src_bit_size(instr->src[0].src) == 64;
1447    int num_srcs = nir_op_infos[instr->op].num_inputs;
1448 
1449    /* Don't try to translate a folded fsat, since its source won't be valid. */
1450    if (instr->op == nir_op_fsat && nir_legacy_fsat_folds(instr))
1451       return;
1452 
1453    c->precise = instr->exact;
1454 
1455    assert(num_srcs <= ARRAY_SIZE(src));
1456    for (i = 0; i < num_srcs; i++)
1457       src[i] = ntt_get_alu_src(c, instr, i);
1458    for (; i < ARRAY_SIZE(src); i++)
1459       src[i] = ureg_src_undef();
1460 
1461    dst = ntt_get_alu_dest(c, &instr->def);
1462 
1463    static enum tgsi_opcode op_map[][2] = {
1464       [nir_op_mov] = { TGSI_OPCODE_MOV, TGSI_OPCODE_MOV },
1465 
1466       /* fabs/fneg 32-bit are special-cased below. */
1467       [nir_op_fabs] = { 0, TGSI_OPCODE_DABS },
1468       [nir_op_fneg] = { 0, TGSI_OPCODE_DNEG },
1469 
1470       [nir_op_fdot2] = { TGSI_OPCODE_DP2 },
1471       [nir_op_fdot3] = { TGSI_OPCODE_DP3 },
1472       [nir_op_fdot4] = { TGSI_OPCODE_DP4 },
1473       [nir_op_fdot2_replicated] = { TGSI_OPCODE_DP2 },
1474       [nir_op_fdot3_replicated] = { TGSI_OPCODE_DP3 },
1475       [nir_op_fdot4_replicated] = { TGSI_OPCODE_DP4 },
1476       [nir_op_ffloor] = { TGSI_OPCODE_FLR, TGSI_OPCODE_DFLR },
1477       [nir_op_ffract] = { TGSI_OPCODE_FRC, TGSI_OPCODE_DFRAC },
1478       [nir_op_fceil] = { TGSI_OPCODE_CEIL, TGSI_OPCODE_DCEIL },
1479       [nir_op_fround_even] = { TGSI_OPCODE_ROUND, TGSI_OPCODE_DROUND },
1480       [nir_op_fdiv] = { TGSI_OPCODE_DIV, TGSI_OPCODE_DDIV },
1481       [nir_op_idiv] = { TGSI_OPCODE_IDIV, TGSI_OPCODE_I64DIV },
1482       [nir_op_udiv] = { TGSI_OPCODE_UDIV, TGSI_OPCODE_U64DIV },
1483 
1484       [nir_op_frcp] = { 0, TGSI_OPCODE_DRCP },
1485       [nir_op_frsq] = { 0, TGSI_OPCODE_DRSQ },
1486       [nir_op_fsqrt] = { 0, TGSI_OPCODE_DSQRT },
1487 
1488       /* The conversions will have one combination of src and dst bitsize. */
1489       [nir_op_f2f32] = { 0, TGSI_OPCODE_D2F },
1490       [nir_op_f2f64] = { TGSI_OPCODE_F2D },
1491       [nir_op_i2i64] = { TGSI_OPCODE_I2I64 },
1492 
1493       [nir_op_f2i32] = { TGSI_OPCODE_F2I, TGSI_OPCODE_D2I },
1494       [nir_op_f2i64] = { TGSI_OPCODE_F2I64, TGSI_OPCODE_D2I64 },
1495       [nir_op_f2u32] = { TGSI_OPCODE_F2U, TGSI_OPCODE_D2U },
1496       [nir_op_f2u64] = { TGSI_OPCODE_F2U64, TGSI_OPCODE_D2U64 },
1497       [nir_op_i2f32] = { TGSI_OPCODE_I2F, TGSI_OPCODE_I642F },
1498       [nir_op_i2f64] = { TGSI_OPCODE_I2D, TGSI_OPCODE_I642D },
1499       [nir_op_u2f32] = { TGSI_OPCODE_U2F, TGSI_OPCODE_U642F },
1500       [nir_op_u2f64] = { TGSI_OPCODE_U2D, TGSI_OPCODE_U642D },
1501 
1502       [nir_op_slt] = { TGSI_OPCODE_SLT },
1503       [nir_op_sge] = { TGSI_OPCODE_SGE },
1504       [nir_op_seq] = { TGSI_OPCODE_SEQ },
1505       [nir_op_sne] = { TGSI_OPCODE_SNE },
1506 
1507       [nir_op_flt32] = { TGSI_OPCODE_FSLT, TGSI_OPCODE_DSLT },
1508       [nir_op_fge32] = { TGSI_OPCODE_FSGE, TGSI_OPCODE_DSGE },
1509       [nir_op_feq32] = { TGSI_OPCODE_FSEQ, TGSI_OPCODE_DSEQ },
1510       [nir_op_fneu32] = { TGSI_OPCODE_FSNE, TGSI_OPCODE_DSNE },
1511 
1512       [nir_op_ilt32] = { TGSI_OPCODE_ISLT, TGSI_OPCODE_I64SLT },
1513       [nir_op_ige32] = { TGSI_OPCODE_ISGE, TGSI_OPCODE_I64SGE },
1514       [nir_op_ieq32] = { TGSI_OPCODE_USEQ, TGSI_OPCODE_U64SEQ },
1515       [nir_op_ine32] = { TGSI_OPCODE_USNE, TGSI_OPCODE_U64SNE },
1516 
1517       [nir_op_ult32] = { TGSI_OPCODE_USLT, TGSI_OPCODE_U64SLT },
1518       [nir_op_uge32] = { TGSI_OPCODE_USGE, TGSI_OPCODE_U64SGE },
1519 
1520       [nir_op_iabs] = { TGSI_OPCODE_IABS, TGSI_OPCODE_I64ABS },
1521       [nir_op_ineg] = { TGSI_OPCODE_INEG, TGSI_OPCODE_I64NEG },
1522       [nir_op_fsign] = { TGSI_OPCODE_SSG, TGSI_OPCODE_DSSG },
1523       [nir_op_isign] = { TGSI_OPCODE_ISSG, TGSI_OPCODE_I64SSG },
1524       [nir_op_ftrunc] = { TGSI_OPCODE_TRUNC, TGSI_OPCODE_DTRUNC },
1525       [nir_op_fddx] = { TGSI_OPCODE_DDX },
1526       [nir_op_fddy] = { TGSI_OPCODE_DDY },
1527       [nir_op_fddx_coarse] = { TGSI_OPCODE_DDX },
1528       [nir_op_fddy_coarse] = { TGSI_OPCODE_DDY },
1529       [nir_op_fddx_fine] = { TGSI_OPCODE_DDX_FINE },
1530       [nir_op_fddy_fine] = { TGSI_OPCODE_DDY_FINE },
1531       [nir_op_pack_half_2x16] = { TGSI_OPCODE_PK2H },
1532       [nir_op_unpack_half_2x16] = { TGSI_OPCODE_UP2H },
1533       [nir_op_ibitfield_extract] = { TGSI_OPCODE_IBFE },
1534       [nir_op_ubitfield_extract] = { TGSI_OPCODE_UBFE },
1535       [nir_op_bitfield_insert] = { TGSI_OPCODE_BFI },
1536       [nir_op_bitfield_reverse] = { TGSI_OPCODE_BREV },
1537       [nir_op_bit_count] = { TGSI_OPCODE_POPC },
1538       [nir_op_ifind_msb] = { TGSI_OPCODE_IMSB },
1539       [nir_op_ufind_msb] = { TGSI_OPCODE_UMSB },
1540       [nir_op_find_lsb] = { TGSI_OPCODE_LSB },
1541       [nir_op_fadd] = { TGSI_OPCODE_ADD, TGSI_OPCODE_DADD },
1542       [nir_op_iadd] = { TGSI_OPCODE_UADD, TGSI_OPCODE_U64ADD },
1543       [nir_op_fmul] = { TGSI_OPCODE_MUL, TGSI_OPCODE_DMUL },
1544       [nir_op_imul] = { TGSI_OPCODE_UMUL, TGSI_OPCODE_U64MUL },
1545       [nir_op_imod] = { TGSI_OPCODE_MOD, TGSI_OPCODE_I64MOD },
1546       [nir_op_umod] = { TGSI_OPCODE_UMOD, TGSI_OPCODE_U64MOD },
1547       [nir_op_imul_high] = { TGSI_OPCODE_IMUL_HI },
1548       [nir_op_umul_high] = { TGSI_OPCODE_UMUL_HI },
1549       [nir_op_ishl] = { TGSI_OPCODE_SHL, TGSI_OPCODE_U64SHL },
1550       [nir_op_ishr] = { TGSI_OPCODE_ISHR, TGSI_OPCODE_I64SHR },
1551       [nir_op_ushr] = { TGSI_OPCODE_USHR, TGSI_OPCODE_U64SHR },
1552 
1553       /* These bitwise ops don't care about 32 vs 64 types, so they have the
1554        * same TGSI op.
1555        */
1556       [nir_op_inot] = { TGSI_OPCODE_NOT, TGSI_OPCODE_NOT },
1557       [nir_op_iand] = { TGSI_OPCODE_AND, TGSI_OPCODE_AND },
1558       [nir_op_ior] = { TGSI_OPCODE_OR, TGSI_OPCODE_OR },
1559       [nir_op_ixor] = { TGSI_OPCODE_XOR, TGSI_OPCODE_XOR },
1560 
1561       [nir_op_fmin] = { TGSI_OPCODE_MIN, TGSI_OPCODE_DMIN },
1562       [nir_op_imin] = { TGSI_OPCODE_IMIN, TGSI_OPCODE_I64MIN },
1563       [nir_op_umin] = { TGSI_OPCODE_UMIN, TGSI_OPCODE_U64MIN },
1564       [nir_op_fmax] = { TGSI_OPCODE_MAX, TGSI_OPCODE_DMAX },
1565       [nir_op_imax] = { TGSI_OPCODE_IMAX, TGSI_OPCODE_I64MAX },
1566       [nir_op_umax] = { TGSI_OPCODE_UMAX, TGSI_OPCODE_U64MAX },
1567       [nir_op_ffma] = { TGSI_OPCODE_MAD, TGSI_OPCODE_DMAD },
1568       [nir_op_ldexp] = { TGSI_OPCODE_LDEXP, 0 },
1569    };
1570 
1571    if (src_64 && !dst_64) {
1572       if (num_srcs == 2 || nir_op_infos[instr->op].output_type == nir_type_bool32) {
1573          /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz instead
1574           * of .xy.
1575           */
1576          assert(!(dst.WriteMask & TGSI_WRITEMASK_YW));
1577       } else {
1578          /* TGSI 64bit-to-32-bit conversions only generate results in the .xy
1579           * channels and will need to get fixed up.
1580           */
1581          assert(!(dst.WriteMask & TGSI_WRITEMASK_ZW));
1582       }
1583    }
1584 
1585    bool table_op64 = src_64;
1586    if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op][table_op64] != 0) {
1587       /* The normal path for NIR to TGSI ALU op translation */
1588       ntt_insn(c, op_map[instr->op][table_op64],
1589                 dst, src[0], src[1], src[2], src[3]);
1590    } else {
1591       /* Special cases for NIR to TGSI ALU op translation. */
1592 
1593       /* TODO: Use something like the ntt_store() path for the MOV calls so we
1594        * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
1595        */
1596 
1597       switch (instr->op) {
1598       case nir_op_u2u64:
1599          ntt_AND(c, dst, ureg_swizzle(src[0],
1600                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1601                                              TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1602                   ureg_imm4u(c->ureg, ~0, 0, ~0, 0));
1603          break;
1604 
1605       case nir_op_i2i32:
1606       case nir_op_u2u32:
1607          assert(src_64);
1608          ntt_MOV(c, dst, ureg_swizzle(src[0],
1609                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
1610                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_X));
1611          break;
1612 
1613       case nir_op_fabs:
1614          /* Try to eliminate */
1615          if (!c->options->lower_fabs && nir_legacy_float_mod_folds(instr))
1616             break;
1617 
1618          if (c->options->lower_fabs)
1619             ntt_MAX(c, dst, src[0], ureg_negate(src[0]));
1620          else
1621             ntt_MOV(c, dst, ureg_abs(src[0]));
1622          break;
1623 
1624       case nir_op_fsat:
1625          if (dst_64) {
1626             ntt_MIN(c, dst, src[0], ntt_64bit_1f(c));
1627             ntt_MAX(c, dst, ureg_src(dst), ureg_imm1u(c->ureg, 0));
1628          } else {
1629             ntt_MOV(c, ureg_saturate(dst), src[0]);
1630          }
1631          break;
1632 
1633       case nir_op_fneg:
1634          /* Try to eliminate */
1635          if (nir_legacy_float_mod_folds(instr))
1636             break;
1637 
1638          ntt_MOV(c, dst, ureg_negate(src[0]));
1639          break;
1640 
1641          /* NOTE: TGSI 32-bit math ops have the old "one source channel
1642           * replicated to all dst channels" behavior, while the 64-bit ops use
1643           * the normal mapping of src channels to dst.
1644           */
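         /* (Illustrative: a plain 32-bit RCP computes 1/src.x and replicates
          * it to every written channel, which is why the cases below go
          * through ntt_emit_scalar() to get per-channel results.)
          */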
1645       case nir_op_frcp:
1646          assert(!dst_64);
1647          ntt_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], ureg_src_undef());
1648          break;
1649 
1650       case nir_op_frsq:
1651          assert(!dst_64);
1652          ntt_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], ureg_src_undef());
1653          break;
1654 
1655       case nir_op_fsqrt:
1656          assert(!dst_64);
1657          ntt_emit_scalar(c, TGSI_OPCODE_SQRT, dst, src[0], ureg_src_undef());
1658          break;
1659 
1660       case nir_op_fexp2:
1661          assert(!dst_64);
1662          ntt_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], ureg_src_undef());
1663          break;
1664 
1665       case nir_op_flog2:
1666          assert(!dst_64);
1667          ntt_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], ureg_src_undef());
1668          break;
1669 
1670       case nir_op_b2f32:
1671          ntt_AND(c, dst, src[0], ureg_imm1f(c->ureg, 1.0));
1672          break;
1673 
1674       case nir_op_b2f64:
1675          ntt_AND(c, dst,
1676                   ureg_swizzle(src[0],
1677                                TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1678                                TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1679                   ntt_64bit_1f(c));
1680          break;
1681 
1682       case nir_op_b2i32:
1683          ntt_AND(c, dst, src[0], ureg_imm1u(c->ureg, 1));
1684          break;
1685 
1686       case nir_op_b2i64:
1687          ntt_AND(c, dst,
1688                   ureg_swizzle(src[0],
1689                                TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1690                                TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1691                   ureg_imm4u(c->ureg, 1, 0, 1, 0));
1692          break;
1693 
1694       case nir_op_fsin:
1695          ntt_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], ureg_src_undef());
1696          break;
1697 
1698       case nir_op_fcos:
1699          ntt_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], ureg_src_undef());
1700          break;
1701 
1702       case nir_op_fsub:
1703          assert(!dst_64);
1704          ntt_ADD(c, dst, src[0], ureg_negate(src[1]));
1705          break;
1706 
1707       case nir_op_isub:
1708          assert(!dst_64);
1709          ntt_UADD(c, dst, src[0], ureg_negate(src[1]));
1710          break;
1711 
1712       case nir_op_fmod:
1713          unreachable("should be handled by .lower_fmod = true");
1714          break;
1715 
1716       case nir_op_fpow:
1717          ntt_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
1718          break;
1719 
1720       case nir_op_flrp:
1721          ntt_LRP(c, dst, src[2], src[1], src[0]);
1722          break;
1723 
1724       case nir_op_pack_64_2x32_split:
1725          ntt_MOV(c, ureg_writemask(dst, TGSI_WRITEMASK_XZ),
1726                   ureg_swizzle(src[0],
1727                                TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1728                                TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1729          ntt_MOV(c, ureg_writemask(dst, TGSI_WRITEMASK_YW),
1730                   ureg_swizzle(src[1],
1731                                TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1732                                TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1733          break;
1734 
1735       case nir_op_unpack_64_2x32_split_x:
1736          ntt_MOV(c, dst, ureg_swizzle(src[0],
1737                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
1738                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z));
1739          break;
1740 
1741       case nir_op_unpack_64_2x32_split_y:
1742          ntt_MOV(c, dst, ureg_swizzle(src[0],
1743                                              TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W,
1744                                              TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W));
1745          break;
1746 
1747       case nir_op_b32csel:
1748          if (nir_src_bit_size(instr->src[1].src) == 64) {
1749             ntt_UCMP(c, dst, ureg_swizzle(src[0],
1750                                                  TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1751                                                  TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1752                       src[1], src[2]);
1753          } else {
1754             ntt_UCMP(c, dst, src[0], src[1], src[2]);
1755          }
1756          break;
1757 
1758       case nir_op_fcsel:
1759          /* If CMP isn't supported, then the flags that enable NIR to generate
1760           * this opcode should also not be set.
1761           */
1762          assert(!c->options->lower_cmp);
1763 
1764          /* Implement this as CMP(-abs(src0), src1, src2). */
1765          ntt_CMP(c, dst, ureg_negate(ureg_abs(src[0])), src[1], src[2]);
1766          break;
1767 
1768       case nir_op_fcsel_gt:
1769          /* If CMP isn't supported, then the flags that enable NIR to generate
1770           * these opcodes should also not be set.
1771           */
1772          assert(!c->options->lower_cmp);
1773 
1774          ntt_CMP(c, dst, ureg_negate(src[0]), src[1], src[2]);
1775          break;
1776 
1777       case nir_op_fcsel_ge:
1778          /* If CMP isn't supported, then the flags that enable NIR to generate
1779           * these opcodes should also not be set.
1780           */
1781          assert(!c->options->lower_cmp);
1782 
1783          /* Implement this as if !(src0 < 0.0) was identical to src0 >= 0.0. */
1784          ntt_CMP(c, dst, src[0], src[2], src[1]);
1785          break;
1786 
1787       case nir_op_frexp_sig:
1788       case nir_op_frexp_exp:
1789          unreachable("covered by nir_lower_frexp()");
1790          break;
1791 
1792       case nir_op_ldexp:
1793          assert(dst_64); /* 32bit handled in table. */
1794          ntt_DLDEXP(c, dst, src[0],
1795                      ureg_swizzle(src[1],
1796                                   TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1797                                   TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1798          break;
1799 
1800       case nir_op_vec4:
1801       case nir_op_vec3:
1802       case nir_op_vec2:
1803          unreachable("covered by nir_lower_vec_to_movs()");
1804 
1805       default:
1806          fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
1807          unreachable("Unknown NIR opcode");
1808       }
1809    }
1810 
1811    c->precise = false;
1812 }
1813 
1814 static struct ureg_src
1815 ntt_ureg_src_indirect(struct ntt_compile *c, struct ureg_src usrc,
1816                       nir_src src, int addr_reg)
1817 {
1818    if (nir_src_is_const(src)) {
1819       usrc.Index += ntt_src_as_uint(c, src);
1820       return usrc;
1821    } else {
1822       return ureg_src_indirect(usrc, ntt_reladdr(c, ntt_get_src(c, src), addr_reg));
1823    }
1824 }
1825 
1826 static struct ureg_dst
1827 ntt_ureg_dst_indirect(struct ntt_compile *c, struct ureg_dst dst,
1828                       nir_src src)
1829 {
1830    if (nir_src_is_const(src)) {
1831       dst.Index += ntt_src_as_uint(c, src);
1832       return dst;
1833    } else {
1834       return ureg_dst_indirect(dst, ntt_reladdr(c, ntt_get_src(c, src), 0));
1835    }
1836 }
1837 
1838 static struct ureg_src
1839 ntt_ureg_src_dimension_indirect(struct ntt_compile *c, struct ureg_src usrc,
1840                          nir_src src)
1841 {
1842    if (nir_src_is_const(src)) {
1843       return ureg_src_dimension(usrc, ntt_src_as_uint(c, src));
1844    } else {
1847       return ureg_src_dimension_indirect(usrc,
1848                                          ntt_reladdr(c, ntt_get_src(c, src), 1),
1849                                          0);
1850    }
1851 }
1852 
1853 static struct ureg_dst
1854 ntt_ureg_dst_dimension_indirect(struct ntt_compile *c, struct ureg_dst udst,
1855                                 nir_src src)
1856 {
1857    if (nir_src_is_const(src)) {
1858       return ureg_dst_dimension(udst, ntt_src_as_uint(c, src));
1859    } else {
1860       return ureg_dst_dimension_indirect(udst,
1861                                          ntt_reladdr(c, ntt_get_src(c, src), 1),
1862                                          0);
1863    }
1864 }
1865 /* Some load operations in NIR will have a fractional offset that we need to
1866  * swizzle down before storing to the result register.
1867  */
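/* (Illustrative: with frac == 2 and num_components == 2, the swizzle is
 * .zwww -- the MIN2 clamps keep the unused trailing channels in range.)
 */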
1868 static struct ureg_src
1869 ntt_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
1870 {
1871    return ureg_swizzle(src,
1872                        frac,
1873                        frac + MIN2(num_components - 1, 1),
1874                        frac + MIN2(num_components - 1, 2),
1875                        frac + MIN2(num_components - 1, 3));
1876 }
1877 
1878 
1879 static void
1880 ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
1881 {
1882    int bit_size = instr->def.bit_size;
1883    assert(bit_size == 32 || instr->num_components <= 2);
1884 
1885    struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);
1886 
1887    struct ureg_dst addr_temp = ureg_dst_undef();
1888 
1889    if (nir_src_is_const(instr->src[0])) {
1890       src = ureg_src_dimension(src, ntt_src_as_uint(c, instr->src[0]));
1891    } else {
1892       /* virglrenderer requires that indirect UBO references have the UBO
1893        * array's base index in the Index field, not added to the indirect
1894        * address.
1895        *
1896        * Many nir intrinsics have a base address const value for the start of
1897        * their array indirection, but load_ubo doesn't.  We fake it by
1898        * subtracting it off here.
1899        */
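      /* (Illustrative: with first_ubo == 1 and a dynamic NIR buffer index b,
       * this emits UADD temp, b, -1 and then uses temp as the dimension
       * indirect on top of base index 1, so the referenced buffer is still b
       * while the base stays in the Index field as virglrenderer expects.)
       */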
1900       addr_temp = ntt_temp(c);
1901       ntt_UADD(c, addr_temp, ntt_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo));
1902       src = ureg_src_dimension_indirect(src,
1903                                          ntt_reladdr(c, ureg_src(addr_temp), 1),
1904                                          c->first_ubo);
1905    }
1906 
1907    if (instr->intrinsic == nir_intrinsic_load_ubo_vec4) {
1908       /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const
1909        * file.
1910        */
1911       src.Index = nir_intrinsic_base(instr);
1912 
1913       if (nir_src_is_const(instr->src[1])) {
1914          src.Index += ntt_src_as_uint(c, instr->src[1]);
1915       } else {
1916          src = ureg_src_indirect(src, ntt_reladdr(c, ntt_get_src(c, instr->src[1]), 0));
1917       }
1918 
1919       int start_component = nir_intrinsic_component(instr);
1920       if (bit_size == 64)
1921          start_component *= 2;
1922 
1923       src = ntt_shift_by_frac(src, start_component,
1924                               instr->num_components * bit_size / 32);
1925 
1926       ntt_store(c, &instr->def, src);
1927    } else {
1928       /* PIPE_CAP_LOAD_CONSTBUF: Not necessarily vec4 aligned, emit a
1929        * TGSI_OPCODE_LOAD instruction from the const file.
1930        */
1931       struct ntt_insn *insn =
1932          ntt_insn(c, TGSI_OPCODE_LOAD,
1933                   ntt_get_dest(c, &instr->def),
1934                   src, ntt_get_src(c, instr->src[1]),
1935                   ureg_src_undef(), ureg_src_undef());
1936       insn->is_mem = true;
1937       insn->tex_target = 0;
1938       insn->mem_qualifier = 0;
1939       insn->mem_format = 0; /* unused */
1940    }
1941 }
1942 
1943 static unsigned
1944 ntt_get_access_qualifier(nir_intrinsic_instr *instr)
1945 {
1946    enum gl_access_qualifier access = nir_intrinsic_access(instr);
1947    unsigned qualifier = 0;
1948 
1949    if (access & ACCESS_COHERENT)
1950       qualifier |= TGSI_MEMORY_COHERENT;
1951    if (access & ACCESS_VOLATILE)
1952       qualifier |= TGSI_MEMORY_VOLATILE;
1953    if (access & ACCESS_RESTRICT)
1954       qualifier |= TGSI_MEMORY_RESTRICT;
1955 
1956    return qualifier;
1957 }
1958 
1959 static unsigned
1960 ntt_translate_atomic_op(nir_atomic_op op)
1961 {
1962    switch (op) {
1963    case nir_atomic_op_iadd: return TGSI_OPCODE_ATOMUADD;
1964    case nir_atomic_op_fadd: return TGSI_OPCODE_ATOMFADD;
1965    case nir_atomic_op_imin: return TGSI_OPCODE_ATOMIMIN;
1966    case nir_atomic_op_imax: return TGSI_OPCODE_ATOMIMAX;
1967    case nir_atomic_op_umin: return TGSI_OPCODE_ATOMUMIN;
1968    case nir_atomic_op_umax: return TGSI_OPCODE_ATOMUMAX;
1969    case nir_atomic_op_iand: return TGSI_OPCODE_ATOMAND;
1970    case nir_atomic_op_ixor: return TGSI_OPCODE_ATOMXOR;
1971    case nir_atomic_op_ior:  return TGSI_OPCODE_ATOMOR;
1972    case nir_atomic_op_xchg: return TGSI_OPCODE_ATOMXCHG;
1973    default: unreachable("invalid atomic");
1974    }
1975 }
1976 
1977 static void
1978 ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
1979              nir_variable_mode mode)
1980 {
1981    bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
1982                     instr->intrinsic == nir_intrinsic_store_shared);
1983    bool is_load = (instr->intrinsic == nir_intrinsic_atomic_counter_read ||
1984                     instr->intrinsic == nir_intrinsic_load_ssbo ||
1985                     instr->intrinsic == nir_intrinsic_load_shared);
1986    unsigned opcode;
1987    struct ureg_src src[4];
1988    int num_src = 0;
1989    int next_src;
1990    struct ureg_dst addr_temp = ureg_dst_undef();
1991 
1992    struct ureg_src memory;
1993    switch (mode) {
1994    case nir_var_mem_ssbo:
1995       memory = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_BUFFER,
1996                                                           c->first_ssbo),
1997                                      instr->src[is_store ? 1 : 0], 2);
1998       next_src = 1;
1999       break;
2000    case nir_var_mem_shared:
2001       memory = ureg_src_register(TGSI_FILE_MEMORY, 0);
2002       next_src = 0;
2003       break;
2004    case nir_var_uniform: { /* HW atomic buffers */
2005       nir_src src = instr->src[0];
2006       uint32_t offset = (ntt_extract_const_src_offset(&src) +
2007                          nir_intrinsic_range_base(instr)) / 4;
2008 
2009       memory = ureg_src_register(TGSI_FILE_HW_ATOMIC, offset);
2010       /* ntt_ureg_src_indirect, except dividing by 4 */
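      /* (Illustrative: a constant byte offset of 8 folds into memory.Index as
       * slot 2 (8 / 4); a dynamic offset is divided by 4 via the USHR below
       * and applied through an address register instead.)
       */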
2011       if (nir_src_is_const(src)) {
2012          memory.Index += nir_src_as_uint(src) / 4;
2013       } else {
2014          addr_temp = ntt_temp(c);
2015          ntt_USHR(c, addr_temp, ntt_get_src(c, src), ureg_imm1i(c->ureg, 2));
2016          memory = ureg_src_indirect(memory, ntt_reladdr(c, ureg_src(addr_temp), 2));
2017       }
2018       memory = ureg_src_dimension(memory, nir_intrinsic_base(instr));
2019       next_src = 0;
2020       break;
2021    }
2022 
2023    default:
2024       unreachable("unknown memory type");
2025    }
2026 
2027    if (is_store) {
2028       src[num_src++] = ntt_get_src(c, instr->src[next_src + 1]); /* offset */
2029       src[num_src++] = ntt_get_src(c, instr->src[0]); /* value */
2030    } else {
2031       src[num_src++] = memory;
2032       if (instr->intrinsic != nir_intrinsic_get_ssbo_size) {
2033          src[num_src++] = ntt_get_src(c, instr->src[next_src++]); /* offset */
2034          switch (instr->intrinsic) {
2035          case nir_intrinsic_atomic_counter_inc:
2036             src[num_src++] = ureg_imm1i(c->ureg, 1);
2037             break;
2038          case nir_intrinsic_atomic_counter_post_dec:
2039             src[num_src++] = ureg_imm1i(c->ureg, -1);
2040             break;
2041          default:
2042             if (!is_load)
2043                src[num_src++] = ntt_get_src(c, instr->src[next_src++]); /* value */
2044             break;
2045          }
2046       }
2047    }
2048 
2049 
2050    switch (instr->intrinsic) {
2051    case nir_intrinsic_ssbo_atomic:
2052    case nir_intrinsic_shared_atomic:
2053       opcode = ntt_translate_atomic_op(nir_intrinsic_atomic_op(instr));
2054       break;
2055    case nir_intrinsic_atomic_counter_add:
2056    case nir_intrinsic_atomic_counter_inc:
2057    case nir_intrinsic_atomic_counter_post_dec:
2058       opcode = TGSI_OPCODE_ATOMUADD;
2059       break;
2060    case nir_intrinsic_atomic_counter_min:
2061       opcode = TGSI_OPCODE_ATOMIMIN;
2062       break;
2063    case nir_intrinsic_atomic_counter_max:
2064       opcode = TGSI_OPCODE_ATOMIMAX;
2065       break;
2066    case nir_intrinsic_atomic_counter_and:
2067       opcode = TGSI_OPCODE_ATOMAND;
2068       break;
2069    case nir_intrinsic_atomic_counter_or:
2070       opcode = TGSI_OPCODE_ATOMOR;
2071       break;
2072    case nir_intrinsic_atomic_counter_xor:
2073       opcode = TGSI_OPCODE_ATOMXOR;
2074       break;
2075    case nir_intrinsic_atomic_counter_exchange:
2076       opcode = TGSI_OPCODE_ATOMXCHG;
2077       break;
2078    case nir_intrinsic_atomic_counter_comp_swap:
2079    case nir_intrinsic_ssbo_atomic_swap:
2080    case nir_intrinsic_shared_atomic_swap:
2081       opcode = TGSI_OPCODE_ATOMCAS;
2082       src[num_src++] = ntt_get_src(c, instr->src[next_src++]);
2083       break;
2084    case nir_intrinsic_atomic_counter_read:
2085    case nir_intrinsic_load_ssbo:
2086    case nir_intrinsic_load_shared:
2087       opcode = TGSI_OPCODE_LOAD;
2088       break;
2089    case nir_intrinsic_store_ssbo:
2090    case nir_intrinsic_store_shared:
2091       opcode = TGSI_OPCODE_STORE;
2092       break;
2093    case nir_intrinsic_get_ssbo_size:
2094       opcode = TGSI_OPCODE_RESQ;
2095       break;
2096    default:
2097       unreachable("unknown memory op");
2098    }
2099 
2100    unsigned qualifier = 0;
2101    if (mode == nir_var_mem_ssbo &&
2102        instr->intrinsic != nir_intrinsic_get_ssbo_size) {
2103       qualifier = ntt_get_access_qualifier(instr);
2104    }
2105 
2106    struct ureg_dst dst;
2107    if (is_store) {
2108       dst = ureg_dst(memory);
2109 
2110       unsigned write_mask = nir_intrinsic_write_mask(instr);
2111       if (nir_src_bit_size(instr->src[0]) == 64)
2112          write_mask = ntt_64bit_write_mask(write_mask);
2113       dst = ureg_writemask(dst, write_mask);
2114    } else {
2115       dst = ntt_get_dest(c, &instr->def);
2116    }
2117 
2118    struct ntt_insn *insn = ntt_insn(c, opcode, dst, src[0], src[1], src[2], src[3]);
2119    insn->tex_target = TGSI_TEXTURE_BUFFER;
2120    insn->mem_qualifier = qualifier;
2121    insn->mem_format = 0; /* unused */
2122    insn->is_mem = true;
2123 }
2124 
2125 static void
2126 ntt_emit_image_load_store(struct ntt_compile *c, nir_intrinsic_instr *instr)
2127 {
2128    unsigned op;
2129    struct ureg_src srcs[4];
2130    int num_src = 0;
2131    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
2132    bool is_array = nir_intrinsic_image_array(instr);
2133 
2134    struct ureg_dst temp = ureg_dst_undef();
2135 
2136    enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(dim, is_array, false);
2137 
2138    struct ureg_src resource;
2139    switch (instr->intrinsic) {
2140    case nir_intrinsic_bindless_image_load:
2141    case nir_intrinsic_bindless_image_store:
2142    case nir_intrinsic_bindless_image_size:
2143    case nir_intrinsic_bindless_image_samples:
2144    case nir_intrinsic_bindless_image_atomic:
2145    case nir_intrinsic_bindless_image_atomic_swap:
2146       resource = ntt_get_src(c, instr->src[0]);
2147       break;
2148    default:
2149       resource = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_IMAGE, 0),
2150                                        instr->src[0], 2);
2151       resource.Index += nir_intrinsic_range_base(instr);
2152    }
2153 
2154    struct ureg_dst dst;
2155    if (instr->intrinsic == nir_intrinsic_image_store ||
2156        instr->intrinsic == nir_intrinsic_bindless_image_store) {
2157       dst = ureg_dst(resource);
2158    } else {
2159       srcs[num_src++] = resource;
2160       dst = ntt_get_dest(c, &instr->def);
2161    }
2162    struct ureg_dst opcode_dst = dst;
2163 
2164    if (instr->intrinsic != nir_intrinsic_image_size &&
2165        instr->intrinsic != nir_intrinsic_image_samples &&
2166        instr->intrinsic != nir_intrinsic_bindless_image_size &&
2167        instr->intrinsic != nir_intrinsic_bindless_image_samples) {
2168       struct ureg_src coord = ntt_get_src(c, instr->src[1]);
2169 
2170       if (dim == GLSL_SAMPLER_DIM_MS) {
2171          temp = ntt_temp(c);
2172          ntt_MOV(c, temp, coord);
2173          ntt_MOV(c, ureg_writemask(temp, TGSI_WRITEMASK_W),
2174                   ureg_scalar(ntt_get_src(c, instr->src[2]), TGSI_SWIZZLE_X));
2175          coord = ureg_src(temp);
2176       }
2177       srcs[num_src++] = coord;
2178 
2179       if (instr->intrinsic != nir_intrinsic_image_load &&
2180           instr->intrinsic != nir_intrinsic_bindless_image_load) {
2181          srcs[num_src++] = ntt_get_src(c, instr->src[3]); /* data */
2182          if (instr->intrinsic == nir_intrinsic_image_atomic_swap ||
2183              instr->intrinsic == nir_intrinsic_bindless_image_atomic_swap)
2184             srcs[num_src++] = ntt_get_src(c, instr->src[4]); /* data2 */
2185       }
2186    }
2187 
2188    switch (instr->intrinsic) {
2189    case nir_intrinsic_image_load:
2190    case nir_intrinsic_bindless_image_load:
2191       op = TGSI_OPCODE_LOAD;
2192       break;
2193    case nir_intrinsic_image_store:
2194    case nir_intrinsic_bindless_image_store:
2195       op = TGSI_OPCODE_STORE;
2196       break;
2197    case nir_intrinsic_image_size:
2198    case nir_intrinsic_bindless_image_size:
2199       op = TGSI_OPCODE_RESQ;
2200       break;
2201    case nir_intrinsic_image_samples:
2202    case nir_intrinsic_bindless_image_samples:
2203       op = TGSI_OPCODE_RESQ;
2204       opcode_dst = ureg_writemask(ntt_temp(c), TGSI_WRITEMASK_W);
2205       break;
2206    case nir_intrinsic_image_atomic:
2207    case nir_intrinsic_bindless_image_atomic:
2208       op = ntt_translate_atomic_op(nir_intrinsic_atomic_op(instr));
2209       break;
2210    case nir_intrinsic_image_atomic_swap:
2211    case nir_intrinsic_bindless_image_atomic_swap:
2212       op = TGSI_OPCODE_ATOMCAS;
2213       break;
2214    default:
2215       unreachable("bad op");
2216    }
2217 
2218    struct ntt_insn *insn = ntt_insn(c, op, opcode_dst, srcs[0], srcs[1], srcs[2], srcs[3]);
2219    insn->tex_target = target;
2220    insn->mem_qualifier = ntt_get_access_qualifier(instr);
2221    insn->mem_format = nir_intrinsic_format(instr);
2222    insn->is_mem = true;
2223 
2224    if (instr->intrinsic == nir_intrinsic_image_samples ||
2225        instr->intrinsic == nir_intrinsic_bindless_image_samples)
2226       ntt_MOV(c, dst, ureg_scalar(ureg_src(opcode_dst), 3));
2227 }
2228 
2229 static void
2230 ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr)
2231 {
2232    uint32_t frac = nir_intrinsic_component(instr);
2233    uint32_t num_components = instr->num_components;
2234    unsigned base = nir_intrinsic_base(instr);
2235    struct ureg_src input;
2236    nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
2237    bool is_64 = instr->def.bit_size == 64;
2238 
2239    if (c->s->info.stage == MESA_SHADER_VERTEX) {
2240       input = ureg_DECL_vs_input(c->ureg, base);
2241       for (int i = 1; i < semantics.num_slots; i++)
2242          ureg_DECL_vs_input(c->ureg, base + i);
2243    } else if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
2244       unsigned semantic_name, semantic_index;
2245       ntt_get_gl_varying_semantic(c, semantics.location,
2246                                   &semantic_name, &semantic_index);
2247 
2248       /* XXX: ArrayID is used in r600 gs inputs */
2249       uint32_t array_id = 0;
2250 
2251       input = ureg_DECL_input_layout(c->ureg,
2252                                      semantic_name,
2253                                      semantic_index,
2254                                      base,
2255                                      ntt_tgsi_usage_mask(frac,
2256                                                          instr->num_components,
2257                                                          is_64),
2258                                      array_id,
2259                                      semantics.num_slots);
2260    } else {
2261       input = c->input_index_map[base];
2262    }
2263 
2264    if (is_64)
2265       num_components *= 2;
2266 
2267    input = ntt_shift_by_frac(input, frac, num_components);
2268 
2269    switch (instr->intrinsic) {
2270    case nir_intrinsic_load_input:
2271       input = ntt_ureg_src_indirect(c, input, instr->src[0], 0);
2272       ntt_store(c, &instr->def, input);
2273       break;
2274 
2275    case nir_intrinsic_load_per_vertex_input:
2276       input = ntt_ureg_src_indirect(c, input, instr->src[1], 0);
2277       input = ntt_ureg_src_dimension_indirect(c, input, instr->src[0]);
2278       ntt_store(c, &instr->def, input);
2279       break;
2280 
2281    case nir_intrinsic_load_interpolated_input: {
2282       input = ntt_ureg_src_indirect(c, input, instr->src[1], 0);
2283 
2284       nir_intrinsic_instr *bary_instr =
2285          nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
2286 
2287       switch (bary_instr->intrinsic) {
2288       case nir_intrinsic_load_barycentric_pixel:
2289       case nir_intrinsic_load_barycentric_sample:
2290          /* For these, we know that the barycentric load matches the
2291           * interpolation on the input declaration, so we can use it directly.
2292           */
2293          ntt_store(c, &instr->def, input);
2294          break;
2295 
2296       case nir_intrinsic_load_barycentric_centroid:
2297          /* If the input was declared centroid, then there's no need to
2298           * emit the extra TGSI interp instruction, we can just read the
2299           * input.
2300           */
2301          if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) {
2302             ntt_store(c, &instr->def, input);
2303          } else {
2304             ntt_INTERP_CENTROID(c, ntt_get_dest(c, &instr->def), input);
2305          }
2306          break;
2307 
2308       case nir_intrinsic_load_barycentric_at_sample:
2309          /* We stored the sample in the fake "bary" dest. */
2310          ntt_INTERP_SAMPLE(c, ntt_get_dest(c, &instr->def), input,
2311                             ntt_get_src(c, instr->src[0]));
2312          break;
2313 
2314       case nir_intrinsic_load_barycentric_at_offset:
2315          /* We stored the offset in the fake "bary" dest. */
2316          ntt_INTERP_OFFSET(c, ntt_get_dest(c, &instr->def), input,
2317                             ntt_get_src(c, instr->src[0]));
2318          break;
2319 
2320       default:
2321          unreachable("bad barycentric interp intrinsic\n");
2322       }
2323       break;
2324    }
2325 
2326    default:
2327       unreachable("bad load input intrinsic\n");
2328    }
2329 }
2330 
2331 static void
2332 ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
2333 {
2334    struct ureg_src src = ntt_get_src(c, instr->src[0]);
2335 
2336    if (src.File == TGSI_FILE_OUTPUT) {
2337       /* If our src is the output file, that's an indication that we were able
2338        * to emit the output stores in the generating instructions and we have
2339        * nothing to do here.
2340        */
2341       return;
2342    }
2343 
2344    uint32_t frac;
2345    struct ureg_dst out = ntt_output_decl(c, instr, &frac);
2346 
2347    if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
2348       out = ntt_ureg_dst_indirect(c, out, instr->src[2]);
2349       out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[1]);
2350    } else {
2351       out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
2352    }
2353 
2354    uint8_t swizzle[4] = { 0, 0, 0, 0 };
2355    for (int i = frac; i < 4; i++) {
2356       if (out.WriteMask & (1 << i))
2357          swizzle[i] = i - frac;
2358    }
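   /* (Illustrative worked example: with frac == 1 and out.WriteMask == YZ the
    * swizzle ends up { 0, 0, 1, 0 }, so the MOV below writes src.x to out.y
    * and src.y to out.z.)
    */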
2359 
2360    src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2361 
2362    ntt_MOV(c, out, src);
2363 }
2364 
2365 static void
2366 ntt_emit_load_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
2367 {
2368    nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
2369 
2370    /* ntt_try_store_in_tgsi_output() optimization is not valid if normal
2371     * load_output is present.
2372     */
2373    assert(c->s->info.stage != MESA_SHADER_VERTEX &&
2374           (c->s->info.stage != MESA_SHADER_FRAGMENT || semantics.fb_fetch_output));
2375 
2376    uint32_t frac;
2377    struct ureg_dst out = ntt_output_decl(c, instr, &frac);
2378 
2379    if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) {
2380       out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
2381       out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[0]);
2382    } else {
2383       out = ntt_ureg_dst_indirect(c, out, instr->src[0]);
2384    }
2385 
2386    struct ureg_dst dst = ntt_get_dest(c, &instr->def);
2387    struct ureg_src out_src = ureg_src(out);
2388 
2389    /* Don't swizzle unavailable channels of the output into the writemasked-out
2390     * components. Avoids compile failures in virglrenderer with
2391     * TESS_LEVEL_INNER.
2392     */
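   /* (Illustrative: if only .x is written, fill_channel is X and out_src
    * becomes out.xxxx, so the unused .yzw output channels are never read.)
    */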
2393    int fill_channel = ffs(dst.WriteMask) - 1;
2394    uint8_t swizzles[4] = { 0, 1, 2, 3 };
2395    for (int i = 0; i < 4; i++)
2396       if (!(dst.WriteMask & (1 << i)))
2397          swizzles[i] = fill_channel;
2398    out_src = ureg_swizzle(out_src, swizzles[0], swizzles[1], swizzles[2], swizzles[3]);
2399 
2400    if (semantics.fb_fetch_output)
2401       ntt_FBFETCH(c, dst, out_src);
2402    else
2403       ntt_MOV(c, dst, out_src);
2404 }
2405 
2406 static void
2407 ntt_emit_load_sysval(struct ntt_compile *c, nir_intrinsic_instr *instr)
2408 {
2409    gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
2410    enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
2411    struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0);
2412 
2413    /* virglrenderer doesn't like references to channels of the sysval that
2414     * aren't defined, even if they aren't really read.  (GLSL compile fails on
2415     * gl_NumWorkGroups.w, for example).
2416     */
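   /* (Illustrative: gl_NumWorkGroups has three components, so write_mask is
    * XYZ and sv gets swizzled to .xyzx -- the undefined .w channel is never
    * referenced.)
    */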
2417    uint32_t write_mask = BITSET_MASK(instr->def.num_components);
2418    sv = ntt_swizzle_for_write_mask(sv, write_mask);
2419 
2420    /* TGSI and NIR define these intrinsics as always loading ints, but they can
2421     * still appear on hardware with non-native-integers fragment shaders using
2422     * the draw path (i915g).  In that case, having called nir_lower_int_to_float
2423     * means that we actually want floats instead.
2424     */
2425    if (!c->native_integers) {
2426       switch (instr->intrinsic) {
2427       case nir_intrinsic_load_vertex_id:
2428       case nir_intrinsic_load_instance_id:
2429          ntt_U2F(c, ntt_get_dest(c, &instr->def), sv);
2430          return;
2431 
2432       default:
2433          break;
2434       }
2435    }
2436 
2437    ntt_store(c, &instr->def, sv);
2438 }
2439 
2440 static void
2441 ntt_emit_barrier(struct ntt_compile *c, nir_intrinsic_instr *intr)
2442 {
2443    bool compute = gl_shader_stage_is_compute(c->s->info.stage);
2444 
2445    if (nir_intrinsic_memory_scope(intr) != SCOPE_NONE) {
2446       nir_variable_mode modes = nir_intrinsic_memory_modes(intr);
2447       unsigned membar = 0;
2448 
2449       if (modes & nir_var_image)
2450          membar |= TGSI_MEMBAR_SHADER_IMAGE;
2451 
2452       if (modes & nir_var_mem_shared)
2453          membar |= TGSI_MEMBAR_SHARED;
2454 
2455       /* Atomic counters are lowered to SSBOs; there's no NIR mode corresponding
2456        * exactly to atomics. Take the closest match.
2457        */
2458       if (modes & nir_var_mem_ssbo)
2459          membar |= TGSI_MEMBAR_SHADER_BUFFER | TGSI_MEMBAR_ATOMIC_BUFFER;
2460 
2461       if (modes & nir_var_mem_global)
2462          membar |= TGSI_MEMBAR_SHADER_BUFFER;
2463 
2464       /* Hack for virglrenderer: the GLSL specific memory barrier functions,
2465        * memoryBarrier{Buffer,Image,Shared,AtomicCounter}(), are only
2466        * available in compute shaders prior to GLSL 4.30.  In other stages,
2467        * it needs to use the full memoryBarrier().  It may be possible to
2468        * make them available via #extension directives in older versions,
2469        * but it's confusingly underspecified, and Mesa/virglrenderer don't
2470        * currently agree on how to do it.  So, just promote partial memory
2471        * barriers back to full ones outside of compute shaders when asked.
2472        */
2473       if (membar && !compute &&
2474           c->options->non_compute_membar_needs_all_modes) {
2475          membar |= TGSI_MEMBAR_SHADER_BUFFER |
2476                    TGSI_MEMBAR_ATOMIC_BUFFER |
2477                    TGSI_MEMBAR_SHADER_IMAGE |
2478                    TGSI_MEMBAR_SHARED;
2479       }
2480 
2481       /* If we only need workgroup scope (not device-scope), we might be able to
2482        * optimize a bit.
2483        */
2484       if (membar && compute &&
2485           nir_intrinsic_memory_scope(intr) == SCOPE_WORKGROUP) {
2486 
2487          membar |= TGSI_MEMBAR_THREAD_GROUP;
2488       }
2489 
2490       /* Only emit a memory barrier if there are any relevant modes */
2491       if (membar)
2492          ntt_MEMBAR(c, ureg_imm1u(c->ureg, membar));
2493    }
2494 
2495    if (nir_intrinsic_execution_scope(intr) != SCOPE_NONE) {
2496       assert(compute || c->s->info.stage == MESA_SHADER_TESS_CTRL);
2497       ntt_BARRIER(c);
2498    }
2499 }
2500 
2501 static void
2502 ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
2503 {
2504    switch (instr->intrinsic) {
2505    case nir_intrinsic_load_ubo:
2506    case nir_intrinsic_load_ubo_vec4:
2507       ntt_emit_load_ubo(c, instr);
2508       break;
2509 
2510       /* Vertex */
2511    case nir_intrinsic_load_vertex_id:
2512    case nir_intrinsic_load_vertex_id_zero_base:
2513    case nir_intrinsic_load_base_vertex:
2514    case nir_intrinsic_load_base_instance:
2515    case nir_intrinsic_load_instance_id:
2516    case nir_intrinsic_load_draw_id:
2517    case nir_intrinsic_load_invocation_id:
2518    case nir_intrinsic_load_frag_coord:
2519    case nir_intrinsic_load_point_coord:
2520    case nir_intrinsic_load_front_face:
2521    case nir_intrinsic_load_sample_id:
2522    case nir_intrinsic_load_sample_pos:
2523    case nir_intrinsic_load_sample_mask_in:
2524    case nir_intrinsic_load_helper_invocation:
2525    case nir_intrinsic_load_tess_coord:
2526    case nir_intrinsic_load_patch_vertices_in:
2527    case nir_intrinsic_load_primitive_id:
2528    case nir_intrinsic_load_tess_level_outer:
2529    case nir_intrinsic_load_tess_level_inner:
2530    case nir_intrinsic_load_local_invocation_id:
2531    case nir_intrinsic_load_workgroup_id:
2532    case nir_intrinsic_load_num_workgroups:
2533    case nir_intrinsic_load_workgroup_size:
2534    case nir_intrinsic_load_subgroup_size:
2535    case nir_intrinsic_load_subgroup_invocation:
2536    case nir_intrinsic_load_subgroup_eq_mask:
2537    case nir_intrinsic_load_subgroup_ge_mask:
2538    case nir_intrinsic_load_subgroup_gt_mask:
2539    case nir_intrinsic_load_subgroup_lt_mask:
2540    case nir_intrinsic_load_subgroup_le_mask:
2541       ntt_emit_load_sysval(c, instr);
2542       break;
2543 
2544    case nir_intrinsic_load_input:
2545    case nir_intrinsic_load_per_vertex_input:
2546    case nir_intrinsic_load_interpolated_input:
2547       ntt_emit_load_input(c, instr);
2548       break;
2549 
2550    case nir_intrinsic_store_output:
2551    case nir_intrinsic_store_per_vertex_output:
2552       ntt_emit_store_output(c, instr);
2553       break;
2554 
2555    case nir_intrinsic_load_output:
2556    case nir_intrinsic_load_per_vertex_output:
2557       ntt_emit_load_output(c, instr);
2558       break;
2559 
2560    case nir_intrinsic_demote:
2561       ntt_DEMOTE(c);
2562       break;
2563 
2564    case nir_intrinsic_discard:
2565       ntt_KILL(c);
2566       break;
2567 
2568    case nir_intrinsic_discard_if: {
2569       struct ureg_src cond = ureg_scalar(ntt_get_src(c, instr->src[0]), 0);
2570 
2571       if (c->native_integers) {
2572          struct ureg_dst temp = ureg_writemask(ntt_temp(c), 1);
2573          ntt_AND(c, temp, cond, ureg_imm1f(c->ureg, 1.0));
2574          ntt_KILL_IF(c, ureg_scalar(ureg_negate(ureg_src(temp)), 0));
2575       } else {
2576          /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
2577          ntt_KILL_IF(c, ureg_negate(cond));
2578       }
2579       break;
2580    }
2581 
2582    case nir_intrinsic_is_helper_invocation:
2583       ntt_READ_HELPER(c, ntt_get_dest(c, &instr->def));
2584       break;
2585 
2586    case nir_intrinsic_vote_all:
2587       ntt_VOTE_ALL(c, ntt_get_dest(c, &instr->def), ntt_get_src(c,instr->src[0]));
2588       return;
2589    case nir_intrinsic_vote_any:
2590       ntt_VOTE_ANY(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2591       return;
2592    case nir_intrinsic_vote_ieq:
2593       ntt_VOTE_EQ(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2594       return;
2595    case nir_intrinsic_ballot:
2596       ntt_BALLOT(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2597       return;
2598    case nir_intrinsic_read_first_invocation:
2599       ntt_READ_FIRST(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]));
2600       return;
2601    case nir_intrinsic_read_invocation:
2602       ntt_READ_INVOC(c, ntt_get_dest(c, &instr->def), ntt_get_src(c, instr->src[0]), ntt_get_src(c, instr->src[1]));
2603       return;
2604 
2605    case nir_intrinsic_load_ssbo:
2606    case nir_intrinsic_store_ssbo:
2607    case nir_intrinsic_ssbo_atomic:
2608    case nir_intrinsic_ssbo_atomic_swap:
2609    case nir_intrinsic_get_ssbo_size:
2610       ntt_emit_mem(c, instr, nir_var_mem_ssbo);
2611       break;
2612 
2613    case nir_intrinsic_load_shared:
2614    case nir_intrinsic_store_shared:
2615    case nir_intrinsic_shared_atomic:
2616    case nir_intrinsic_shared_atomic_swap:
2617       ntt_emit_mem(c, instr, nir_var_mem_shared);
2618       break;
2619 
2620    case nir_intrinsic_atomic_counter_read:
2621    case nir_intrinsic_atomic_counter_add:
2622    case nir_intrinsic_atomic_counter_inc:
2623    case nir_intrinsic_atomic_counter_post_dec:
2624    case nir_intrinsic_atomic_counter_min:
2625    case nir_intrinsic_atomic_counter_max:
2626    case nir_intrinsic_atomic_counter_and:
2627    case nir_intrinsic_atomic_counter_or:
2628    case nir_intrinsic_atomic_counter_xor:
2629    case nir_intrinsic_atomic_counter_exchange:
2630    case nir_intrinsic_atomic_counter_comp_swap:
2631       ntt_emit_mem(c, instr, nir_var_uniform);
2632       break;
2633    case nir_intrinsic_atomic_counter_pre_dec:
2634       unreachable("Should be lowered by ntt_lower_atomic_pre_dec()");
2635       break;
2636 
2637    case nir_intrinsic_image_load:
2638    case nir_intrinsic_image_store:
2639    case nir_intrinsic_image_size:
2640    case nir_intrinsic_image_samples:
2641    case nir_intrinsic_image_atomic:
2642    case nir_intrinsic_image_atomic_swap:
2643    case nir_intrinsic_bindless_image_load:
2644    case nir_intrinsic_bindless_image_store:
2645    case nir_intrinsic_bindless_image_size:
2646    case nir_intrinsic_bindless_image_samples:
2647    case nir_intrinsic_bindless_image_atomic:
2648    case nir_intrinsic_bindless_image_atomic_swap:
2649       ntt_emit_image_load_store(c, instr);
2650       break;
2651 
2652    case nir_intrinsic_barrier:
2653       ntt_emit_barrier(c, instr);
2654       break;
2655 
2656    case nir_intrinsic_end_primitive:
2657       ntt_ENDPRIM(c, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
2658       break;
2659 
2660    case nir_intrinsic_emit_vertex:
2661       ntt_EMIT(c, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
2662       break;
2663 
2664       /* In TGSI we don't actually generate the barycentric coords, and emit
2665        * interp intrinsics later.  However, we do need to store the
2666        * load_barycentric_at_* argument so that we can use it at that point.
2667        */
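      /* (Illustrative: the sample index or offset stored here for
       * load_barycentric_at_* is read back by ntt_emit_load_input()'s
       * INTERP_SAMPLE / INTERP_OFFSET paths above.)
       */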
2668    case nir_intrinsic_load_barycentric_pixel:
2669    case nir_intrinsic_load_barycentric_centroid:
2670    case nir_intrinsic_load_barycentric_sample:
2671       break;
2672    case nir_intrinsic_load_barycentric_at_sample:
2673    case nir_intrinsic_load_barycentric_at_offset:
2674       ntt_store(c, &instr->def, ntt_get_src(c, instr->src[0]));
2675       break;
2676 
2677    case nir_intrinsic_shader_clock:
2678       ntt_CLOCK(c, ntt_get_dest(c, &instr->def));
2679       break;
2680 
2681    case nir_intrinsic_decl_reg:
2682    case nir_intrinsic_load_reg:
2683    case nir_intrinsic_load_reg_indirect:
2684    case nir_intrinsic_store_reg:
2685    case nir_intrinsic_store_reg_indirect:
2686       /* fully consumed */
2687       break;
2688 
2689    default:
2690       fprintf(stderr, "Unknown intrinsic: ");
2691       nir_print_instr(&instr->instr, stderr);
2692       fprintf(stderr, "\n");
2693       break;
2694    }
2695 }
2696 
2697 struct ntt_tex_operand_state {
2698    struct ureg_src srcs[4];
2699    unsigned i;
2700 };
2701 
2702 static void
2703 ntt_push_tex_arg(struct ntt_compile *c,
2704                  nir_tex_instr *instr,
2705                  nir_tex_src_type tex_src_type,
2706                  struct ntt_tex_operand_state *s)
2707 {
2708    int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
2709    if (tex_src < 0)
2710       return;
2711 
2712    nir_src *src = &instr->src[tex_src].src;
2713 
2714    /* virglrenderer workaround that's hard to do in tgsi_translate: Make sure
2715     * that TG4's immediate offset arg is float-typed.
2716     */
2717    if (instr->op == nir_texop_tg4 && tex_src_type == nir_tex_src_backend2 &&
2718        nir_src_is_const(*src)) {
2719       nir_const_value *consts = nir_src_as_const_value(*src);
2720       s->srcs[s->i++] = ureg_imm4f(c->ureg,
2721                                    consts[0].f32,
2722                                    consts[1].f32,
2723                                    consts[2].f32,
2724                                    consts[3].f32);
2725       return;
2726    }
2727 
2728    s->srcs[s->i++] = ntt_get_src(c, *src);
2729 }
2730 
2731 static void
2732 ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
2733 {
2734    struct ureg_dst dst = ntt_get_dest(c, &instr->def);
2735    enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array, instr->is_shadow);
2736    unsigned tex_opcode;
2737 
2738    int tex_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_texture_handle);
2739    int sampler_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_handle);
2740 
2741    struct ureg_src sampler;
2742    if (tex_handle_src >= 0 && sampler_handle_src >= 0) {
2743       /* It seems we can't get separate tex/sampler on GL, so just use one of the handles. */
2744       sampler = ntt_get_src(c, instr->src[tex_handle_src].src);
2745       assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
2746    } else {
2747       assert(tex_handle_src == -1 && sampler_handle_src == -1);
2748       sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
2749       int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
2750       if (sampler_src >= 0) {
2751          struct ureg_src reladdr = ntt_get_src(c, instr->src[sampler_src].src);
2752          sampler = ureg_src_indirect(sampler, ntt_reladdr(c, reladdr, 2));
2753       }
2754    }
2755 
2756    switch (instr->op) {
2757    case nir_texop_tex:
2758       if (nir_tex_instr_src_size(instr, nir_tex_instr_src_index(instr, nir_tex_src_backend1)) >
2759          MAX2(instr->coord_components, 2) + instr->is_shadow)
2760          tex_opcode = TGSI_OPCODE_TXP;
2761       else
2762          tex_opcode = TGSI_OPCODE_TEX;
2763       break;
2764    case nir_texop_txf:
2765    case nir_texop_txf_ms:
2766       tex_opcode = TGSI_OPCODE_TXF;
2767 
2768       if (c->has_txf_lz) {
2769          int lod_src = nir_tex_instr_src_index(instr, nir_tex_src_lod);
2770          if (lod_src >= 0 &&
2771              nir_src_is_const(instr->src[lod_src].src) &&
2772              ntt_src_as_uint(c, instr->src[lod_src].src) == 0) {
2773             tex_opcode = TGSI_OPCODE_TXF_LZ;
2774          }
2775       }
2776       break;
2777    case nir_texop_txl:
2778       tex_opcode = TGSI_OPCODE_TXL;
2779       break;
2780    case nir_texop_txb:
2781       tex_opcode = TGSI_OPCODE_TXB;
2782       break;
2783    case nir_texop_txd:
2784       tex_opcode = TGSI_OPCODE_TXD;
2785       break;
2786    case nir_texop_txs:
2787       tex_opcode = TGSI_OPCODE_TXQ;
2788       break;
2789    case nir_texop_tg4:
2790       tex_opcode = TGSI_OPCODE_TG4;
2791       break;
2792    case nir_texop_query_levels:
2793       tex_opcode = TGSI_OPCODE_TXQ;
2794       break;
2795    case nir_texop_lod:
2796       tex_opcode = TGSI_OPCODE_LODQ;
2797       break;
2798    case nir_texop_texture_samples:
2799       tex_opcode = TGSI_OPCODE_TXQS;
2800       break;
2801    default:
2802       unreachable("unsupported tex op");
2803    }
2804 
2805    struct ntt_tex_operand_state s = { .i = 0 };
2806    ntt_push_tex_arg(c, instr, nir_tex_src_backend1, &s);
2807    ntt_push_tex_arg(c, instr, nir_tex_src_backend2, &s);
2808 
2809    /* non-coord arg for TXQ */
2810    if (tex_opcode == TGSI_OPCODE_TXQ) {
2811       ntt_push_tex_arg(c, instr, nir_tex_src_lod, &s);
2812       /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's
2813        * scalar
2814        */
2815       s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0);
2816    }
2817 
2818    if (s.i > 1) {
2819       if (tex_opcode == TGSI_OPCODE_TEX)
2820          tex_opcode = TGSI_OPCODE_TEX2;
2821       if (tex_opcode == TGSI_OPCODE_TXB)
2822          tex_opcode = TGSI_OPCODE_TXB2;
2823       if (tex_opcode == TGSI_OPCODE_TXL)
2824          tex_opcode = TGSI_OPCODE_TXL2;
2825    }
2826 
2827    if (instr->op == nir_texop_txd) {
2828       /* Derivs appear in their own src args */
2829       int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
2830       int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
2831       s.srcs[s.i++] = ntt_get_src(c, instr->src[ddx].src);
2832       s.srcs[s.i++] = ntt_get_src(c, instr->src[ddy].src);
2833    }
2834 
2835    if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
2836       if (c->screen->get_param(c->screen,
2837                                PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) {
2838          sampler = ureg_scalar(sampler, instr->component);
2839          s.srcs[s.i++] = ureg_src_undef();
2840       } else {
2841          s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
2842       }
2843    }
2844 
2845    s.srcs[s.i++] = sampler;
2846 
2847    enum tgsi_return_type tex_type;
2848    switch (instr->dest_type) {
2849    case nir_type_float32:
2850       tex_type = TGSI_RETURN_TYPE_FLOAT;
2851       break;
2852    case nir_type_int32:
2853       tex_type = TGSI_RETURN_TYPE_SINT;
2854       break;
2855    case nir_type_uint32:
2856       tex_type = TGSI_RETURN_TYPE_UINT;
2857       break;
2858    default:
2859       unreachable("unknown texture type");
2860    }
2861 
2862    struct ureg_dst tex_dst;
2863    if (instr->op == nir_texop_query_levels)
2864       tex_dst = ureg_writemask(ntt_temp(c), TGSI_WRITEMASK_W);
2865    else
2866       tex_dst = dst;
2867 
2868    while (s.i < 4)
2869       s.srcs[s.i++] = ureg_src_undef();
2870 
2871    struct ntt_insn *insn = ntt_insn(c, tex_opcode, tex_dst, s.srcs[0], s.srcs[1], s.srcs[2], s.srcs[3]);
2872    insn->tex_target = target;
2873    insn->tex_return_type = tex_type;
2874    insn->is_tex = true;
2875 
2876    int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
2877    if (tex_offset_src >= 0) {
2878       struct ureg_src offset = ntt_get_src(c, instr->src[tex_offset_src].src);
2879 
2880       insn->tex_offset[0].File = offset.File;
2881       insn->tex_offset[0].Index = offset.Index;
2882       insn->tex_offset[0].SwizzleX = offset.SwizzleX;
2883       insn->tex_offset[0].SwizzleY = offset.SwizzleY;
2884       insn->tex_offset[0].SwizzleZ = offset.SwizzleZ;
2885       insn->tex_offset[0].Padding = 0;
2886    }
2887 
2888    if (nir_tex_instr_has_explicit_tg4_offsets(instr)) {
2889       for (uint8_t i = 0; i < 4; ++i) {
2890          struct ureg_src imm = ureg_imm2i(c->ureg, instr->tg4_offsets[i][0], instr->tg4_offsets[i][1]);
2891          insn->tex_offset[i].File = imm.File;
2892          insn->tex_offset[i].Index = imm.Index;
2893          insn->tex_offset[i].SwizzleX = imm.SwizzleX;
2894          insn->tex_offset[i].SwizzleY = imm.SwizzleY;
2895          insn->tex_offset[i].SwizzleZ = imm.SwizzleZ;
2896       }
2897    }
2898 
2899    if (instr->op == nir_texop_query_levels)
2900       ntt_MOV(c, dst, ureg_scalar(ureg_src(tex_dst), 3));
2901 }
2902 
2903 static void
2904 ntt_emit_jump(struct ntt_compile *c, nir_jump_instr *jump)
2905 {
2906    switch (jump->type) {
2907    case nir_jump_break:
2908       ntt_BRK(c);
2909       break;
2910 
2911    case nir_jump_continue:
2912       ntt_CONT(c);
2913       break;
2914 
2915    default:
2916       fprintf(stderr, "Unknown jump instruction: ");
2917       nir_print_instr(&jump->instr, stderr);
2918       fprintf(stderr, "\n");
2919       abort();
2920    }
2921 }
2922 
2923 static void
2924 ntt_emit_ssa_undef(struct ntt_compile *c, nir_undef_instr *instr)
2925 {
2926    /* Nothing to do but make sure that we have some storage to deref. */
2927    (void)ntt_get_ssa_def_decl(c, &instr->def);
2928 }
2929 
2930 static void
2931 ntt_emit_instr(struct ntt_compile *c, nir_instr *instr)
2932 {
2933    switch (instr->type) {
2934    case nir_instr_type_deref:
2935       /* ignored, will be walked by nir_intrinsic_image_*_deref. */
2936       break;
2937 
2938    case nir_instr_type_alu:
2939       ntt_emit_alu(c, nir_instr_as_alu(instr));
2940       break;
2941 
2942    case nir_instr_type_intrinsic:
2943       ntt_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
2944       break;
2945 
2946    case nir_instr_type_load_const:
2947       /* Nothing to do here, as load consts are done directly from
2948        * ntt_get_src() (since many constant NIR srcs will often get folded
2949        * directly into a register file index instead of as a TGSI src).
2950        */
2951       break;
2952 
2953    case nir_instr_type_tex:
2954       ntt_emit_texture(c, nir_instr_as_tex(instr));
2955       break;
2956 
2957    case nir_instr_type_jump:
2958       ntt_emit_jump(c, nir_instr_as_jump(instr));
2959       break;
2960 
2961    case nir_instr_type_undef:
2962       ntt_emit_ssa_undef(c, nir_instr_as_undef(instr));
2963       break;
2964 
2965    default:
2966       fprintf(stderr, "Unknown NIR instr type: ");
2967       nir_print_instr(instr, stderr);
2968       fprintf(stderr, "\n");
2969       abort();
2970    }
2971 }
2972 
2973 static void
2974 ntt_emit_if(struct ntt_compile *c, nir_if *if_stmt)
2975 {
2976    if (c->native_integers)
2977       ntt_UIF(c, c->if_cond);
2978    else
2979       ntt_IF(c, c->if_cond);
2980 
2981    ntt_emit_cf_list(c, &if_stmt->then_list);
2982 
2983    if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
2984       ntt_ELSE(c);
2985       ntt_emit_cf_list(c, &if_stmt->else_list);
2986    }
2987 
2988    ntt_ENDIF(c);
2989 }
2990 
2991 static void
2992 ntt_emit_loop(struct ntt_compile *c, nir_loop *loop)
2993 {
2994    assert(!nir_loop_has_continue_construct(loop));
2995    ntt_BGNLOOP(c);
2996    ntt_emit_cf_list(c, &loop->body);
2997    ntt_ENDLOOP(c);
2998 }
2999 
3000 static void
3001 ntt_emit_block(struct ntt_compile *c, nir_block *block)
3002 {
3003    struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
3004    c->cur_block = ntt_block;
3005 
3006    nir_foreach_instr(instr, block) {
3007       ntt_emit_instr(c, instr);
3008 
3009       /* Sanity check that we didn't accidentally ureg_OPCODE() instead of ntt_OPCODE(). */
3010       if (ureg_get_instruction_number(c->ureg) != 0) {
3011          fprintf(stderr, "Emitted ureg insn during: ");
3012          nir_print_instr(instr, stderr);
3013          fprintf(stderr, "\n");
3014          unreachable("emitted ureg insn");
3015       }
3016    }
3017 
3018    /* Set up the if condition for ntt_emit_if(), which we have to do before
3019     * freeing up the temps (the "if" is treated as inside the block for liveness
3020     * purposes, despite not being an instruction)
3021     *
3022     * Note that, while IF and UIF are supposed to look at only .x, virglrenderer
3023     * looks at all of .xyzw.  No harm in working around the bug.
3024     */
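   /* For example, a condition stored in TEMP[n] is wrapped as
    * ureg_scalar(..., TGSI_SWIZZLE_X), i.e. TEMP[n].xxxx, so UIF/IF read the
    * same value no matter which channels virglrenderer actually inspects.
    */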
3025    nir_if *nif = nir_block_get_following_if(block);
3026    if (nif)
3027       c->if_cond = ureg_scalar(ntt_get_src(c, nif->condition), TGSI_SWIZZLE_X);
3028 }
3029 
3030 static void
3031 ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list)
3032 {
3033    foreach_list_typed(nir_cf_node, node, node, list) {
3034       switch (node->type) {
3035       case nir_cf_node_block:
3036          ntt_emit_block(c, nir_cf_node_as_block(node));
3037          break;
3038 
3039       case nir_cf_node_if:
3040          ntt_emit_if(c, nir_cf_node_as_if(node));
3041          break;
3042 
3043       case nir_cf_node_loop:
3044          ntt_emit_loop(c, nir_cf_node_as_loop(node));
3045          break;
3046 
3047       default:
3048          unreachable("unknown CF type");
3049       }
3050    }
3051 }
3052 
3053 static void
3054 ntt_emit_block_ureg(struct ntt_compile *c, struct nir_block *block)
3055 {
3056    struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
3057 
3058    /* Emit the ntt insns to tgsi_ureg. */
3059    util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
3060       const struct tgsi_opcode_info *opcode_info =
3061          tgsi_get_opcode_info(insn->opcode);
3062 
3063       switch (insn->opcode) {
3064       case TGSI_OPCODE_UIF:
3065          ureg_UIF(c->ureg, insn->src[0], &c->cf_label);
3066          break;
3067 
3068       case TGSI_OPCODE_IF:
3069          ureg_IF(c->ureg, insn->src[0], &c->cf_label);
3070          break;
3071 
3072       case TGSI_OPCODE_ELSE:
3073          ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
3074          ureg_ELSE(c->ureg, &c->cf_label);
3075          c->current_if_else = c->cf_label;
3076          break;
3077 
3078       case TGSI_OPCODE_ENDIF:
3079          ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
3080          ureg_ENDIF(c->ureg);
3081          break;
3082 
3083       case TGSI_OPCODE_BGNLOOP:
3084          /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
3085           * does reference BGNLOOP's.  Follow the former behavior unless something comes up
3086           * with a need.
3087           */
3088          ureg_BGNLOOP(c->ureg, &c->cf_label);
3089          break;
3090 
3091       case TGSI_OPCODE_ENDLOOP:
3092          ureg_ENDLOOP(c->ureg, &c->cf_label);
3093          break;
3094 
3095       default:
3096          if (insn->is_tex) {
3097             int num_offsets = 0;
3098             for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
3099                if (insn->tex_offset[i].File != TGSI_FILE_NULL)
3100                   num_offsets = i + 1;
3101             }
3102             ureg_tex_insn(c->ureg, insn->opcode,
3103                           insn->dst, opcode_info->num_dst,
3104                           insn->tex_target, insn->tex_return_type,
3105                           insn->tex_offset,
3106                           num_offsets,
3107                           insn->src, opcode_info->num_src);
3108          } else if (insn->is_mem) {
3109             ureg_memory_insn(c->ureg, insn->opcode,
3110                              insn->dst, opcode_info->num_dst,
3111                              insn->src, opcode_info->num_src,
3112                              insn->mem_qualifier,
3113                              insn->tex_target,
3114                              insn->mem_format);
3115          } else {
3116             ureg_insn(c->ureg, insn->opcode,
3117                      insn->dst, opcode_info->num_dst,
3118                      insn->src, opcode_info->num_src,
3119                      insn->precise);
3120          }
3121       }
3122    }
3123 }
3124 
3125 static void
3126 ntt_emit_if_ureg(struct ntt_compile *c, nir_if *if_stmt)
3127 {
3128    /* Note: the last block emitted our IF opcode. */
3129 
3130    int if_stack = c->current_if_else;
3131    c->current_if_else = c->cf_label;
3132 
3133    /* Either the then or else block includes the ENDIF, which will fix up the
3134     * IF(/ELSE)'s label for jumping
3135     */
3136    ntt_emit_cf_list_ureg(c, &if_stmt->then_list);
3137    ntt_emit_cf_list_ureg(c, &if_stmt->else_list);
3138 
3139    c->current_if_else = if_stack;
3140 }
3141 
3142 static void
3143 ntt_emit_cf_list_ureg(struct ntt_compile *c, struct exec_list *list)
3144 {
3145    foreach_list_typed(nir_cf_node, node, node, list) {
3146       switch (node->type) {
3147       case nir_cf_node_block:
3148          ntt_emit_block_ureg(c, nir_cf_node_as_block(node));
3149          break;
3150 
3151       case nir_cf_node_if:
3152          ntt_emit_if_ureg(c, nir_cf_node_as_if(node));
3153          break;
3154 
3155       case nir_cf_node_loop:
3156          /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
3157           * does reference BGNLOOP's.  Follow the former behavior unless something comes up
3158           * with a need.
3159           */
3160          ntt_emit_cf_list_ureg(c, &nir_cf_node_as_loop(node)->body);
3161          break;
3162 
3163       default:
3164          unreachable("unknown CF type");
3165       }
3166    }
3167 }
3168 
3169 static void
3170 ntt_emit_impl(struct ntt_compile *c, nir_function_impl *impl)
3171 {
3172    c->impl = impl;
3173 
3174    c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc);
3175    c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->ssa_alloc);
3176 
3177    /* Set up the struct ntt_blocks to put insns in */
3178    c->blocks = _mesa_pointer_hash_table_create(c);
3179    nir_foreach_block(block, impl) {
3180       struct ntt_block *ntt_block = rzalloc(c->blocks, struct ntt_block);
3181       util_dynarray_init(&ntt_block->insns, ntt_block);
3182       _mesa_hash_table_insert(c->blocks, block, ntt_block);
3183    }
3184 
3185 
3186    ntt_setup_registers(c);
3187 
3188    c->cur_block = ntt_block_from_nir(c, nir_start_block(impl));
3189    ntt_setup_inputs(c);
3190    ntt_setup_outputs(c);
3191    ntt_setup_uniforms(c);
3192 
3193    /* Emit the ntt insns */
3194    ntt_emit_cf_list(c, &impl->body);
3195 
3196    /* Skip the optimized RA if the driver requests unoptimized RA, unless the
3197     * number of temps is too large to be covered by the 16-bit signed int that
3198     * TGSI allocates for the register index. */
3199    if (!c->options->unoptimized_ra || c->num_temps > 0x7fff)
3200       ntt_allocate_regs(c, impl);
3201    else
3202       ntt_allocate_regs_unoptimized(c, impl);
3203 
3204    /* Turn the ntt insns into actual TGSI tokens */
3205    ntt_emit_cf_list_ureg(c, &impl->body);
3206 
3207    ralloc_free(c->liveness);
3208    c->liveness = NULL;
3209 
3210 }
3211 
3212 static int
3213 type_size(const struct glsl_type *type, bool bindless)
3214 {
3215    return glsl_count_attribute_slots(type, false);
3216 }
3217 
3218 /* Allow vectorizing of ALU instructions, but avoid vectorizing past what we
3219  * can handle for 64-bit values in TGSI.
3220  */
3221 static uint8_t
3222 ntt_should_vectorize_instr(const nir_instr *instr, const void *data)
3223 {
3224    if (instr->type != nir_instr_type_alu)
3225       return 0;
3226 
3227    nir_alu_instr *alu = nir_instr_as_alu(instr);
3228 
3229    switch (alu->op) {
3230    case nir_op_ibitfield_extract:
3231    case nir_op_ubitfield_extract:
3232    case nir_op_bitfield_insert:
3233       /* virglrenderer only looks at the .x channel of the offset/bits operands
3234        * when translating to GLSL.  tgsi.rst doesn't seem to require scalar
3235        * offset/bits operands.
3236        *
3237        * https://gitlab.freedesktop.org/virgl/virglrenderer/-/issues/195
3238        */
3239       return 1;
3240 
3241    default:
3242       break;
3243    }
3244 
3245    int src_bit_size = nir_src_bit_size(alu->src[0].src);
3246    int dst_bit_size = alu->def.bit_size;
3247 
3248    if (src_bit_size == 64 || dst_bit_size == 64) {
3249       /* Avoid vectorizing 64-bit instructions at all.  Despite tgsi.rst
3250        * claiming support, virglrenderer generates bad shaders on the host when
3251        * presented with them.  Maybe we can make virgl avoid tickling the
3252        * virglrenderer bugs, but given that glsl-to-TGSI didn't generate vector
3253        * 64-bit instrs in the first place, I don't see much reason to care about
3254        * this.
3255        */
3256       return 1;
3257    }
3258 
3259    return 4;
3260 }
3261 
3262 static bool
3263 ntt_should_vectorize_io(unsigned align, unsigned bit_size,
3264                         unsigned num_components, unsigned high_offset,
3265                         nir_intrinsic_instr *low, nir_intrinsic_instr *high,
3266                         void *data)
3267 {
3268    if (bit_size != 32)
3269       return false;
3270 
3271    /* Our offset alignment should always be at least 4 bytes */
3272    if (align < 4)
3273       return false;
3274 
3275    /* No wrapping off the end of a TGSI reg.  We could do a bit better by
3276     * looking at low's actual offset.  XXX: With LOAD_CONSTBUF maybe we don't
3277     * need this restriction.
3278     */
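   /* Worked example: with align == 4 a merged access could start at any
    * component, so the worst case begins at .w and only one component fits;
    * with align == 8 the worst case begins at .z, leaving room for two
    * components within the vec4 slot.
    */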
3279    unsigned worst_start_component = align == 4 ? 3 : align / 4;
3280    if (worst_start_component + num_components > 4)
3281       return false;
3282 
3283    return true;
3284 }
3285 
3286 static nir_variable_mode
3287 ntt_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
3288 {
3289    unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
3290    unsigned indirect_mask = 0;
3291 
3292    if (!screen->get_shader_param(screen, pipe_stage,
3293                                  PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) {
3294       indirect_mask |= nir_var_shader_in;
3295    }
3296 
3297    if (!screen->get_shader_param(screen, pipe_stage,
3298                                  PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) {
3299       indirect_mask |= nir_var_shader_out;
3300    }
3301 
3302    if (!screen->get_shader_param(screen, pipe_stage,
3303                                  PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
3304       indirect_mask |= nir_var_function_temp;
3305    }
3306 
3307    return indirect_mask;
3308 }
3309 
3310 static void
3311 ntt_optimize_nir(struct nir_shader *s, struct pipe_screen *screen,
3312                  const struct nir_to_tgsi_options *options)
3313 {
3314    bool progress;
3315    unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
3316    unsigned control_flow_depth =
3317       screen->get_shader_param(screen, pipe_stage,
3318                                PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
3319    do {
3320       progress = false;
3321 
3322       NIR_PASS_V(s, nir_lower_vars_to_ssa);
3323       NIR_PASS_V(s, nir_split_64bit_vec3_and_vec4);
3324 
3325       NIR_PASS(progress, s, nir_copy_prop);
3326       NIR_PASS(progress, s, nir_opt_algebraic);
3327       NIR_PASS(progress, s, nir_opt_constant_folding);
3328       NIR_PASS(progress, s, nir_opt_remove_phis);
3329       NIR_PASS(progress, s, nir_opt_conditional_discard);
3330       NIR_PASS(progress, s, nir_opt_dce);
3331       NIR_PASS(progress, s, nir_opt_dead_cf);
3332       NIR_PASS(progress, s, nir_opt_cse);
3333       NIR_PASS(progress, s, nir_opt_find_array_copies);
3334       NIR_PASS(progress, s, nir_opt_copy_prop_vars);
3335       NIR_PASS(progress, s, nir_opt_dead_write_vars);
3336 
3337       NIR_PASS(progress, s, nir_opt_if, nir_opt_if_optimize_phi_true_false);
3338       NIR_PASS(progress, s, nir_opt_peephole_select,
3339                control_flow_depth == 0 ? ~0 : 8, true, true);
3340       NIR_PASS(progress, s, nir_opt_algebraic);
3341       NIR_PASS(progress, s, nir_opt_constant_folding);
3342       nir_load_store_vectorize_options vectorize_opts = {
3343          .modes = nir_var_mem_ubo,
3344          .callback = ntt_should_vectorize_io,
3345          .robust_modes = 0,
3346       };
3347       NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
3348       NIR_PASS(progress, s, nir_opt_shrink_stores, true);
3349       NIR_PASS(progress, s, nir_opt_shrink_vectors);
3350       NIR_PASS(progress, s, nir_opt_loop);
3351       NIR_PASS(progress, s, nir_opt_vectorize, ntt_should_vectorize_instr, NULL);
3352       NIR_PASS(progress, s, nir_opt_undef);
3353       NIR_PASS(progress, s, nir_opt_loop_unroll);
3354 
3355       /* Try to fold addressing math into ubo_vec4's base to avoid load_consts
3356        * and ALU ops for it.
3357        */
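      /* Rough sketch of the intent (hypothetical IR): an access like
       *   load_ubo_vec4(buf, iadd(idx, 2))
       * can become load_ubo_vec4(buf, idx) with base += 2, dropping the
       * load_const and the iadd entirely.
       */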
3358       nir_opt_offsets_options offset_options = {
3359          .ubo_vec4_max = ~0,
3360 
3361          /* No const offset in TGSI for shared accesses. */
3362          .shared_max = 0,
3363 
3364          /* unused intrinsics */
3365          .uniform_max = 0,
3366          .buffer_max = 0,
3367       };
3368 
3369       if (options->ubo_vec4_max)
3370          offset_options.ubo_vec4_max = options->ubo_vec4_max;
3371 
3372       NIR_PASS(progress, s, nir_opt_offsets, &offset_options);
3373    } while (progress);
3374 
3375    NIR_PASS_V(s, nir_lower_var_copies);
3376 }
3377 
3378 /* Scalarizes all 64-bit ALU ops.  Note that we only actually need to
3379  * scalarize vec3/vec4s; we should probably fix that.
3380  */
3381 static bool
3382 scalarize_64bit(const nir_instr *instr, const void *data)
3383 {
3384    const nir_alu_instr *alu = nir_instr_as_alu(instr);
3385 
3386    return (alu->def.bit_size == 64 ||
3387            nir_src_bit_size(alu->src[0].src) == 64);
3388 }
3389 
3390 static bool
3391 nir_to_tgsi_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
3392 {
3393    b->cursor = nir_after_instr(&instr->instr);
3394 
3395    switch (instr->intrinsic) {
3396    case nir_intrinsic_load_ubo:
3397    case nir_intrinsic_load_ubo_vec4:
3398    case nir_intrinsic_load_ssbo:
3399    case nir_intrinsic_load_input:
3400    case nir_intrinsic_load_interpolated_input:
3401    case nir_intrinsic_load_per_vertex_input:
3402    case nir_intrinsic_store_output:
3403    case nir_intrinsic_store_per_vertex_output:
3404    case nir_intrinsic_store_ssbo:
3405       break;
3406    default:
3407       return false;
3408    }
3409 
3410    if (instr->num_components <= 2)
3411       return false;
3412 
3413    bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
3414    if (has_dest) {
3415       if (instr->def.bit_size != 64)
3416          return false;
3417    } else  {
3418       if (nir_src_bit_size(instr->src[0]) != 64)
3419           return false;
3420    }
3421 
3422    nir_intrinsic_instr *first =
3423       nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
3424    nir_intrinsic_instr *second =
3425       nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
3426 
3427    switch (instr->intrinsic) {
3428    case nir_intrinsic_load_ubo:
3429    case nir_intrinsic_load_ubo_vec4:
3430    case nir_intrinsic_load_ssbo:
3431    case nir_intrinsic_store_ssbo:
3432       break;
3433 
3434    default: {
3435       nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
3436       semantics.location++;
3437       semantics.num_slots--;
3438       nir_intrinsic_set_io_semantics(second, semantics);
3439 
3440       nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
3441       break;
3442    }
3443    }
3444 
3445    first->num_components = 2;
3446    second->num_components -= 2;
3447    if (has_dest) {
3448       first->def.num_components = 2;
3449       second->def.num_components -= 2;
3450    }
3451 
3452    nir_builder_instr_insert(b, &first->instr);
3453    nir_builder_instr_insert(b, &second->instr);
3454 
3455    if (has_dest) {
3456       /* Merge the two loads' results back into a vector. */
3457       nir_scalar channels[4] = {
3458          nir_get_scalar(&first->def, 0),
3459          nir_get_scalar(&first->def, 1),
3460          nir_get_scalar(&second->def, 0),
3461          nir_get_scalar(&second->def, second->num_components > 1 ? 1 : 0),
3462       };
3463       nir_def *new = nir_vec_scalars(b, channels, instr->num_components);
3464       nir_def_rewrite_uses(&instr->def, new);
3465    } else {
3466       /* Split the src value across the two stores. */
3467       b->cursor = nir_before_instr(&instr->instr);
3468 
3469       nir_def *src0 = instr->src[0].ssa;
3470       nir_scalar channels[4] = { 0 };
3471       for (int i = 0; i < instr->num_components; i++)
3472          channels[i] = nir_get_scalar(src0, i);
3473 
3474       nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
3475       nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);
3476 
3477       nir_src_rewrite(&first->src[0], nir_vec_scalars(b, channels, 2));
3478       nir_src_rewrite(&second->src[0],
3479                       nir_vec_scalars(b, &channels[2], second->num_components));
3480    }
3481 
3482    int offset_src = -1;
3483    uint32_t offset_amount = 16;
3484 
3485    switch (instr->intrinsic) {
3486    case nir_intrinsic_load_ssbo:
3487    case nir_intrinsic_load_ubo:
3488       offset_src = 1;
3489       break;
3490    case nir_intrinsic_load_ubo_vec4:
3491       offset_src = 1;
3492       offset_amount = 1;
3493       break;
3494    case nir_intrinsic_store_ssbo:
3495       offset_src = 2;
3496       break;
3497    default:
3498       break;
3499    }
3500    if (offset_src != -1) {
3501       b->cursor = nir_before_instr(&second->instr);
3502       nir_def *second_offset =
3503          nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
3504       nir_src_rewrite(&second->src[offset_src], second_offset);
3505    }
3506 
3507    /* DCE stores we generated with no writemask (nothing else does this
3508     * currently).
3509     */
3510    if (!has_dest) {
3511       if (nir_intrinsic_write_mask(first) == 0)
3512          nir_instr_remove(&first->instr);
3513       if (nir_intrinsic_write_mask(second) == 0)
3514          nir_instr_remove(&second->instr);
3515    }
3516 
3517    nir_instr_remove(&instr->instr);
3518 
3519    return true;
3520 }
3521 
3522 static bool
3523 nir_to_tgsi_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
3524 {
3525    int num_components = instr->def.num_components;
3526 
3527    if (instr->def.bit_size != 64 || num_components <= 2)
3528       return false;
3529 
3530    b->cursor = nir_before_instr(&instr->instr);
3531 
3532    nir_load_const_instr *first =
3533       nir_load_const_instr_create(b->shader, 2, 64);
3534    nir_load_const_instr *second =
3535       nir_load_const_instr_create(b->shader, num_components - 2, 64);
3536 
3537    first->value[0] = instr->value[0];
3538    first->value[1] = instr->value[1];
3539    second->value[0] = instr->value[2];
3540    if (num_components == 4)
3541       second->value[1] = instr->value[3];
3542 
3543    nir_builder_instr_insert(b, &first->instr);
3544    nir_builder_instr_insert(b, &second->instr);
3545 
3546    nir_def *channels[4] = {
3547       nir_channel(b, &first->def, 0),
3548       nir_channel(b, &first->def, 1),
3549       nir_channel(b, &second->def, 0),
3550       num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
3551    };
3552    nir_def *new = nir_vec(b, channels, num_components);
3553    nir_def_rewrite_uses(&instr->def, new);
3554    nir_instr_remove(&instr->instr);
3555 
3556    return true;
3557 }
3558 
3559 static bool
3560 nir_to_tgsi_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr,
3561                                       void *data)
3562 {
3563    switch (instr->type) {
3564    case nir_instr_type_load_const:
3565       return nir_to_tgsi_lower_64bit_load_const(b, nir_instr_as_load_const(instr));
3566 
3567    case nir_instr_type_intrinsic:
3568       return nir_to_tgsi_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
3569    default:
3570       return false;
3571    }
3572 }
3573 
3574 static bool
3575 nir_to_tgsi_lower_64bit_to_vec2(nir_shader *s)
3576 {
3577    return nir_shader_instructions_pass(s,
3578                                        nir_to_tgsi_lower_64bit_to_vec2_instr,
3579                                        nir_metadata_block_index |
3580                                        nir_metadata_dominance,
3581                                        NULL);
3582 }
3583 
3584 struct ntt_lower_tex_state {
3585    nir_scalar channels[8];
3586    unsigned i;
3587 };
3588 
3589 static void
3590 nir_to_tgsi_lower_tex_instr_arg(nir_builder *b,
3591                                 nir_tex_instr *instr,
3592                                 nir_tex_src_type tex_src_type,
3593                                 struct ntt_lower_tex_state *s)
3594 {
3595    int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
3596    if (tex_src < 0)
3597       return;
3598 
3599    nir_def *def = instr->src[tex_src].src.ssa;
3600    for (int i = 0; i < def->num_components; i++) {
3601       s->channels[s->i++] = nir_get_scalar(def, i);
3602    }
3603 
3604    nir_tex_instr_remove_src(instr, tex_src);
3605 }
3606 
3607 /**
3608  * Merges together a vec4 of tex coordinate/compare/bias/lod into a backend tex
3609  * src.  This lets NIR handle the coalescing of the vec4 rather than trying to
3610  * manage it on our own, and may lead to more vectorization.
3611  */
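/* Rough sketch of the packing (hypothetical shadow-2D lookup with bias):
 * coord.xy lands in channels 0-1, the comparator in channel 2 and the bias in
 * channel 3, so backend1 becomes vec4(x, y, compare, bias); anything past
 * four channels spills into backend2.
 */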
3612 static bool
3613 nir_to_tgsi_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data)
3614 {
3615    if (instr->type != nir_instr_type_tex)
3616       return false;
3617 
3618    nir_tex_instr *tex = nir_instr_as_tex(instr);
3619 
3620    if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0)
3621       return false;
3622 
3623    b->cursor = nir_before_instr(instr);
3624 
3625    struct ntt_lower_tex_state s = {0};
3626 
3627    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_coord, &s);
3628    /* We always have at least two slots for the coordinate, even on 1D. */
3629    s.i = MAX2(s.i, 2);
3630 
3631    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_comparator, &s);
3632    s.i = MAX2(s.i, 3);
3633 
3634    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_bias, &s);
3635 
3636    /* XXX: LZ */
3637    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s);
3638    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s);
3639    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s);
3640 
3641    /* No need to pack undefs in unused channels of the tex instr */
3642    while (!s.channels[s.i - 1].def)
3643       s.i--;
3644 
3645    /* Instead of putting undefs in the unused slots of the vecs, just put in
3646     * another used channel.  Otherwise, we'll get unnecessary moves into
3647     * registers.
3648     */
3649    assert(s.channels[0].def != NULL);
3650    for (int i = 1; i < s.i; i++) {
3651       if (!s.channels[i].def)
3652          s.channels[i] = s.channels[0];
3653    }
3654 
3655    nir_tex_instr_add_src(tex, nir_tex_src_backend1,
3656                          nir_vec_scalars(b, s.channels, MIN2(s.i, 4)));
3657    if (s.i > 4)
3658       nir_tex_instr_add_src(tex, nir_tex_src_backend2,
3659                             nir_vec_scalars(b, &s.channels[4], s.i - 4));
3660 
3661    return true;
3662 }
3663 
3664 static bool
3665 nir_to_tgsi_lower_tex(nir_shader *s)
3666 {
3667    return nir_shader_instructions_pass(s,
3668                                        nir_to_tgsi_lower_tex_instr,
3669                                        nir_metadata_block_index |
3670                                        nir_metadata_dominance,
3671                                        NULL);
3672 }
3673 
3674 static void
3675 ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s,
3676                     const struct nir_to_tgsi_options *ntt_options)
3677 {
3678    const struct nir_shader_compiler_options *options = s->options;
3679    bool lower_fsqrt =
3680       !screen->get_shader_param(screen, pipe_shader_type_from_mesa(s->info.stage),
3681                                 PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED);
3682 
3683    bool force_indirect_unrolling_sampler =
3684       screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL) < 400;
3685 
3686    nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
3687 
3688    if (!options->lower_extract_byte ||
3689        !options->lower_extract_word ||
3690        !options->lower_insert_byte ||
3691        !options->lower_insert_word ||
3692        !options->lower_fdph ||
3693        !options->lower_flrp64 ||
3694        !options->lower_fmod ||
3695        !options->lower_uadd_carry ||
3696        !options->lower_usub_borrow ||
3697        !options->lower_uadd_sat ||
3698        !options->lower_usub_sat ||
3699        !options->lower_uniforms_to_ubo ||
3700        !options->lower_vector_cmp ||
3701        options->has_rotate8 ||
3702        options->has_rotate16 ||
3703        options->has_rotate32 ||
3704        options->lower_fsqrt != lower_fsqrt ||
3705        options->force_indirect_unrolling != no_indirects_mask ||
3706        force_indirect_unrolling_sampler) {
3707       nir_shader_compiler_options *new_options = ralloc(s, nir_shader_compiler_options);
3708       *new_options = *s->options;
3709 
3710       new_options->lower_extract_byte = true;
3711       new_options->lower_extract_word = true;
3712       new_options->lower_insert_byte = true;
3713       new_options->lower_insert_word = true;
3714       new_options->lower_fdph = true;
3715       new_options->lower_flrp64 = true;
3716       new_options->lower_fmod = true;
3717       new_options->lower_uadd_carry = true;
3718       new_options->lower_usub_borrow = true;
3719       new_options->lower_uadd_sat = true;
3720       new_options->lower_usub_sat = true;
3721       new_options->lower_uniforms_to_ubo = true;
3722       new_options->lower_vector_cmp = true;
3723       new_options->lower_fsqrt = lower_fsqrt;
3724       new_options->has_rotate8 = false;
3725       new_options->has_rotate16 = false;
3726       new_options->has_rotate32 = false;
3727       new_options->force_indirect_unrolling = no_indirects_mask;
3728       new_options->force_indirect_unrolling_sampler = force_indirect_unrolling_sampler;
3729 
3730       s->options = new_options;
3731    }
3732 }
3733 
3734 static bool
3735 ntt_lower_atomic_pre_dec_filter(const nir_instr *instr, const void *_data)
3736 {
3737    return (instr->type == nir_instr_type_intrinsic &&
3738            nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_atomic_counter_pre_dec);
3739 }
3740 
3741 static nir_def *
3742 ntt_lower_atomic_pre_dec_lower(nir_builder *b, nir_instr *instr, void *_data)
3743 {
3744    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3745 
3746    nir_def *old_result = &intr->def;
3747    intr->intrinsic = nir_intrinsic_atomic_counter_post_dec;
3748 
3749    return nir_iadd_imm(b, old_result, -1);
3750 }
3751 
3752 static bool
3753 ntt_lower_atomic_pre_dec(nir_shader *s)
3754 {
3755    return nir_shader_lower_instructions(s,
3756                                         ntt_lower_atomic_pre_dec_filter,
3757                                         ntt_lower_atomic_pre_dec_lower, NULL);
3758 }
3759 
3760 /* Lowers texture projectors if we can't do them as TGSI_OPCODE_TXP. */
3761 static void
3762 nir_to_tgsi_lower_txp(nir_shader *s)
3763 {
3764    nir_lower_tex_options lower_tex_options = {
3765        .lower_txp = 0,
3766    };
3767 
3768    nir_foreach_block(block, nir_shader_get_entrypoint(s)) {
3769       nir_foreach_instr(instr, block) {
3770          if (instr->type != nir_instr_type_tex)
3771             continue;
3772          nir_tex_instr *tex = nir_instr_as_tex(instr);
3773 
3774          if (nir_tex_instr_src_index(tex, nir_tex_src_projector) < 0)
3775             continue;
3776 
3777          bool has_compare = nir_tex_instr_src_index(tex, nir_tex_src_comparator) >= 0;
3778          bool has_lod = nir_tex_instr_src_index(tex, nir_tex_src_lod) >= 0 || s->info.stage != MESA_SHADER_FRAGMENT;
3779          bool has_offset = nir_tex_instr_src_index(tex, nir_tex_src_offset) >= 0;
3780 
3781          /* We can do TXP for any tex (not txg) where we can fit all the
3782           * coordinates and comparator and projector in one vec4 without any
3783           * other modifiers to add on.
3784           *
3785           * nir_lower_tex() only handles the lowering on a sampler-dim basis, so
3786           * if we get any funny projectors then we just blow them all away.
3787           */
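         /* For instance, textureProj() on a plain sampler2D packs x, y and
          * the projector into one vec4 and can stay as TXP, while the same
          * lookup with an explicit LOD or offset, or a shadow lookup with
          * three or more coordinate components, gets its projector lowered
          * away here instead.
          */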
3788          if (tex->op != nir_texop_tex || has_lod || has_offset || (tex->coord_components >= 3 && has_compare))
3789             lower_tex_options.lower_txp |= 1 << tex->sampler_dim;
3790       }
3791    }
3792 
3793    /* nir_lower_tex must be run even if no options are set, because we need the
3794     * LOD to be set for query_levels and for non-fragment shaders.
3795     */
3796    NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
3797 }
3798 
3799 static bool
3800 nir_lower_primid_sysval_to_input_filter(const nir_instr *instr, const void *_data)
3801 {
3802    return (instr->type == nir_instr_type_intrinsic &&
3803            nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_primitive_id);
3804 }
3805 
3806 static nir_def *
3807 nir_lower_primid_sysval_to_input_lower(nir_builder *b, nir_instr *instr, void *data)
3808 {
3809    nir_variable *var = nir_get_variable_with_location(b->shader, nir_var_shader_in,
3810                                                       VARYING_SLOT_PRIMITIVE_ID, glsl_uint_type());
3811 
3812    nir_io_semantics semantics = {
3813       .location = var->data.location,
3814        .num_slots = 1
3815    };
3816    return nir_load_input(b, 1, 32, nir_imm_int(b, 0),
3817                          .base = var->data.driver_location,
3818                          .io_semantics = semantics);
3819 }
3820 
3821 static bool
3822 nir_lower_primid_sysval_to_input(nir_shader *s)
3823 {
3824    return nir_shader_lower_instructions(s,
3825                                         nir_lower_primid_sysval_to_input_filter,
3826                                         nir_lower_primid_sysval_to_input_lower, NULL);
3827 }
3828 
3829 const void *
3830 nir_to_tgsi(struct nir_shader *s,
3831             struct pipe_screen *screen)
3832 {
3833    static const struct nir_to_tgsi_options default_ntt_options = {0};
3834    return nir_to_tgsi_options(s, screen, &default_ntt_options);
3835 }
3836 
3837 /* Prevent nir_lower_vec_to_regs from coalescing 64-to-32 conversions and
3838  * comparisons into unsupported channels of registers.
3839  */
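/* For example (hypothetical IR), when a vec4 gathers results of 64-to-32-bit
 * ops, a conversion like f2f32 may only be coalesced into .x or .xy of the
 * gathering register, and a 64-bit comparison only into .x; for anything else
 * the vec lowering falls back to a separate MOV.
 */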
3840 static bool
3841 ntt_vec_to_mov_writemask_cb(const nir_instr *instr, unsigned writemask, UNUSED const void *_data)
3842 {
3843    if (instr->type != nir_instr_type_alu)
3844       return false;
3845 
3846    nir_alu_instr *alu = nir_instr_as_alu(instr);
3847    int dst_32 = alu->def.bit_size == 32;
3848    int src_64 = nir_src_bit_size(alu->src[0].src) == 64;
3849 
3850    if (src_64 && dst_32) {
3851       int num_srcs = nir_op_infos[alu->op].num_inputs;
3852 
3853       if (num_srcs == 2 || nir_op_infos[alu->op].output_type == nir_type_bool32) {
3854          /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz
3855           * instead of .xy.  Just support scalar compares storing to .x,
3856           * GLSL-to-TGSI only ever emitted scalar ops anyway.
3857           */
3858         if (writemask != TGSI_WRITEMASK_X)
3859            return false;
3860       } else {
3861          /* TGSI's 64-to-32-bit conversions can only store to .xy (since a TGSI
3862           * register can only store a dvec2).  Don't try to coalesce to write to
3863           * .zw.
3864           */
3865          if (writemask & ~(TGSI_WRITEMASK_XY))
3866             return false;
3867       }
3868    }
3869 
3870    return true;
3871 }
3872 
3873 /**
3874  * Translates the NIR shader to TGSI.
3875  *
3876  * This requires some lowering of the NIR shader to prepare it for translation.
3877  * We take ownership of the NIR shader passed, returning a reference to the new
3878  * TGSI tokens instead.  If you need to keep the NIR, then pass us a clone.
3879  */
3880 const void *nir_to_tgsi_options(struct nir_shader *s,
3881                                 struct pipe_screen *screen,
3882                                 const struct nir_to_tgsi_options *options)
3883 {
3884    struct ntt_compile *c;
3885    const void *tgsi_tokens;
3886    nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
3887    bool native_integers = screen->get_shader_param(screen,
3888                                                    pipe_shader_type_from_mesa(s->info.stage),
3889                                                    PIPE_SHADER_CAP_INTEGERS);
3890    const struct nir_shader_compiler_options *original_options = s->options;
3891 
3892    ntt_fix_nir_options(screen, s, options);
3893 
3894    /* Lower array indexing on FS inputs.  Since we don't set
3895     * ureg->supports_any_inout_decl_range, the TGSI input decls will be split to
3896     * elements by ureg, and so dynamically indexing them would be invalid.
3897     * Ideally we would set that ureg flag based on
3898     * PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE, but can't due to mesa/st
3899     * splitting NIR VS outputs to elements even if the FS doesn't get the
3900     * corresponding splitting, and virgl depends on TGSI across link boundaries
3901     * having matching declarations.
3902     */
3903    if (s->info.stage == MESA_SHADER_FRAGMENT) {
3904       NIR_PASS_V(s, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX);
3905       NIR_PASS_V(s, nir_remove_dead_variables, nir_var_shader_in, NULL);
3906    }
3907 
3908    /* Lower tesslevel indirect derefs for tessellation shader.
3909     * tesslevels are now a compact array variable and nir expects a constant
3910     * array index into the compact array variable.
3911     */
3912    if (s->info.stage == MESA_SHADER_TESS_CTRL ||
3913        s->info.stage == MESA_SHADER_TESS_EVAL) {
3914       NIR_PASS_V(s, nir_lower_indirect_derefs, 0 , UINT32_MAX);
3915    }
3916 
3917    NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
3918               type_size, (nir_lower_io_options)0);
3919 
3920    nir_to_tgsi_lower_txp(s);
3921    NIR_PASS_V(s, nir_to_tgsi_lower_tex);
3922 
3923    /* While TGSI can represent PRIMID as either an input or a system value,
3924     * glsl-to-tgsi had the GS (not TCS or TES) primid as an input, and drivers
3925     * depend on that.
3926     */
3927    if (s->info.stage == MESA_SHADER_GEOMETRY)
3928       NIR_PASS_V(s, nir_lower_primid_sysval_to_input);
3929 
3930    if (s->info.num_abos)
3931       NIR_PASS_V(s, ntt_lower_atomic_pre_dec);
3932 
3933    if (!original_options->lower_uniforms_to_ubo) {
3934       NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
3935                  screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
3936                  !native_integers);
3937    }
3938 
3939    /* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --
3940     * TGSI stores up to a vec2 in each slot, so to avoid a whole bunch of op
3941     * duplication logic we just make it so that we only see vec2s.
3942     */
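   /* Rough illustration (hypothetical IR): a dvec4 ALU op is scalarized by
    * nir_lower_alu_to_scalar below, and a four-component 64-bit load_const or
    * UBO access is split into a vec2 + vec2 pair by
    * nir_to_tgsi_lower_64bit_to_vec2, so nothing wider than a dvec2 (one TGSI
    * register's worth) ever reaches the translator.
    */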
3943    NIR_PASS_V(s, nir_lower_alu_to_scalar, scalarize_64bit, NULL);
3944    NIR_PASS_V(s, nir_to_tgsi_lower_64bit_to_vec2);
3945 
3946    if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
3947       NIR_PASS_V(s, nir_lower_ubo_vec4);
3948 
3949    ntt_optimize_nir(s, screen, options);
3950 
3951    NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);
3952 
3953    /* Lower demote_if to if (cond) { demote } because TGSI doesn't have a DEMOTE_IF. */
3954    NIR_PASS_V(s, nir_lower_discard_if, nir_lower_demote_if_to_cf);
3955 
3956    NIR_PASS_V(s, nir_lower_frexp);
3957 
3958    bool progress;
3959    do {
3960       progress = false;
3961       NIR_PASS(progress, s, nir_opt_algebraic_late);
3962       if (progress) {
3963          NIR_PASS_V(s, nir_copy_prop);
3964          NIR_PASS_V(s, nir_opt_dce);
3965          NIR_PASS_V(s, nir_opt_cse);
3966       }
3967    } while (progress);
3968 
3969    NIR_PASS_V(s, nir_opt_combine_barriers, NULL, NULL);
3970 
3971    if (screen->get_shader_param(screen,
3972                                 pipe_shader_type_from_mesa(s->info.stage),
3973                                 PIPE_SHADER_CAP_INTEGERS)) {
3974       NIR_PASS_V(s, nir_lower_bool_to_int32);
3975    } else {
3976       NIR_PASS_V(s, nir_lower_int_to_float);
3977       NIR_PASS_V(s, nir_lower_bool_to_float,
3978                  !options->lower_cmp && !options->lower_fabs);
3979       /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
3980       NIR_PASS_V(s, nir_copy_prop);
3981       NIR_PASS_V(s, nir_opt_dce);
3982    }
3983 
3984    nir_move_options move_all =
3985        nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
3986        nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
3987 
3988    NIR_PASS_V(s, nir_opt_move, move_all);
3989 
3990    NIR_PASS_V(s, nir_convert_from_ssa, true);
3991    NIR_PASS_V(s, nir_lower_vec_to_regs, ntt_vec_to_mov_writemask_cb, NULL);
3992 
3993    /* locals_to_reg_intrinsics will leave dead derefs that are good to clean up.
3994     */
3995    NIR_PASS_V(s, nir_lower_locals_to_regs, 32);
3996    NIR_PASS_V(s, nir_opt_dce);
3997 
3998    /* See comment in ntt_get_alu_src for supported modifiers */
3999    NIR_PASS_V(s, nir_legacy_trivialize, !options->lower_fabs);
4000 
4001    if (NIR_DEBUG(TGSI)) {
4002       fprintf(stderr, "NIR before translation to TGSI:\n");
4003       nir_print_shader(s, stderr);
4004    }
4005 
4006    c = rzalloc(NULL, struct ntt_compile);
4007    c->screen = screen;
4008    c->options = options;
4009 
4010    c->needs_texcoord_semantic =
4011       screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD);
4012    c->has_txf_lz =
4013       screen->get_param(screen, PIPE_CAP_TGSI_TEX_TXF_LZ);
4014 
4015    c->s = s;
4016    c->native_integers = native_integers;
4017    c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
4018    ureg_setup_shader_info(c->ureg, &s->info);
4019    if (s->info.use_legacy_math_rules && screen->get_param(screen, PIPE_CAP_LEGACY_MATH_RULES))
4020       ureg_property(c->ureg, TGSI_PROPERTY_LEGACY_MATH_RULES, 1);
4021 
4022    if (s->info.stage == MESA_SHADER_FRAGMENT) {
4023       /* The draw module's polygon stipple layer doesn't respect the chosen
4024        * coordinate mode, so leave it as unspecified unless we're actually
4025        * reading the position in the shader already.  See
4026        * gl-2.1-polygon-stipple-fs on softpipe.
4027        */
4028       if ((s->info.inputs_read & VARYING_BIT_POS) ||
4029           BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
4030          ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
4031                        s->info.fs.origin_upper_left ?
4032                        TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
4033                        TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
4034 
4035          ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
4036                        s->info.fs.pixel_center_integer ?
4037                        TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
4038                        TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
4039       }
4040    }
4041    /* Emit the main function */
4042    nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
4043    ntt_emit_impl(c, impl);
4044    ureg_END(c->ureg);
4045 
4046    tgsi_tokens = ureg_get_tokens(c->ureg, NULL);
4047 
4048    if (NIR_DEBUG(TGSI)) {
4049       fprintf(stderr, "TGSI after translation from NIR:\n");
4050       tgsi_dump(tgsi_tokens, 0);
4051    }
4052 
4053    ureg_destroy(c->ureg);
4054 
4055    ralloc_free(c);
4056    ralloc_free(s);
4057 
4058    return tgsi_tokens;
4059 }
4060 
4061 static const nir_shader_compiler_options nir_to_tgsi_compiler_options = {
4062    .fdot_replicates = true,
4063    .fuse_ffma32 = true,
4064    .fuse_ffma64 = true,
4065    .lower_extract_byte = true,
4066    .lower_extract_word = true,
4067    .lower_insert_byte = true,
4068    .lower_insert_word = true,
4069    .lower_fdph = true,
4070    .lower_flrp64 = true,
4071    .lower_fmod = true,
4072    .lower_uniforms_to_ubo = true,
4073    .lower_uadd_carry = true,
4074    .lower_usub_borrow = true,
4075    .lower_uadd_sat = true,
4076    .lower_usub_sat = true,
4077    .lower_vector_cmp = true,
4078    .lower_int64_options = nir_lower_imul_2x32_64,
4079    .use_interpolated_input_intrinsics = true,
4080 
4081    /* TGSI doesn't have a semantic for local or global index, just local and
4082     * workgroup id.
4083     */
4084    .lower_cs_local_index_to_id = true,
4085 };
4086 
4087 /* Returns a default compiler options for drivers with only nir-to-tgsi-based
4088  * NIR support.
4089  */
4090 const void *
4091 nir_to_tgsi_get_compiler_options(struct pipe_screen *pscreen,
4092                                  enum pipe_shader_ir ir,
4093                                  unsigned shader)
4094 {
4095    assert(ir == PIPE_SHADER_IR_NIR);
4096    return &nir_to_tgsi_compiler_options;
4097 }
4098 
4099 /** Helper for getting TGSI tokens to store for a pipe_shader_state CSO. */
4100 const void *
4101 pipe_shader_state_to_tgsi_tokens(struct pipe_screen *screen,
4102                                  const struct pipe_shader_state *cso)
4103 {
4104    if (cso->type == PIPE_SHADER_IR_NIR) {
4105       return nir_to_tgsi((nir_shader *)cso->ir.nir, screen);
4106    } else {
4107       assert(cso->type == PIPE_SHADER_IR_TGSI);
4108       /* we need to keep a local copy of the tokens */
4109       return tgsi_dup_tokens(cso->tokens);
4110    }
4111 }
4112