/*
 * Copyright © 2014-2015 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "compiler/nir/nir.h"
#include "compiler/nir/nir_deref.h"
#include "compiler/nir/nir_worklist.h"
#include "nir/nir_to_tgsi.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_from_mesa.h"
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_ureg.h"
#include "tgsi/tgsi_util.h"
#include "util/debug.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/u_dynarray.h"

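/* A single TGSI instruction in ntt's intermediate representation, buffered
 * per-block in ntt_block::insns before being emitted to ureg.
 */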
struct ntt_insn {
   enum tgsi_opcode opcode;
   struct ureg_dst dst[2];
   struct ureg_src src[4];
   enum tgsi_texture_type tex_target;
   enum tgsi_return_type tex_return_type;
   struct tgsi_texture_offset tex_offset;

   unsigned mem_qualifier;
   enum pipe_format mem_format;

   bool is_tex : 1;
   bool is_mem : 1;
   bool precise : 1;
};

struct ntt_block {
   /* Array of struct ntt_insn */
   struct util_dynarray insns;
   int start_ip;
   int end_ip;
};

struct ntt_reg_interval {
   uint32_t start, end;
};

struct ntt_compile {
   nir_shader *s;
   nir_function_impl *impl;
   const struct nir_to_tgsi_options *options;
   struct pipe_screen *screen;
   struct ureg_program *ureg;

   bool needs_texcoord_semantic;
   bool native_integers;
   bool has_txf_lz;

   bool addr_declared[3];
   struct ureg_dst addr_reg[3];

   /* if condition set up at the end of a block, for ntt_emit_if(). */
   struct ureg_src if_cond;

   /* TGSI temps for our NIR SSA and register values. */
   struct ureg_dst *reg_temp;
   struct ureg_src *ssa_temp;

   struct ntt_reg_interval *liveness;

   /* Map from nir_block to ntt_block */
   struct hash_table *blocks;
   struct ntt_block *cur_block;
   unsigned current_if_else;
   unsigned cf_label;

   /* Whether we're currently emitting instructions for a precise NIR instruction. */
   bool precise;

   unsigned num_temps;
   unsigned first_non_array_temp;

   /* Mappings from driver_location to TGSI input/output number.
    *
    * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
    * their numbers assigned incrementally, unlike inputs or constants.
    */
   struct ureg_src *input_index_map;
   uint64_t centroid_inputs;

   uint32_t first_ubo;

   struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
};

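/* Allocates a new temporary in ntt's virtual register space; these indices
 * are later remapped to actual ureg temporaries by ntt_allocate_regs().
 */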
static struct ureg_dst
ntt_temp(struct ntt_compile *c)
{
   return ureg_dst_register(TGSI_FILE_TEMPORARY, c->num_temps++);
}

static struct ntt_block *
ntt_block_from_nir(struct ntt_compile *c, struct nir_block *block)
{
   struct hash_entry *entry = _mesa_hash_table_search(c->blocks, block);
   return entry->data;
}

static void ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list);
static void ntt_emit_cf_list_ureg(struct ntt_compile *c, struct exec_list *list);

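/* Appends an instruction with the given operands to the current block and
 * returns a pointer to it so callers can fill in texture/memory fields.
 */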
static struct ntt_insn *
ntt_insn(struct ntt_compile *c, enum tgsi_opcode opcode,
         struct ureg_dst dst,
         struct ureg_src src0, struct ureg_src src1,
         struct ureg_src src2, struct ureg_src src3)
{
   struct ntt_insn insn = {
      .opcode = opcode,
      .dst = { dst, ureg_dst_undef() },
      .src = { src0, src1, src2, src3 },
      .precise = c->precise,
   };
   util_dynarray_append(&c->cur_block->insns, struct ntt_insn, insn);
   return util_dynarray_top_ptr(&c->cur_block->insns, struct ntt_insn);
}

#define OP00( op )                                                                     \
static inline void ntt_##op(struct ntt_compile *c)                                     \
{                                                                                      \
   ntt_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
}

#define OP01( op )                                                                     \
static inline void ntt_##op(struct ntt_compile *c,                                     \
                     struct ureg_src src0)                                             \
{                                                                                      \
   ntt_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
}


#define OP10( op )                                                                     \
static inline void ntt_##op(struct ntt_compile *c,                                     \
                     struct ureg_dst dst)                                              \
{                                                                                      \
   ntt_insn(c, TGSI_OPCODE_##op, dst, ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
}

#define OP11( op )                                                                     \
static inline void ntt_##op(struct ntt_compile *c,                                     \
                     struct ureg_dst dst,                                              \
                     struct ureg_src src0)                                             \
{                                                                                      \
   ntt_insn(c, TGSI_OPCODE_##op, dst, src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
}

#define OP12( op )                                                                     \
static inline void ntt_##op(struct ntt_compile *c,                                     \
                     struct ureg_dst dst,                                              \
                     struct ureg_src src0,                                             \
                     struct ureg_src src1)                                             \
{                                                                                      \
   ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, ureg_src_undef(), ureg_src_undef()); \
}

#define OP13( op )                                                                     \
static inline void ntt_##op(struct ntt_compile *c,                                     \
                     struct ureg_dst dst,                                              \
                     struct ureg_src src0,                                             \
                     struct ureg_src src1,                                             \
                     struct ureg_src src2)                                             \
{                                                                                      \
   ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, ureg_src_undef());             \
}

#define OP14( op )                                                                     \
static inline void ntt_##op(struct ntt_compile *c,                                     \
                     struct ureg_dst dst,                                              \
                     struct ureg_src src0,                                             \
                     struct ureg_src src1,                                             \
                     struct ureg_src src2,                                             \
                     struct ureg_src src3)                                             \
{                                                                                      \
   ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, src3);                         \
}

/* We hand-craft our tex instructions */
#define OP12_TEX(op)
#define OP14_TEX(op)

/* Use a template include to generate a correctly-typed ntt_OP()
 * function for each TGSI opcode:
 */
#include "gallium/auxiliary/tgsi/tgsi_opcode_tmp.h"
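/* As a concrete example, a two-source opcode like ADD goes through OP12 in
 * the template, which expands to roughly:
 *
 *    static inline void ntt_ADD(struct ntt_compile *c,
 *                               struct ureg_dst dst,
 *                               struct ureg_src src0,
 *                               struct ureg_src src1)
 *    {
 *       ntt_insn(c, TGSI_OPCODE_ADD, dst, src0, src1,
 *                ureg_src_undef(), ureg_src_undef());
 *    }
 */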

/**
 * Interprets a nir_load_const used as a NIR src as a uint.
 *
 * For non-native-integers drivers, nir_load_const_instrs used by an integer ALU
 * instruction (or in a phi-web used by an integer ALU instruction) were
 * converted to floats and the ALU instruction swapped to the float equivalent.
 * However, this means that integer load_consts used by intrinsics (which don't
 * normally get that conversion) may have been reformatted to be floats.  Given
 * that all of our intrinsic nir_src_as_uint() calls are expected to be small,
 * we can just look and see if they look like floats and convert them back to
 * ints.
 */
static uint32_t
ntt_src_as_uint(struct ntt_compile *c, nir_src src)
{
   uint32_t val = nir_src_as_uint(src);
   if (!c->native_integers && val >= fui(1.0))
      val = (uint32_t)uif(val);
   return val;
}

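/* Expands a write mask for an (up to two component) 64-bit value to the pairs
 * of 32-bit channels that back it: .x -> .xy and .y -> .zw.
 */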
static unsigned
ntt_64bit_write_mask(unsigned write_mask)
{
   return ((write_mask & 1) ? 0x3 : 0) | ((write_mask & 2) ? 0xc : 0);
}

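/* Returns a double-precision 1.0 immediate, encoded as the low/high 32-bit
 * halves of 0x3ff0000000000000 in each pair of channels.
 */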
static struct ureg_src
ntt_64bit_1f(struct ntt_compile *c)
{
   return ureg_imm4u(c->ureg,
                     0x00000000, 0x3ff00000,
                     0x00000000, 0x3ff00000);
}

/* Per-channel masks of def/use within the block, and the per-channel
 * livein/liveout for the block as a whole.
 */
struct ntt_live_reg_block_state {
   uint8_t *def, *use, *livein, *liveout, *defin, *defout;
};

struct ntt_live_reg_state {
   unsigned bitset_words;

   struct ntt_reg_interval *regs;

   /* Used in propagate_across_edge() */
   BITSET_WORD *tmp_live;

   struct ntt_live_reg_block_state *blocks;

   nir_block_worklist worklist;
};

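/* Records a read of the given channels of temp `index`: channels without a
 * preceding def in this block become uses, and the temp's live interval is
 * extended to cover this instruction.
 */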
static void
ntt_live_reg_mark_use(struct ntt_compile *c, struct ntt_live_reg_block_state *bs,
                      int ip, unsigned index, unsigned used_mask)
{
   bs->use[index] |= used_mask & ~bs->def[index];

   c->liveness[index].start = MIN2(c->liveness[index].start, ip);
   c->liveness[index].end = MAX2(c->liveness[index].end, ip);
}

static void
ntt_live_reg_setup_def_use(struct ntt_compile *c, nir_function_impl *impl, struct ntt_live_reg_state *state)
{
   for (int i = 0; i < impl->num_blocks; i++) {
      state->blocks[i].def = rzalloc_array(state->blocks, uint8_t, c->num_temps);
      state->blocks[i].defin = rzalloc_array(state->blocks, uint8_t, c->num_temps);
      state->blocks[i].defout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
      state->blocks[i].use = rzalloc_array(state->blocks, uint8_t, c->num_temps);
      state->blocks[i].livein = rzalloc_array(state->blocks, uint8_t, c->num_temps);
      state->blocks[i].liveout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
   }

   int ip = 0;
   nir_foreach_block(block, impl) {
      struct ntt_live_reg_block_state *bs = &state->blocks[block->index];
      struct ntt_block *ntt_block = ntt_block_from_nir(c, block);

      ntt_block->start_ip = ip;

      util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
         const struct tgsi_opcode_info *opcode_info =
            tgsi_get_opcode_info(insn->opcode);

         /* Set up use[] for the srcs.
          *
          * Uses are the channels of the reg read in the block that don't have a
          * preceding def to screen them off.  Note that we don't do per-element
          * tracking of array regs, so they're never screened off.
          */
         for (int i = 0; i < opcode_info->num_src; i++) {
            if (insn->src[i].File != TGSI_FILE_TEMPORARY)
               continue;
            int index = insn->src[i].Index;

            uint32_t used_mask = tgsi_util_get_src_usage_mask(insn->opcode, i,
                                                              insn->dst->WriteMask,
                                                              insn->src[i].SwizzleX,
                                                              insn->src[i].SwizzleY,
                                                              insn->src[i].SwizzleZ,
                                                              insn->src[i].SwizzleW,
                                                              insn->tex_target,
                                                              insn->tex_target);

            assert(!insn->src[i].Indirect || index < c->first_non_array_temp);
            ntt_live_reg_mark_use(c, bs, ip, index, used_mask);
         }

         if (insn->is_tex && insn->tex_offset.File == TGSI_FILE_TEMPORARY)
            ntt_live_reg_mark_use(c, bs, ip, insn->tex_offset.Index, 0xf);

         /* Set up def[] for the dsts.
          *
          * Defs are the unconditionally-written (not R/M/W) channels of the reg in
          * the block that don't have a preceding use.
          */
         for (int i = 0; i < opcode_info->num_dst; i++) {
            if (insn->dst[i].File != TGSI_FILE_TEMPORARY)
               continue;
            int index = insn->dst[i].Index;
            uint32_t writemask = insn->dst[i].WriteMask;

            bs->def[index] |= writemask & ~bs->use[index];
            bs->defout[index] |= writemask;

            assert(!insn->dst[i].Indirect || index < c->first_non_array_temp);
            c->liveness[index].start = MIN2(c->liveness[index].start, ip);
            c->liveness[index].end = MAX2(c->liveness[index].end, ip);
         }
         ip++;
      }

      ntt_block->end_ip = ip;
   }
}

static void
ntt_live_regs(struct ntt_compile *c, nir_function_impl *impl)
{
   nir_metadata_require(impl, nir_metadata_block_index);

   c->liveness = rzalloc_array(c, struct ntt_reg_interval, c->num_temps);

   struct ntt_live_reg_state state = {
       .blocks = rzalloc_array(impl, struct ntt_live_reg_block_state, impl->num_blocks),
   };

   /* The intervals start out with start > end (indicating unused) */
   for (int i = 0; i < c->num_temps; i++)
      c->liveness[i].start = ~0;

   ntt_live_reg_setup_def_use(c, impl, &state);

   /* Make a forward-order worklist of all the blocks. */
   nir_block_worklist_init(&state.worklist, impl->num_blocks, NULL);
   nir_foreach_block(block, impl) {
      nir_block_worklist_push_tail(&state.worklist, block);
   }

   /* Propagate defin/defout down the CFG to calculate the live variables
    * potentially defined along any possible control flow path.  We'll use this
    * to keep things like conditional defs of the reg (or array regs where we
    * don't track defs!) from making the reg's live range extend back to the
    * start of the program.
    */
   while (!nir_block_worklist_is_empty(&state.worklist)) {
      nir_block *block = nir_block_worklist_pop_head(&state.worklist);
      for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
         nir_block *succ = block->successors[j];
         if (!succ || succ->index == impl->num_blocks)
            continue;

         for (int i = 0; i < c->num_temps; i++) {
            uint8_t new_def = state.blocks[block->index].defout[i] & ~state.blocks[succ->index].defin[i];

            if (new_def) {
               state.blocks[succ->index].defin[i] |= new_def;
               state.blocks[succ->index].defout[i] |= new_def;
               nir_block_worklist_push_tail(&state.worklist, succ);
            }
         }
      }
   }

   /* Make a reverse-order worklist of all the blocks. */
   nir_foreach_block(block, impl) {
      nir_block_worklist_push_head(&state.worklist, block);
   }

   /* We're now ready to work through the worklist and update the liveness sets
    * of each of the blocks.  As long as we keep the worklist up-to-date as we
    * go, everything will get covered.
    */
   while (!nir_block_worklist_is_empty(&state.worklist)) {
      /* We pop them off in the reverse order we pushed them on.  This way
       * the first walk of the instructions is backwards so we only walk
       * once in the case of no control flow.
       */
      nir_block *block = nir_block_worklist_pop_head(&state.worklist);
      struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
      struct ntt_live_reg_block_state *bs = &state.blocks[block->index];

      for (int i = 0; i < c->num_temps; i++) {
         /* Collect livein from our successors to include in our liveout. */
         for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
            nir_block *succ = block->successors[j];
            if (!succ || succ->index == impl->num_blocks)
               continue;
            struct ntt_live_reg_block_state *sbs = &state.blocks[succ->index];

            uint8_t new_liveout = sbs->livein[i] & ~bs->liveout[i];
            if (new_liveout) {
               if (state.blocks[block->index].defout[i])
                  c->liveness[i].end = MAX2(c->liveness[i].end, ntt_block->end_ip);
               bs->liveout[i] |= sbs->livein[i];
            }
         }

         /* Propagate use requests from either our block's uses or our
          * non-screened-off liveout up to our predecessors.
          */
         uint8_t new_livein = ((bs->use[i] | (bs->liveout[i] & ~bs->def[i])) &
                               ~bs->livein[i]);
         if (new_livein) {
            bs->livein[i] |= new_livein;
            set_foreach(block->predecessors, entry) {
               nir_block *pred = (void *)entry->key;
               nir_block_worklist_push_tail(&state.worklist, pred);
            }

            if (new_livein & state.blocks[block->index].defin[i])
               c->liveness[i].start = MIN2(c->liveness[i].start, ntt_block->start_ip);
         }
      }
   }

   ralloc_free(state.blocks);
   nir_block_worklist_fini(&state.worklist);
}

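/* Lazily assigns a real ureg temporary when temp `index`'s live interval
 * starts at this ip, and releases it for reuse once the interval ends.
 */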
static void
ntt_ra_check(struct ntt_compile *c, unsigned *ra_map, BITSET_WORD *released, int ip, unsigned index)
{
   if (index < c->first_non_array_temp)
      return;

   if (c->liveness[index].start == ip && ra_map[index] == ~0)
      ra_map[index] = ureg_DECL_temporary(c->ureg).Index;

   if (c->liveness[index].end == ip && !BITSET_TEST(released, index)) {
      ureg_release_temporary(c->ureg, ureg_dst_register(TGSI_FILE_TEMPORARY, ra_map[index]));
      BITSET_SET(released, index);
   }
}

static void
ntt_allocate_regs(struct ntt_compile *c, nir_function_impl *impl)
{
   ntt_live_regs(c, impl);

   unsigned *ra_map = ralloc_array(c, unsigned, c->num_temps);
   unsigned *released = rzalloc_array(c, BITSET_WORD, BITSET_WORDS(c->num_temps));

   /* No RA on NIR array regs */
   for (int i = 0; i < c->first_non_array_temp; i++)
      ra_map[i] = i;

   for (int i = c->first_non_array_temp; i < c->num_temps; i++)
      ra_map[i] = ~0;

   int ip = 0;
   nir_foreach_block(block, impl) {
      struct ntt_block *ntt_block = ntt_block_from_nir(c, block);

      for (int i = 0; i < c->num_temps; i++)
         ntt_ra_check(c, ra_map, released, ip, i);

      util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
         const struct tgsi_opcode_info *opcode_info =
            tgsi_get_opcode_info(insn->opcode);

         for (int i = 0; i < opcode_info->num_src; i++) {
            if (insn->src[i].File == TGSI_FILE_TEMPORARY) {
               ntt_ra_check(c, ra_map, released, ip, insn->src[i].Index);
               insn->src[i].Index = ra_map[insn->src[i].Index];
            }
         }

         if (insn->is_tex && insn->tex_offset.File == TGSI_FILE_TEMPORARY) {
            ntt_ra_check(c, ra_map, released, ip, insn->tex_offset.Index);
            insn->tex_offset.Index = ra_map[insn->tex_offset.Index];
         }

         for (int i = 0; i < opcode_info->num_dst; i++) {
            if (insn->dst[i].File == TGSI_FILE_TEMPORARY) {
               ntt_ra_check(c, ra_map, released, ip, insn->dst[i].Index);
               insn->dst[i].Index = ra_map[insn->dst[i].Index];
            }
         }
         ip++;
      }

      for (int i = 0; i < c->num_temps; i++)
         ntt_ra_check(c, ra_map, released, ip, i);
   }
}

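/* Fallback used in place of ntt_allocate_regs(): declares one ureg temporary
 * per virtual temp, with no liveness-based reuse.
 */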
static void
ntt_allocate_regs_unoptimized(struct ntt_compile *c, nir_function_impl *impl)
{
   for (int i = c->first_non_array_temp; i < c->num_temps; i++)
      ureg_DECL_temporary(c->ureg);
}


/**
 * Try to find an iadd of a constant value with a non-constant value in the
 * nir_src's first component, returning the constant offset and replacing *src
 * with the non-constant component.
 */
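/* For example, if *src points at (ssa_base + 16), *src is rewritten to
 * ssa_base and 16 is returned; if no constant addend is found, 0 is returned
 * and *src is left unchanged.
 */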
static uint32_t
ntt_extract_const_src_offset(nir_src *src)
{
   if (!src->is_ssa)
      return 0;

   nir_ssa_scalar s = nir_get_ssa_scalar(src->ssa, 0);

   while (nir_ssa_scalar_is_alu(s)) {
      nir_alu_instr *alu = nir_instr_as_alu(s.def->parent_instr);

      for (int i = 0; i < nir_op_infos[alu->op].num_inputs; i++) {
         if (!alu->src[i].src.is_ssa)
            return 0;
      }

      if (alu->op == nir_op_iadd) {
         for (int i = 0; i < 2; i++) {
            nir_const_value *v = nir_src_as_const_value(alu->src[i].src);
            if (v && !alu->src[i].negate && !alu->src[i].abs) {
               *src = alu->src[1 - i].src;
               return v[alu->src[i].swizzle[s.comp]].u32;
            }
         }

         return 0;
      }

      /* We'd like to reuse nir_ssa_scalar_chase_movs(), but it assumes SSA and that
       * seems reasonable for something used in inner loops of the compiler.
       */
      if (!nir_alu_instr_is_copy(alu))
         return 0;

      if (alu->op == nir_op_mov) {
         s.def = alu->src[0].src.ssa;
         s.comp = alu->src[0].swizzle[s.comp];
      } else if (nir_op_is_vec(alu->op)) {
         s.def = alu->src[s.comp].src.ssa;
         s.comp = alu->src[s.comp].swizzle[0];
      } else {
         return 0;
      }
   }

   return 0;
}

static const struct glsl_type *
ntt_shader_input_type(struct ntt_compile *c,
                      struct nir_variable *var)
{
   switch (c->s->info.stage) {
   case MESA_SHADER_GEOMETRY:
   case MESA_SHADER_TESS_EVAL:
   case MESA_SHADER_TESS_CTRL:
      if (glsl_type_is_array(var->type))
         return glsl_get_array_element(var->type);
      else
         return var->type;
   default:
      return var->type;
   }
}

static void
ntt_get_gl_varying_semantic(struct ntt_compile *c, unsigned location,
                            unsigned *semantic_name, unsigned *semantic_index)
{
   /* We want to use most of tgsi_get_gl_varying_semantic(), but the
    * !texcoord shifting has already been applied, so avoid that.
    */
   if (!c->needs_texcoord_semantic &&
       (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_PATCH0)) {
      *semantic_name = TGSI_SEMANTIC_GENERIC;
      *semantic_index = location - VARYING_SLOT_VAR0;
      return;
   }

   tgsi_get_gl_varying_semantic(location, true,
                                semantic_name, semantic_index);
}

/* TGSI varying declarations have a component usage mask associated (used by
 * r600 and svga).
 */
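/* For example, a dvec2 with start_component 0 yields XYZW, since each 64-bit
 * channel occupies a pair of 32-bit components.
 */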
static uint32_t
ntt_tgsi_usage_mask(unsigned start_component, unsigned num_components,
                    bool is_64)
{
   uint32_t usage_mask =
      u_bit_consecutive(start_component, num_components);

   if (is_64) {
      if (start_component >= 2)
         usage_mask >>= 2;

      uint32_t tgsi_usage_mask = 0;

      if (usage_mask & TGSI_WRITEMASK_X)
         tgsi_usage_mask |= TGSI_WRITEMASK_XY;
      if (usage_mask & TGSI_WRITEMASK_Y)
         tgsi_usage_mask |= TGSI_WRITEMASK_ZW;

      return tgsi_usage_mask;
   } else {
      return usage_mask;
   }
}

/* TGSI varying declarations have a component usage mask associated (used by
 * r600 and svga).
 */
static uint32_t
ntt_tgsi_var_usage_mask(const struct nir_variable *var)
{
   const struct glsl_type *type_without_array =
      glsl_without_array(var->type);
   unsigned num_components = glsl_get_vector_elements(type_without_array);
   if (num_components == 0) /* structs */
      num_components = 4;

   return ntt_tgsi_usage_mask(var->data.location_frac, num_components,
                              glsl_type_is_64bit(type_without_array));
}

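/* Declares the TGSI output corresponding to a store_output intrinsic's i/o
 * semantics and returns it with a writemask matching the store.
 */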
static struct ureg_dst
ntt_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
{
   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
   int base = nir_intrinsic_base(instr);
   *frac = nir_intrinsic_component(instr);
   bool is_64 = nir_src_bit_size(instr->src[0]) == 64;

   struct ureg_dst out;
   if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
      unsigned semantic_name, semantic_index;
      tgsi_get_gl_frag_result_semantic(semantics.location,
                                       &semantic_name, &semantic_index);
      semantic_index += semantics.dual_source_blend_index;

      switch (semantics.location) {
      case FRAG_RESULT_DEPTH:
         *frac = 2; /* depth is written to the .z channel in TGSI */
         break;
      case FRAG_RESULT_STENCIL:
         *frac = 1;
         break;
      default:
         break;
      }

      out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
   } else {
      unsigned semantic_name, semantic_index;

      ntt_get_gl_varying_semantic(c, semantics.location,
                                  &semantic_name, &semantic_index);

      uint32_t usage_mask = ntt_tgsi_usage_mask(*frac,
                                                instr->num_components,
                                                is_64);
      uint32_t gs_streams = semantics.gs_streams;
      for (int i = 0; i < 4; i++) {
         if (!(usage_mask & (1 << i)))
            gs_streams &= ~(0x3 << 2 * i);
      }

      /* No driver appears to use array_id of outputs. */
      unsigned array_id = 0;

      /* This bit is lost in the i/o semantics, but it's unused in in-tree
       * drivers.
       */
      bool invariant = semantics.invariant;

      out = ureg_DECL_output_layout(c->ureg,
                                    semantic_name, semantic_index,
                                    gs_streams,
                                    base,
                                    usage_mask,
                                    array_id,
                                    semantics.num_slots,
                                    invariant);
   }

   unsigned write_mask;
   if (nir_intrinsic_has_write_mask(instr))
      write_mask = nir_intrinsic_write_mask(instr);
   else
      write_mask = ((1 << instr->num_components) - 1) << *frac;

   if (is_64) {
      write_mask = ntt_64bit_write_mask(write_mask);
      if (*frac >= 2)
         write_mask = write_mask << 2;
   } else {
      write_mask = write_mask << *frac;
   }
   return ureg_writemask(out, write_mask);
}

/* If this reg or SSA def is used only for storing an output, then in the simple
 * cases we can write directly to the TGSI output instead of having store_output
 * emit its own MOV.
 */
static bool
ntt_try_store_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
                             struct list_head *uses, struct list_head *if_uses)
{
   *dst = ureg_dst_undef();

   switch (c->s->info.stage) {
   case MESA_SHADER_FRAGMENT:
   case MESA_SHADER_VERTEX:
      break;
   default:
      /* tgsi_exec (at least) requires that output stores happen per vertex
       * emitted, you don't get to reuse a previous output value for the next
       * vertex.
       */
      return false;
   }

   if (!list_is_empty(if_uses) || !list_is_singular(uses))
      return false;

   nir_src *src = list_first_entry(uses, nir_src, use_link);

   if (src->parent_instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(src->parent_instr);
   if (intr->intrinsic != nir_intrinsic_store_output ||
       !nir_src_is_const(intr->src[1])) {
      return false;
   }

   uint32_t frac;
   *dst = ntt_output_decl(c, intr, &frac);
   dst->Index += ntt_src_as_uint(c, intr->src[1]);

   return frac == 0;
}

static void
ntt_setup_inputs(struct ntt_compile *c)
{
   if (c->s->info.stage != MESA_SHADER_FRAGMENT)
      return;

   unsigned num_inputs = 0;
   int num_input_arrays = 0;

   nir_foreach_shader_in_variable(var, c->s) {
      const struct glsl_type *type = ntt_shader_input_type(c, var);
      unsigned array_len =
         glsl_count_attribute_slots(type, false);

      num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
   }

   c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);

   nir_foreach_shader_in_variable(var, c->s) {
      const struct glsl_type *type = ntt_shader_input_type(c, var);
      unsigned array_len =
         glsl_count_attribute_slots(type, false);

      unsigned interpolation = TGSI_INTERPOLATE_CONSTANT;
      unsigned sample_loc;
      struct ureg_src decl;

      if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
         interpolation =
            tgsi_get_interp_mode(var->data.interpolation,
                                 var->data.location == VARYING_SLOT_COL0 ||
                                 var->data.location == VARYING_SLOT_COL1);

         if (var->data.location == VARYING_SLOT_POS)
            interpolation = TGSI_INTERPOLATE_LINEAR;
      }

      unsigned semantic_name, semantic_index;
      ntt_get_gl_varying_semantic(c, var->data.location,
                                  &semantic_name, &semantic_index);

      if (var->data.sample) {
         sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
      } else if (var->data.centroid) {
         sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
         c->centroid_inputs |= (BITSET_MASK(array_len) <<
                                var->data.driver_location);
      } else {
         sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
      }

      unsigned array_id = 0;
      if (glsl_type_is_array(type))
         array_id = ++num_input_arrays;

      uint32_t usage_mask = ntt_tgsi_var_usage_mask(var);

      decl = ureg_DECL_fs_input_centroid_layout(c->ureg,
                                                semantic_name,
                                                semantic_index,
                                                interpolation,
                                                sample_loc,
                                                var->data.driver_location,
                                                usage_mask,
                                                array_id, array_len);

      if (semantic_name == TGSI_SEMANTIC_FACE) {
         struct ureg_dst temp = ntt_temp(c);
         if (c->native_integers) {
            /* NIR is ~0 front and 0 back, while TGSI is +1 front */
            ntt_SGE(c, temp, decl, ureg_imm1f(c->ureg, 0));
         } else {
            /* tgsi docs say that floating point FACE will be positive for
             * frontface and negative for backface, but realistically
             * GLSL-to-TGSI had been doing MOV_SAT to turn it into 0.0 vs 1.0.
             * Copy that behavior, since some drivers (r300) have been doing a
             * 0.0 vs 1.0 backface (and I don't think anybody has a non-1.0
             * front face).
             */
            temp.Saturate = true;
            ntt_MOV(c, temp, decl);
         }
         decl = ureg_src(temp);
      }

      for (unsigned i = 0; i < array_len; i++) {
         c->input_index_map[var->data.driver_location + i] = decl;
         c->input_index_map[var->data.driver_location + i].Index += i;
      }
   }
}

static int
ntt_sort_by_location(const nir_variable *a, const nir_variable *b)
{
   return a->data.location - b->data.location;
}

/**
 * Workaround for virglrenderer requiring that TGSI FS output color variables
 * are declared in order.  Besides, it's a lot nicer to read the TGSI this way.
 */
static void
ntt_setup_outputs(struct ntt_compile *c)
{
   if (c->s->info.stage != MESA_SHADER_FRAGMENT)
      return;

   nir_sort_variables_with_modes(c->s, ntt_sort_by_location, nir_var_shader_out);

   nir_foreach_shader_out_variable(var, c->s) {
      if (var->data.location == FRAG_RESULT_COLOR)
         ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);

      unsigned semantic_name, semantic_index;
      tgsi_get_gl_frag_result_semantic(var->data.location,
                                       &semantic_name, &semantic_index);

      (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index);
   }
}

static enum tgsi_texture_type
tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array, bool is_shadow)
{
   switch (dim) {
   case GLSL_SAMPLER_DIM_1D:
      if (is_shadow)
         return is_array ? TGSI_TEXTURE_SHADOW1D_ARRAY : TGSI_TEXTURE_SHADOW1D;
      else
         return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
   case GLSL_SAMPLER_DIM_2D:
   case GLSL_SAMPLER_DIM_EXTERNAL:
      if (is_shadow)
         return is_array ? TGSI_TEXTURE_SHADOW2D_ARRAY : TGSI_TEXTURE_SHADOW2D;
      else
         return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
   case GLSL_SAMPLER_DIM_3D:
      return TGSI_TEXTURE_3D;
   case GLSL_SAMPLER_DIM_CUBE:
      if (is_shadow)
         return is_array ? TGSI_TEXTURE_SHADOWCUBE_ARRAY : TGSI_TEXTURE_SHADOWCUBE;
      else
         return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
   case GLSL_SAMPLER_DIM_RECT:
      if (is_shadow)
         return TGSI_TEXTURE_SHADOWRECT;
      else
         return TGSI_TEXTURE_RECT;
   case GLSL_SAMPLER_DIM_MS:
      return is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
   case GLSL_SAMPLER_DIM_BUF:
      return TGSI_TEXTURE_BUFFER;
   default:
      unreachable("unknown sampler dim");
   }
}

static enum tgsi_return_type
tgsi_return_type_from_base_type(enum glsl_base_type type)
{
   switch (type) {
   case GLSL_TYPE_INT:
      return TGSI_RETURN_TYPE_SINT;
   case GLSL_TYPE_UINT:
      return TGSI_RETURN_TYPE_UINT;
   case GLSL_TYPE_FLOAT:
      return TGSI_RETURN_TYPE_FLOAT;
   default:
      unreachable("unexpected texture type");
   }
}

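/* Declares the TGSI resources backing NIR's uniform-ish variables: sampler
 * views and samplers, HW atomic counters, images, UBO constant buffers, and
 * SSBOs.
 */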
static void
ntt_setup_uniforms(struct ntt_compile *c)
{
   nir_foreach_uniform_variable(var, c->s) {
      if (glsl_type_is_sampler(glsl_without_array(var->type)) ||
          glsl_type_is_texture(glsl_without_array(var->type))) {
         /* Don't use this size for the check for samplers -- arrays of structs
          * containing samplers should be ignored, and just the separate lowered
          * sampler uniform decl used.
          */
         int size = glsl_type_get_sampler_count(var->type) +
                    glsl_type_get_texture_count(var->type);

         const struct glsl_type *stype = glsl_without_array(var->type);
         enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(stype),
                                                                            glsl_sampler_type_is_array(stype),
                                                                            glsl_sampler_type_is_shadow(stype));
         enum tgsi_return_type ret_type = tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype));
         for (int i = 0; i < size; i++) {
            ureg_DECL_sampler_view(c->ureg, var->data.binding + i,
               target, ret_type, ret_type, ret_type, ret_type);
            ureg_DECL_sampler(c->ureg, var->data.binding + i);
         }
      } else if (glsl_contains_atomic(var->type)) {
         uint32_t offset = var->data.offset / 4;
         uint32_t size = glsl_atomic_size(var->type) / 4;
         ureg_DECL_hw_atomic(c->ureg, offset, offset + size - 1, var->data.binding, 0);
      }

      /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0
       * size declaration happens with other UBOs below.
       */
   }

   nir_foreach_image_variable(var, c->s) {
      int image_count = glsl_type_get_image_count(var->type);
      const struct glsl_type *itype = glsl_without_array(var->type);
      enum tgsi_texture_type tex_type =
            tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(itype),
                                               glsl_sampler_type_is_array(itype), false);

      for (int i = 0; i < image_count; i++) {
         c->images[var->data.binding] = ureg_DECL_image(c->ureg,
                                                        var->data.binding + i,
                                                        tex_type,
                                                        var->data.image.format,
                                                        !(var->data.access & ACCESS_NON_WRITEABLE),
                                                        false);
      }
   }

   c->first_ubo = ~0;

   unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0};
   nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) {
      int ubo = var->data.driver_location;
      if (ubo == -1)
         continue;

      if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo))
         c->first_ubo = MIN2(c->first_ubo, ubo);

      unsigned size = glsl_get_explicit_size(var->interface_type, false);

      int array_size = 1;
      if (glsl_type_is_interface(glsl_without_array(var->type)))
         array_size = MAX2(1, glsl_get_aoa_size(var->type));

      for (int i = 0; i < array_size; i++) {
         /* Even if multiple NIR variables are in the same uniform block, their
          * explicit size is the size of the block.
          */
         if (ubo_sizes[ubo + i])
            assert(ubo_sizes[ubo + i] == size);

         ubo_sizes[ubo + i] = size;
      }
   }

   for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) {
      if (ubo_sizes[i])
         ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i);
   }

   for (int i = 0; i < c->s->info.num_ssbos; i++) {
      /* XXX: nv50 uses the atomic flag to set caching for (lowered) atomic
       * counters
       */
      bool atomic = false;
      ureg_DECL_buffer(c->ureg, i, atomic);
   }
}

static void
ntt_setup_registers(struct ntt_compile *c, struct exec_list *list)
{
   assert(c->num_temps == 0);
   /* Permanently allocate all the array regs at the start. */
   foreach_list_typed(nir_register, nir_reg, node, list) {
      if (nir_reg->num_array_elems != 0) {
         struct ureg_dst decl = ureg_DECL_array_temporary(c->ureg, nir_reg->num_array_elems, true);
         c->reg_temp[nir_reg->index] = decl;
         assert(c->num_temps == decl.Index);
         c->num_temps += nir_reg->num_array_elems;
      }
   }
   c->first_non_array_temp = c->num_temps;

   /* After that, allocate non-array regs in our virtual space that we'll
    * register-allocate before ureg emit.
    */
   foreach_list_typed(nir_register, nir_reg, node, list) {
      if (nir_reg->num_array_elems == 0) {
         struct ureg_dst decl;
         uint32_t write_mask = BITFIELD_MASK(nir_reg->num_components);
         if (!ntt_try_store_in_tgsi_output(c, &decl, &nir_reg->uses, &nir_reg->if_uses)) {
            if (nir_reg->bit_size == 64) {
               if (nir_reg->num_components > 2) {
                  fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n",
                        nir_reg->num_components, nir_reg->index);
               }

               write_mask = ntt_64bit_write_mask(write_mask);
            }

            decl = ureg_writemask(ntt_temp(c), write_mask);
         }
         c->reg_temp[nir_reg->index] = decl;
      }
   }
}

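/* Turns a nir_load_const into a TGSI immediate.  Non-native-integers drivers
 * get float immediates; otherwise 32-bit uints, with 64-bit values split into
 * pairs of 32-bit channels.
 */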
static struct ureg_src
ntt_get_load_const_src(struct ntt_compile *c, nir_load_const_instr *instr)
{
   int num_components = instr->def.num_components;

   if (!c->native_integers) {
      float values[4];
      assert(instr->def.bit_size == 32);
      for (int i = 0; i < num_components; i++)
         values[i] = uif(instr->value[i].u32);

      return ureg_DECL_immediate(c->ureg, values, num_components);
   } else {
      uint32_t values[4];

      if (instr->def.bit_size == 32) {
         for (int i = 0; i < num_components; i++)
            values[i] = instr->value[i].u32;
      } else {
         assert(num_components <= 2);
         for (int i = 0; i < num_components; i++) {
            values[i * 2 + 0] = instr->value[i].u64 & 0xffffffff;
            values[i * 2 + 1] = instr->value[i].u64 >> 32;
         }
         num_components *= 2;
      }

      return ureg_DECL_immediate_uint(c->ureg, values, num_components);
   }
}

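/* Loads `addr` into one of the TGSI ADDR registers (declaring it on first
 * use) and returns an ADDR.x source for indirect addressing.
 */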
static struct ureg_src
ntt_reladdr(struct ntt_compile *c, struct ureg_src addr, int addr_index)
{
   assert(addr_index < ARRAY_SIZE(c->addr_reg));

   for (int i = 0; i <= addr_index; i++) {
      if (!c->addr_declared[i]) {
         c->addr_reg[i] = ureg_writemask(ureg_DECL_address(c->ureg),
                                         TGSI_WRITEMASK_X);
         c->addr_declared[i] = true;
      }
   }

   if (c->native_integers)
      ntt_UARL(c, c->addr_reg[addr_index], addr);
   else
      ntt_ARL(c, c->addr_reg[addr_index], addr);
   return ureg_scalar(ureg_src(c->addr_reg[addr_index]), 0);
}

static struct ureg_src
ntt_get_src(struct ntt_compile *c, nir_src src)
{
   if (src.is_ssa) {
      if (src.ssa->parent_instr->type == nir_instr_type_load_const)
         return ntt_get_load_const_src(c, nir_instr_as_load_const(src.ssa->parent_instr));

      return c->ssa_temp[src.ssa->index];
   } else {
      nir_register *reg = src.reg.reg;
      struct ureg_dst reg_temp = c->reg_temp[reg->index];
      reg_temp.Index += src.reg.base_offset;

      if (src.reg.indirect) {
         struct ureg_src offset = ntt_get_src(c, *src.reg.indirect);
         return ureg_src_indirect(ureg_src(reg_temp),
                                  ntt_reladdr(c, offset, 0));
      } else {
         return ureg_src(reg_temp);
      }
   }
}

static struct ureg_src
ntt_get_alu_src(struct ntt_compile *c, nir_alu_instr *instr, int i)
{
   nir_alu_src src = instr->src[i];
   struct ureg_src usrc = ntt_get_src(c, src.src);

   /* Expand double/dvec2 src references to TGSI swizzles using a pair of 32-bit
    * channels.  We skip this for undefs, as those don't get split to vec2s (but
    * the specific swizzles from an undef don't matter)
    */
   if (nir_src_bit_size(src.src) == 64 &&
      !(src.src.is_ssa && src.src.ssa->parent_instr->type == nir_instr_type_ssa_undef)) {
      int chan0 = 0, chan1 = 1;
      if (nir_op_infos[instr->op].input_sizes[i] == 0) {
         chan0 = ffs(instr->dest.write_mask) - 1;
         chan1 = ffs(instr->dest.write_mask & ~(1 << chan0)) - 1;
         if (chan1 == -1)
            chan1 = chan0;
      }
      usrc = ureg_swizzle(usrc,
                          src.swizzle[chan0] * 2,
                          src.swizzle[chan0] * 2 + 1,
                          src.swizzle[chan1] * 2,
                          src.swizzle[chan1] * 2 + 1);
   } else {
      usrc = ureg_swizzle(usrc,
                          src.swizzle[0],
                          src.swizzle[1],
                          src.swizzle[2],
                          src.swizzle[3]);
   }

   if (src.abs)
      usrc = ureg_abs(usrc);
   if (src.negate)
      usrc = ureg_negate(usrc);

   return usrc;
}

/* Reswizzles a source so that the unset channels in the write mask still refer
 * to one of the channels present in the write mask.
 */
static struct ureg_src
ntt_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
{
   assert(write_mask);
   int first_chan = ffs(write_mask) - 1;
   return ureg_swizzle(src,
                       (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : first_chan,
                       (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : first_chan,
                       (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : first_chan,
                       (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : first_chan);
}

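/* Allocates the TGSI destination backing an SSA def (or a TGSI output, when
 * the def feeds only a store_output), and caches a read swizzle for later
 * uses of the def.
 */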
static struct ureg_dst
ntt_get_ssa_def_decl(struct ntt_compile *c, nir_ssa_def *ssa)
{
   uint32_t writemask = BITSET_MASK(ssa->num_components);
   if (ssa->bit_size == 64)
      writemask = ntt_64bit_write_mask(writemask);

   struct ureg_dst dst;
   if (!ntt_try_store_in_tgsi_output(c, &dst, &ssa->uses, &ssa->if_uses))
      dst = ntt_temp(c);

   c->ssa_temp[ssa->index] = ntt_swizzle_for_write_mask(ureg_src(dst), writemask);

   return ureg_writemask(dst, writemask);
}

static struct ureg_dst
ntt_get_dest_decl(struct ntt_compile *c, nir_dest *dest)
{
   if (dest->is_ssa)
      return ntt_get_ssa_def_decl(c, &dest->ssa);
   else
      return c->reg_temp[dest->reg.reg->index];
}

static struct ureg_dst
ntt_get_dest(struct ntt_compile *c, nir_dest *dest)
{
   struct ureg_dst dst = ntt_get_dest_decl(c, dest);

   if (!dest->is_ssa) {
      dst.Index += dest->reg.base_offset;

      if (dest->reg.indirect) {
         struct ureg_src offset = ntt_get_src(c, *dest->reg.indirect);
         dst = ureg_dst_indirect(dst, ntt_reladdr(c, offset, 0));
      }
   }

   return dst;
}

/* For an SSA dest being populated by a constant src, replace the storage with
 * a copy of the ureg_src.
 */
static void
ntt_store_def(struct ntt_compile *c, nir_ssa_def *def, struct ureg_src src)
{
   if (!src.Indirect && !src.DimIndirect) {
      switch (src.File) {
      case TGSI_FILE_IMMEDIATE:
      case TGSI_FILE_INPUT:
      case TGSI_FILE_CONSTANT:
      case TGSI_FILE_SYSTEM_VALUE:
         c->ssa_temp[def->index] = src;
         return;
      }
   }

   ntt_MOV(c, ntt_get_ssa_def_decl(c, def), src);
}

static void
ntt_store(struct ntt_compile *c, nir_dest *dest, struct ureg_src src)
{
   if (dest->is_ssa)
      ntt_store_def(c, &dest->ssa, src);
   else {
      struct ureg_dst dst = ntt_get_dest(c, dest);
      ntt_MOV(c, dst, src);
   }
}

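/* Emits a TGSI scalar opcode (RCP, POW, etc.) once per channel enabled in
 * dst's writemask, replicating the corresponding source channel each time.
 */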
static void
ntt_emit_scalar(struct ntt_compile *c, unsigned tgsi_op,
                struct ureg_dst dst,
                struct ureg_src src0,
                struct ureg_src src1)
{
   unsigned i;

   /* POW is the only 2-operand scalar op. */
   if (tgsi_op != TGSI_OPCODE_POW)
      src1 = src0;

   for (i = 0; i < 4; i++) {
      if (dst.WriteMask & (1 << i)) {
         ntt_insn(c, tgsi_op,
                  ureg_writemask(dst, 1 << i),
                  ureg_scalar(src0, i),
                  ureg_scalar(src1, i),
                  ureg_src_undef(), ureg_src_undef());
      }
   }
}

static void
ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
{
   struct ureg_src src[4];
   struct ureg_dst dst;
   unsigned i;
   int dst_64 = nir_dest_bit_size(instr->dest.dest) == 64;
   int src_64 = nir_src_bit_size(instr->src[0].src) == 64;
   int num_srcs = nir_op_infos[instr->op].num_inputs;

   c->precise = instr->exact;

   assert(num_srcs <= ARRAY_SIZE(src));
   for (i = 0; i < num_srcs; i++)
      src[i] = ntt_get_alu_src(c, instr, i);
   for (; i < ARRAY_SIZE(src); i++)
      src[i] = ureg_src_undef();

   dst = ntt_get_dest(c, &instr->dest.dest);

   if (instr->dest.saturate)
      dst.Saturate = true;

   if (dst_64)
      dst = ureg_writemask(dst, ntt_64bit_write_mask(instr->dest.write_mask));
   else
      dst = ureg_writemask(dst, instr->dest.write_mask);

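   /* Table mapping a nir_op to its 32-bit and 64-bit TGSI opcodes; a 0 entry
    * means that case is special-cased below the table.
    */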
   static enum tgsi_opcode op_map[][2] = {
      [nir_op_mov] = { TGSI_OPCODE_MOV, TGSI_OPCODE_MOV },

      /* fabs/fneg 32-bit are special-cased below. */
      [nir_op_fabs] = { 0, TGSI_OPCODE_DABS },
      [nir_op_fneg] = { 0, TGSI_OPCODE_DNEG },

      [nir_op_fdot2] = { TGSI_OPCODE_DP2 },
      [nir_op_fdot3] = { TGSI_OPCODE_DP3 },
      [nir_op_fdot4] = { TGSI_OPCODE_DP4 },
      [nir_op_fdot2_replicated] = { TGSI_OPCODE_DP2 },
      [nir_op_fdot3_replicated] = { TGSI_OPCODE_DP3 },
      [nir_op_fdot4_replicated] = { TGSI_OPCODE_DP4 },
      [nir_op_ffloor] = { TGSI_OPCODE_FLR, TGSI_OPCODE_DFLR },
      [nir_op_ffract] = { TGSI_OPCODE_FRC, TGSI_OPCODE_DFRAC },
      [nir_op_fceil] = { TGSI_OPCODE_CEIL, TGSI_OPCODE_DCEIL },
      [nir_op_fround_even] = { TGSI_OPCODE_ROUND, TGSI_OPCODE_DROUND },
      [nir_op_fdiv] = { TGSI_OPCODE_DIV, TGSI_OPCODE_DDIV },
      [nir_op_idiv] = { TGSI_OPCODE_IDIV, TGSI_OPCODE_I64DIV },
      [nir_op_udiv] = { TGSI_OPCODE_UDIV, TGSI_OPCODE_U64DIV },

      [nir_op_frcp] = { 0, TGSI_OPCODE_DRCP },
      [nir_op_frsq] = { 0, TGSI_OPCODE_DRSQ },
      [nir_op_fsqrt] = { 0, TGSI_OPCODE_DSQRT },

      /* The conversions will have one combination of src and dst bitsize. */
      [nir_op_f2f32] = { 0, TGSI_OPCODE_D2F },
      [nir_op_f2f64] = { TGSI_OPCODE_F2D },
      [nir_op_i2i64] = { TGSI_OPCODE_I2I64 },

      [nir_op_f2i32] = { TGSI_OPCODE_F2I, TGSI_OPCODE_D2I },
      [nir_op_f2i64] = { TGSI_OPCODE_F2I64, TGSI_OPCODE_D2I64 },
      [nir_op_f2u32] = { TGSI_OPCODE_F2U, TGSI_OPCODE_D2U },
      [nir_op_f2u64] = { TGSI_OPCODE_F2U64, TGSI_OPCODE_D2U64 },
      [nir_op_i2f32] = { TGSI_OPCODE_I2F, TGSI_OPCODE_I642F },
      [nir_op_i2f64] = { TGSI_OPCODE_I2D, TGSI_OPCODE_I642D },
      [nir_op_u2f32] = { TGSI_OPCODE_U2F, TGSI_OPCODE_U642F },
      [nir_op_u2f64] = { TGSI_OPCODE_U2D, TGSI_OPCODE_U642D },

      [nir_op_slt] = { TGSI_OPCODE_SLT },
      [nir_op_sge] = { TGSI_OPCODE_SGE },
      [nir_op_seq] = { TGSI_OPCODE_SEQ },
      [nir_op_sne] = { TGSI_OPCODE_SNE },

      [nir_op_flt32] = { TGSI_OPCODE_FSLT, TGSI_OPCODE_DSLT },
      [nir_op_fge32] = { TGSI_OPCODE_FSGE, TGSI_OPCODE_DSGE },
      [nir_op_feq32] = { TGSI_OPCODE_FSEQ, TGSI_OPCODE_DSEQ },
      [nir_op_fneu32] = { TGSI_OPCODE_FSNE, TGSI_OPCODE_DSNE },

      [nir_op_ilt32] = { TGSI_OPCODE_ISLT, TGSI_OPCODE_I64SLT },
      [nir_op_ige32] = { TGSI_OPCODE_ISGE, TGSI_OPCODE_I64SGE },
      [nir_op_ieq32] = { TGSI_OPCODE_USEQ, TGSI_OPCODE_U64SEQ },
      [nir_op_ine32] = { TGSI_OPCODE_USNE, TGSI_OPCODE_U64SNE },

      [nir_op_ult32] = { TGSI_OPCODE_USLT, TGSI_OPCODE_U64SLT },
      [nir_op_uge32] = { TGSI_OPCODE_USGE, TGSI_OPCODE_U64SGE },

      [nir_op_iabs] = { TGSI_OPCODE_IABS, TGSI_OPCODE_I64ABS },
      [nir_op_ineg] = { TGSI_OPCODE_INEG, TGSI_OPCODE_I64NEG },
      [nir_op_fsign] = { TGSI_OPCODE_SSG },
      [nir_op_isign] = { TGSI_OPCODE_ISSG },
      [nir_op_ftrunc] = { TGSI_OPCODE_TRUNC, TGSI_OPCODE_DTRUNC },
      [nir_op_fddx] = { TGSI_OPCODE_DDX },
      [nir_op_fddy] = { TGSI_OPCODE_DDY },
      [nir_op_fddx_coarse] = { TGSI_OPCODE_DDX },
      [nir_op_fddy_coarse] = { TGSI_OPCODE_DDY },
      [nir_op_fddx_fine] = { TGSI_OPCODE_DDX_FINE },
      [nir_op_fddy_fine] = { TGSI_OPCODE_DDY_FINE },
      [nir_op_pack_half_2x16] = { TGSI_OPCODE_PK2H },
      [nir_op_unpack_half_2x16] = { TGSI_OPCODE_UP2H },
      [nir_op_ibitfield_extract] = { TGSI_OPCODE_IBFE },
      [nir_op_ubitfield_extract] = { TGSI_OPCODE_UBFE },
      [nir_op_bitfield_insert] = { TGSI_OPCODE_BFI },
      [nir_op_bitfield_reverse] = { TGSI_OPCODE_BREV },
      [nir_op_bit_count] = { TGSI_OPCODE_POPC },
      [nir_op_ifind_msb] = { TGSI_OPCODE_IMSB },
      [nir_op_ufind_msb] = { TGSI_OPCODE_UMSB },
      [nir_op_find_lsb] = { TGSI_OPCODE_LSB },
      [nir_op_fadd] = { TGSI_OPCODE_ADD, TGSI_OPCODE_DADD },
      [nir_op_iadd] = { TGSI_OPCODE_UADD, TGSI_OPCODE_U64ADD },
      [nir_op_fmul] = { TGSI_OPCODE_MUL, TGSI_OPCODE_DMUL },
      [nir_op_imul] = { TGSI_OPCODE_UMUL, TGSI_OPCODE_U64MUL },
      [nir_op_imod] = { TGSI_OPCODE_MOD, TGSI_OPCODE_I64MOD },
      [nir_op_umod] = { TGSI_OPCODE_UMOD, TGSI_OPCODE_U64MOD },
      [nir_op_imul_high] = { TGSI_OPCODE_IMUL_HI },
      [nir_op_umul_high] = { TGSI_OPCODE_UMUL_HI },
      [nir_op_ishl] = { TGSI_OPCODE_SHL, TGSI_OPCODE_U64SHL },
      [nir_op_ishr] = { TGSI_OPCODE_ISHR, TGSI_OPCODE_I64SHR },
1433       [nir_op_ushr] = { TGSI_OPCODE_USHR, TGSI_OPCODE_U64SHR },
1434 
1435       /* These bitwise ops don't care about 32 vs 64 types, so they have the
1436        * same TGSI op.
1437        */
1438       [nir_op_inot] = { TGSI_OPCODE_NOT, TGSI_OPCODE_NOT },
1439       [nir_op_iand] = { TGSI_OPCODE_AND, TGSI_OPCODE_AND },
1440       [nir_op_ior] = { TGSI_OPCODE_OR, TGSI_OPCODE_OR },
1441       [nir_op_ixor] = { TGSI_OPCODE_XOR, TGSI_OPCODE_XOR },
1442 
1443       [nir_op_fmin] = { TGSI_OPCODE_MIN, TGSI_OPCODE_DMIN },
1444       [nir_op_imin] = { TGSI_OPCODE_IMIN, TGSI_OPCODE_I64MIN },
1445       [nir_op_umin] = { TGSI_OPCODE_UMIN, TGSI_OPCODE_U64MIN },
1446       [nir_op_fmax] = { TGSI_OPCODE_MAX, TGSI_OPCODE_DMAX },
1447       [nir_op_imax] = { TGSI_OPCODE_IMAX, TGSI_OPCODE_I64MAX },
1448       [nir_op_umax] = { TGSI_OPCODE_UMAX, TGSI_OPCODE_U64MAX },
1449       [nir_op_ffma] = { TGSI_OPCODE_MAD, TGSI_OPCODE_DMAD },
1450       [nir_op_ldexp] = { TGSI_OPCODE_LDEXP, 0 },
1451    };
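   /* Sketch of how the table is consumed below: op_map[instr->op][src_64]
    * selects the TGSI opcode, e.g. op_map[nir_op_fadd][0] is TGSI_OPCODE_ADD
    * and op_map[nir_op_fadd][1] is TGSI_OPCODE_DADD.  An entry left at 0 is
    * treated as "no direct mapping" and falls through to the special-case
    * switch further down.
    */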
1452 
1453    if (src_64 && !dst_64) {
1454       if (num_srcs == 2 || nir_op_infos[instr->op].output_type == nir_type_bool32) {
1455          /* TGSI's 64-bit compares storing to 32-bit are weird and write .xz
1456           * instead of .xy.
1457           */
1458          assert(!(dst.WriteMask & TGSI_WRITEMASK_YW));
1459       } else {
1460          /* TGSI 64-bit-to-32-bit conversions only generate results in the .xy
1461           * channels and will need to get fixed up.
1462           */
1463          assert(!(dst.WriteMask & TGSI_WRITEMASK_ZW));
1464       }
1465    }
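   /* Example of the constraint asserted above: a two-channel 64-bit compare
    * such as DSLT produces its 32-bit bools in dst.x and dst.z, so a NIR
    * writemask touching .y or .w could not be honored on this path.
    */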
1466 
1467    bool table_op64 = src_64;
1468    if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op][table_op64] != 0) {
1469       /* The normal path for NIR to TGSI ALU op translation */
1470       ntt_insn(c, op_map[instr->op][table_op64],
1471                 dst, src[0], src[1], src[2], src[3]);
1472    } else {
1473       /* Special cases for NIR to TGSI ALU op translation. */
1474 
1475       /* TODO: Use something like the ntt_store() path for the MOV calls so we
1476        * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
1477        */
1478 
1479       switch (instr->op) {
1480       case nir_op_u2u64:
1481          ntt_AND(c, dst, ureg_swizzle(src[0],
1482                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1483                                              TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1484                   ureg_imm4u(c->ureg, ~0, 0, ~0, 0));
1485          break;
1486 
1487       case nir_op_i2i32:
1488       case nir_op_u2u32:
1489          assert(src_64);
1490          ntt_MOV(c, dst, ureg_swizzle(src[0],
1491                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
1492                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_X));
1493          break;
1494 
1495       case nir_op_fabs:
1496          if (c->options->lower_fabs)
1497             ntt_MAX(c, dst, src[0], ureg_negate(src[0]));
1498          else
1499             ntt_MOV(c, dst, ureg_abs(src[0]));
1500          break;
1501 
1502       case nir_op_fsat:
1503          if (dst_64) {
1504             ntt_MIN(c, dst, src[0], ntt_64bit_1f(c));
1505             ntt_MAX(c, dst, ureg_src(dst), ureg_imm1u(c->ureg, 0));
1506          } else {
1507             ntt_MOV(c, ureg_saturate(dst), src[0]);
1508          }
1509          break;
1510 
1511       case nir_op_fneg:
1512          ntt_MOV(c, dst, ureg_negate(src[0]));
1513          break;
1514 
1515          /* NOTE: TGSI 32-bit math ops have the old "one source channel
1516           * replicated to all dst channels" behavior, while 64 is normal mapping
1517           * of src channels to dst.
1518           */
1519       case nir_op_frcp:
1520          assert(!dst_64);
1521          ntt_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], ureg_src_undef());
1522          break;
1523 
1524       case nir_op_frsq:
1525          assert(!dst_64);
1526          ntt_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], ureg_src_undef());
1527          break;
1528 
1529       case nir_op_fsqrt:
1530          assert(!dst_64);
1531          ntt_emit_scalar(c, TGSI_OPCODE_SQRT, dst, src[0], ureg_src_undef());
1532          break;
1533 
1534       case nir_op_fexp2:
1535          assert(!dst_64);
1536          ntt_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], ureg_src_undef());
1537          break;
1538 
1539       case nir_op_flog2:
1540          assert(!dst_64);
1541          ntt_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], ureg_src_undef());
1542          break;
1543 
1544       case nir_op_b2f32:
1545          ntt_AND(c, dst, src[0], ureg_imm1f(c->ureg, 1.0));
1546          break;
1547 
1548       case nir_op_b2f64:
1549          ntt_AND(c, dst,
1550                   ureg_swizzle(src[0],
1551                                TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1552                                TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1553                   ntt_64bit_1f(c));
1554          break;
1555 
1556       case nir_op_f2b32:
1557          if (src_64)
1558             ntt_DSNE(c, dst, src[0], ureg_imm1f(c->ureg, 0));
1559          else
1560             ntt_FSNE(c, dst, src[0], ureg_imm1f(c->ureg, 0));
1561          break;
1562 
1563       case nir_op_i2b32:
1564          if (src_64)
1565             ntt_U64SNE(c, dst, src[0], ureg_imm1u(c->ureg, 0));
1566          else
1567             ntt_USNE(c, dst, src[0], ureg_imm1u(c->ureg, 0));
1568          break;
1569 
1570       case nir_op_b2i32:
1571          ntt_AND(c, dst, src[0], ureg_imm1u(c->ureg, 1));
1572          break;
1573 
1574       case nir_op_b2i64:
1575          ntt_AND(c, dst,
1576                   ureg_swizzle(src[0],
1577                                TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1578                                TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1579                   ureg_imm4u(c->ureg, 1, 0, 1, 0));
1580          break;
1581 
1582       case nir_op_fsin:
1583          ntt_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], ureg_src_undef());
1584          break;
1585 
1586       case nir_op_fcos:
1587          ntt_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], ureg_src_undef());
1588          break;
1589 
1590       case nir_op_fsub:
1591          assert(!dst_64);
1592          ntt_ADD(c, dst, src[0], ureg_negate(src[1]));
1593          break;
1594 
1595       case nir_op_isub:
1596          assert(!dst_64);
1597          ntt_UADD(c, dst, src[0], ureg_negate(src[1]));
1598          break;
1599 
1600       case nir_op_fmod:
1601          unreachable("should be handled by .lower_fmod = true");
1602          break;
1603 
1604       case nir_op_fpow:
1605          ntt_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
1606          break;
1607 
1608       case nir_op_flrp:
1609          ntt_LRP(c, dst, src[2], src[1], src[0]);
1610          break;
1611 
1612       case nir_op_pack_64_2x32_split:
1613          ntt_MOV(c, ureg_writemask(dst, TGSI_WRITEMASK_XZ),
1614                   ureg_swizzle(src[0],
1615                                TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1616                                TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1617          ntt_MOV(c, ureg_writemask(dst, TGSI_WRITEMASK_YW),
1618                   ureg_swizzle(src[1],
1619                                TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1620                                TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1621          break;
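      /* Layout sketch for the two MOVs above: src[0] carries the low dwords
       * and src[1] the high dwords, so a two-component result ends up as
       * dst = { lo0, hi0, lo1, hi1 }: one 64-bit value in .xy, one in .zw.
       */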
1622 
1623       case nir_op_unpack_64_2x32_split_x:
1624          ntt_MOV(c, dst, ureg_swizzle(src[0],
1625                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
1626                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z));
1627          break;
1628 
1629       case nir_op_unpack_64_2x32_split_y:
1630          ntt_MOV(c, dst, ureg_swizzle(src[0],
1631                                              TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W,
1632                                              TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W));
1633          break;
1634 
1635       case nir_op_b32csel:
1636          if (nir_src_bit_size(instr->src[1].src) == 64) {
1637             ntt_UCMP(c, dst, ureg_swizzle(src[0],
1638                                                  TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1639                                                  TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1640                       src[1], src[2]);
1641          } else {
1642             ntt_UCMP(c, dst, src[0], src[1], src[2]);
1643          }
1644          break;
1645 
1646       case nir_op_fcsel:
1647          /* NIR fcsel is src0 != 0 ? src1 : src2.
1648           * TGSI CMP is src0 < 0 ? src1 : src2.
1649           *
1650           * However, fcsel so far as I can find only appears on bools-as-floats
1651           * (1.0 or 0.0), so we can just negate it for the TGSI op.  It's
1652           * important to not have an abs here, as i915g has to make extra
1653           * instructions to do the abs.
1654           */
1655          if (c->options->lower_cmp) {
1656             /* If the HW doesn't support TGSI CMP (r300 VS), then lower it to a
1657              * LRP on the boolean 1.0/0.0 value, instead of requiring the
1658              * backend to turn the src0 into 1.0/0.0 first.
1659              *
1660              * We don't use this in general because on some hardware (i915 FS)
1661              * the LRP gets expanded to MUL/MAD.
1662              */
1663             ntt_LRP(c, dst, src[0], src[1], src[2]);
1664          } else {
1665             ntt_CMP(c, dst, ureg_negate(src[0]), src[1], src[2]);
1666          }
1667          break;
1668 
1669          /* It would be nice if we could get this left as scalar in NIR, since
1670           * the TGSI op is scalar.
1671           */
1672       case nir_op_frexp_sig:
1673       case nir_op_frexp_exp: {
1674          assert(src_64);
1675          struct ureg_dst temp = ntt_temp(c);
1676 
1677          for (int chan = 0; chan < 2; chan++) {
1678             int wm = 1 << chan;
1679 
1680             if (!(instr->dest.write_mask & wm))
1681                continue;
1682 
1683             struct ureg_dst dsts[2] = { temp, temp };
1684             if (instr->op == nir_op_frexp_sig) {
1685                dsts[0] = ureg_writemask(dst, ntt_64bit_write_mask(wm));
1686             } else {
1687                dsts[1] = ureg_writemask(dst, wm);
1688             }
1689 
1690             struct ureg_src chan_src = ureg_swizzle(src[0],
1691                                                     chan * 2, chan * 2 + 1,
1692                                                     chan * 2, chan * 2 + 1);
1693 
1694             struct ntt_insn *insn = ntt_insn(c, TGSI_OPCODE_DFRACEXP,
1695                                              dsts[0], chan_src,
1696                                              ureg_src_undef(),
1697                                              ureg_src_undef(),
1698                                              ureg_src_undef());
1699             insn->dst[1] = dsts[1];
1700          }
1701          break;
1702       }
1703 
1704       case nir_op_ldexp:
1705          assert(dst_64); /* 32bit handled in table. */
1706          ntt_DLDEXP(c, dst, src[0],
1707                      ureg_swizzle(src[1],
1708                                   TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1709                                   TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1710          break;
1711 
1712       case nir_op_vec4:
1713       case nir_op_vec3:
1714       case nir_op_vec2:
1715          unreachable("covered by nir_lower_vec_to_movs()");
1716 
1717       default:
1718          fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
1719          unreachable("Unknown NIR opcode");
1720       }
1721    }
1722 
1723    c->precise = false;
1724 }
1725 
1726 static struct ureg_src
1727 ntt_ureg_src_indirect(struct ntt_compile *c, struct ureg_src usrc,
1728                       nir_src src, int addr_reg)
1729 {
1730    if (nir_src_is_const(src)) {
1731       usrc.Index += ntt_src_as_uint(c, src);
1732       return usrc;
1733    } else {
1734       return ureg_src_indirect(usrc, ntt_reladdr(c, ntt_get_src(c, src), addr_reg));
1735    }
1736 }
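
/* Usage sketch (the TGSI shapes are illustrative, not literal dumps): a
 * constant NIR src of 3 is folded into the register index, giving e.g.
 * CONST[3], while a non-constant src goes through ntt_reladdr() and renders
 * as something like CONST[ADDR[0].x].
 */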
1737 
1738 static struct ureg_dst
1739 ntt_ureg_dst_indirect(struct ntt_compile *c, struct ureg_dst dst,
1740                       nir_src src)
1741 {
1742    if (nir_src_is_const(src)) {
1743       dst.Index += ntt_src_as_uint(c, src);
1744       return dst;
1745    } else {
1746       return ureg_dst_indirect(dst, ntt_reladdr(c, ntt_get_src(c, src), 0));
1747    }
1748 }
1749 
1750 static struct ureg_src
1751 ntt_ureg_src_dimension_indirect(struct ntt_compile *c, struct ureg_src usrc,
1752                          nir_src src)
1753 {
1754    if (nir_src_is_const(src)) {
1755       return ureg_src_dimension(usrc, ntt_src_as_uint(c, src));
1756    }
1757    else
1758    {
1759       return ureg_src_dimension_indirect(usrc,
1760                                          ntt_reladdr(c, ntt_get_src(c, src), 1),
1761                                          0);
1762    }
1763 }
1764 
1765 static struct ureg_dst
1766 ntt_ureg_dst_dimension_indirect(struct ntt_compile *c, struct ureg_dst udst,
1767                                 nir_src src)
1768 {
1769    if (nir_src_is_const(src)) {
1770       return ureg_dst_dimension(udst, ntt_src_as_uint(c, src));
1771    } else {
1772       return ureg_dst_dimension_indirect(udst,
1773                                          ntt_reladdr(c, ntt_get_src(c, src), 1),
1774                                          0);
1775    }
1776 }
1777 /* Some load operations in NIR will have a fractional offset that we need to
1778  * swizzle down before storing to the result register.
1779  */
1780 static struct ureg_src
1781 ntt_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
1782 {
1783    return ureg_swizzle(src,
1784                        frac,
1785                        frac + MIN2(num_components - 1, 1),
1786                        frac + MIN2(num_components - 1, 2),
1787                        frac + MIN2(num_components - 1, 3));
1788 }
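
/* Worked example: frac = 1 with num_components = 2 produces the swizzle
 * (1, 2, 2, 2), i.e. src.yzzz: the two live values shift down to start at .x,
 * and the trailing channels repeat the last component to stay in bounds.
 */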
1789 
1790 
1791 static void
1792 ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
1793 {
1794    int bit_size = nir_dest_bit_size(instr->dest);
1795    assert(bit_size == 32 || instr->num_components <= 2);
1796 
1797    struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);
1798 
1799    struct ureg_dst addr_temp = ureg_dst_undef();
1800 
1801    if (nir_src_is_const(instr->src[0])) {
1802       src = ureg_src_dimension(src, ntt_src_as_uint(c, instr->src[0]));
1803    } else {
1804       /* virglrenderer requires that indirect UBO references have the UBO
1805        * array's base index in the Index field, not added to the indirect
1806        * address.
1807        *
1808        * Many nir intrinsics have a base address const value for the start of
1809        * their array indirection, but load_ubo doesn't.  We fake it by
1810        * subtracting it off here.
1811        */
1812       addr_temp = ntt_temp(c);
1813       ntt_UADD(c, addr_temp, ntt_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo));
1814       src = ureg_src_dimension_indirect(src,
1815                                          ntt_reladdr(c, ureg_src(addr_temp), 1),
1816                                          c->first_ubo);
1817    }
1818 
1819    if (instr->intrinsic == nir_intrinsic_load_ubo_vec4) {
1820       /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const
1821        * file.
1822        */
1823       src.Index = nir_intrinsic_base(instr);
1824 
1825       if (nir_src_is_const(instr->src[1])) {
1826          src.Index += ntt_src_as_uint(c, instr->src[1]);
1827       } else {
1828          src = ureg_src_indirect(src, ntt_reladdr(c, ntt_get_src(c, instr->src[1]), 0));
1829       }
1830 
1831       int start_component = nir_intrinsic_component(instr);
1832       if (bit_size == 64)
1833          start_component *= 2;
1834 
1835       src = ntt_shift_by_frac(src, start_component,
1836                               instr->num_components * bit_size / 32);
1837 
1838       ntt_store(c, &instr->dest, src);
1839    } else {
1840       /* PIPE_CAP_LOAD_CONSTBUF: Not necessarily vec4 aligned, emit a
1841        * TGSI_OPCODE_LOAD instruction from the const file.
1842        */
1843       struct ntt_insn *insn =
1844          ntt_insn(c, TGSI_OPCODE_LOAD,
1845                   ntt_get_dest(c, &instr->dest),
1846                   src, ntt_get_src(c, instr->src[1]),
1847                   ureg_src_undef(), ureg_src_undef());
1848       insn->is_mem = true;
1849       insn->tex_target = 0;
1850       insn->mem_qualifier = 0;
1851       insn->mem_format = 0; /* unused */
1852    }
1853 }
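
/* Sketch of the two paths above (illustrative TGSI, not literal output): the
 * load_ubo_vec4 path turns into a plain const-file reference such as
 * CONST[2][5], possibly reladdr-indexed, while the PIPE_CAP_LOAD_CONSTBUF
 * path emits "LOAD dst, CONST[2], offset" for access that need not be
 * vec4-aligned.
 */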
1854 
1855 static unsigned
1856 ntt_get_access_qualifier(nir_intrinsic_instr *instr)
1857 {
1858    enum gl_access_qualifier access = nir_intrinsic_access(instr);
1859    unsigned qualifier = 0;
1860 
1861    if (access & ACCESS_COHERENT)
1862       qualifier |= TGSI_MEMORY_COHERENT;
1863    if (access & ACCESS_VOLATILE)
1864       qualifier |= TGSI_MEMORY_VOLATILE;
1865    if (access & ACCESS_RESTRICT)
1866       qualifier |= TGSI_MEMORY_RESTRICT;
1867 
1868    return qualifier;
1869 }
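
/* Example mapping: a NIR access of (ACCESS_COHERENT | ACCESS_RESTRICT) yields
 * TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT; other NIR access bits have no
 * TGSI equivalent here and are dropped.
 */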
1870 
1871 static void
1872 ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
1873              nir_variable_mode mode)
1874 {
1875    bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
1876                     instr->intrinsic == nir_intrinsic_store_shared);
1877    bool is_load = (instr->intrinsic == nir_intrinsic_atomic_counter_read ||
1878                     instr->intrinsic == nir_intrinsic_load_ssbo ||
1879                     instr->intrinsic == nir_intrinsic_load_shared);
1880    unsigned opcode;
1881    struct ureg_src src[4];
1882    int num_src = 0;
1883    int next_src;
1884    struct ureg_dst addr_temp = ureg_dst_undef();
1885 
1886    struct ureg_src memory;
1887    switch (mode) {
1888    case nir_var_mem_ssbo:
1889       memory = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_BUFFER, 0),
1890                                      instr->src[is_store ? 1 : 0], 2);
1891       next_src = 1;
1892       break;
1893    case nir_var_mem_shared:
1894       memory = ureg_src_register(TGSI_FILE_MEMORY, 0);
1895       next_src = 0;
1896       break;
1897    case nir_var_uniform: { /* HW atomic buffers */
1898       nir_src src = instr->src[0];
1899       uint32_t offset = ntt_extract_const_src_offset(&src) / 4;
1900       memory = ureg_src_register(TGSI_FILE_HW_ATOMIC, offset);
1901       /* ntt_ureg_src_indirect, except dividing by 4 */
1902       if (nir_src_is_const(src)) {
1903          memory.Index += nir_src_as_uint(src) / 4;
1904       } else {
1905          addr_temp = ntt_temp(c);
1906          ntt_USHR(c, addr_temp, ntt_get_src(c, src), ureg_imm1i(c->ureg, 2));
1907          memory = ureg_src_indirect(memory, ntt_reladdr(c, ureg_src(addr_temp), 2));
1908       }
1909       memory = ureg_src_dimension(memory, nir_intrinsic_base(instr));
1910       next_src = 0;
1911       break;
1912    }
1913 
1914    default:
1915       unreachable("unknown memory type");
1916    }
1917 
1918    if (is_store) {
1919       src[num_src++] = ntt_get_src(c, instr->src[next_src + 1]); /* offset */
1920       src[num_src++] = ntt_get_src(c, instr->src[0]); /* value */
1921    } else {
1922       src[num_src++] = memory;
1923       if (instr->intrinsic != nir_intrinsic_get_ssbo_size) {
1924          src[num_src++] = ntt_get_src(c, instr->src[next_src++]); /* offset */
1925          switch (instr->intrinsic) {
1926          case nir_intrinsic_atomic_counter_inc:
1927             src[num_src++] = ureg_imm1i(c->ureg, 1);
1928             break;
1929          case nir_intrinsic_atomic_counter_post_dec:
1930             src[num_src++] = ureg_imm1i(c->ureg, -1);
1931             break;
1932          default:
1933             if (!is_load)
1934                src[num_src++] = ntt_get_src(c, instr->src[next_src++]); /* value */
1935             break;
1936          }
1937       }
1938    }
1939 
1940 
1941    switch (instr->intrinsic) {
1942    case nir_intrinsic_atomic_counter_add:
1943    case nir_intrinsic_atomic_counter_inc:
1944    case nir_intrinsic_atomic_counter_post_dec:
1945    case nir_intrinsic_ssbo_atomic_add:
1946    case nir_intrinsic_shared_atomic_add:
1947       opcode = TGSI_OPCODE_ATOMUADD;
1948       break;
1949    case nir_intrinsic_ssbo_atomic_fadd:
1950    case nir_intrinsic_shared_atomic_fadd:
1951       opcode = TGSI_OPCODE_ATOMFADD;
1952       break;
1953    case nir_intrinsic_atomic_counter_min:
1954    case nir_intrinsic_ssbo_atomic_imin:
1955    case nir_intrinsic_shared_atomic_imin:
1956       opcode = TGSI_OPCODE_ATOMIMIN;
1957       break;
1958    case nir_intrinsic_atomic_counter_max:
1959    case nir_intrinsic_ssbo_atomic_imax:
1960    case nir_intrinsic_shared_atomic_imax:
1961       opcode = TGSI_OPCODE_ATOMIMAX;
1962       break;
1963    case nir_intrinsic_ssbo_atomic_umin:
1964    case nir_intrinsic_shared_atomic_umin:
1965       opcode = TGSI_OPCODE_ATOMUMIN;
1966       break;
1967    case nir_intrinsic_ssbo_atomic_umax:
1968    case nir_intrinsic_shared_atomic_umax:
1969       opcode = TGSI_OPCODE_ATOMUMAX;
1970       break;
1971    case nir_intrinsic_atomic_counter_and:
1972    case nir_intrinsic_ssbo_atomic_and:
1973    case nir_intrinsic_shared_atomic_and:
1974       opcode = TGSI_OPCODE_ATOMAND;
1975       break;
1976    case nir_intrinsic_atomic_counter_or:
1977    case nir_intrinsic_ssbo_atomic_or:
1978    case nir_intrinsic_shared_atomic_or:
1979       opcode = TGSI_OPCODE_ATOMOR;
1980       break;
1981    case nir_intrinsic_atomic_counter_xor:
1982    case nir_intrinsic_ssbo_atomic_xor:
1983    case nir_intrinsic_shared_atomic_xor:
1984       opcode = TGSI_OPCODE_ATOMXOR;
1985       break;
1986    case nir_intrinsic_atomic_counter_exchange:
1987    case nir_intrinsic_ssbo_atomic_exchange:
1988    case nir_intrinsic_shared_atomic_exchange:
1989       opcode = TGSI_OPCODE_ATOMXCHG;
1990       break;
1991    case nir_intrinsic_atomic_counter_comp_swap:
1992    case nir_intrinsic_ssbo_atomic_comp_swap:
1993    case nir_intrinsic_shared_atomic_comp_swap:
1994       opcode = TGSI_OPCODE_ATOMCAS;
1995       src[num_src++] = ntt_get_src(c, instr->src[next_src++]);
1996       break;
1997    case nir_intrinsic_atomic_counter_read:
1998    case nir_intrinsic_load_ssbo:
1999    case nir_intrinsic_load_shared:
2000       opcode = TGSI_OPCODE_LOAD;
2001       break;
2002    case nir_intrinsic_store_ssbo:
2003    case nir_intrinsic_store_shared:
2004       opcode = TGSI_OPCODE_STORE;
2005       break;
2006    case nir_intrinsic_get_ssbo_size:
2007       opcode = TGSI_OPCODE_RESQ;
2008       break;
2009    default:
2010       unreachable("unknown memory op");
2011    }
2012 
2013    unsigned qualifier = 0;
2014    if (mode == nir_var_mem_ssbo &&
2015        instr->intrinsic != nir_intrinsic_get_ssbo_size) {
2016       qualifier = ntt_get_access_qualifier(instr);
2017    }
2018 
2019    struct ureg_dst dst;
2020    if (is_store) {
2021       dst = ureg_dst(memory);
2022 
2023       unsigned write_mask = nir_intrinsic_write_mask(instr);
2024       if (nir_src_bit_size(instr->src[0]) == 64)
2025          write_mask = ntt_64bit_write_mask(write_mask);
2026       dst = ureg_writemask(dst, write_mask);
2027    } else {
2028       dst = ntt_get_dest(c, &instr->dest);
2029    }
2030 
2031    struct ntt_insn *insn = ntt_insn(c, opcode, dst, src[0], src[1], src[2], src[3]);
2032    insn->tex_target = TGSI_TEXTURE_BUFFER;
2033    insn->mem_qualifier = qualifier;
2034    insn->mem_format = 0; /* unused */
2035    insn->is_mem = true;
2036 }
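
/* Operand-layout sketch for the instruction built above:
 *
 *    LOAD  dst, resource, offset
 *    STORE resource.(writemask), offset, value
 *    ATOM* dst, resource, offset, value[, compare]
 *
 * matching the num_src accounting in ntt_emit_mem(); RESQ takes only the
 * resource.
 */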
2037 
2038 static void
2039 ntt_emit_image_load_store(struct ntt_compile *c, nir_intrinsic_instr *instr)
2040 {
2041    unsigned op;
2042    struct ureg_src srcs[4];
2043    int num_src = 0;
2044    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
2045    bool is_array = nir_intrinsic_image_array(instr);
2046 
2047    struct ureg_dst temp = ureg_dst_undef();
2048 
2049    enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(dim, is_array, false);
2050 
2051    struct ureg_src resource =
2052       ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_IMAGE, 0),
2053                             instr->src[0], 2);
2054 
2055    struct ureg_dst dst;
2056    if (instr->intrinsic == nir_intrinsic_image_store) {
2057       dst = ureg_dst(resource);
2058    } else {
2059       srcs[num_src++] = resource;
2060       dst = ntt_get_dest(c, &instr->dest);
2061    }
2062    struct ureg_dst opcode_dst = dst;
2063 
2064    if (instr->intrinsic != nir_intrinsic_image_size && instr->intrinsic != nir_intrinsic_image_samples) {
2065       struct ureg_src coord = ntt_get_src(c, instr->src[1]);
2066 
2067       if (dim == GLSL_SAMPLER_DIM_MS) {
2068          temp = ntt_temp(c);
2069          ntt_MOV(c, temp, coord);
2070          ntt_MOV(c, ureg_writemask(temp, TGSI_WRITEMASK_W),
2071                   ureg_scalar(ntt_get_src(c, instr->src[2]), TGSI_SWIZZLE_X));
2072          coord = ureg_src(temp);
2073       }
2074       srcs[num_src++] = coord;
2075 
2076       if (instr->intrinsic != nir_intrinsic_image_load) {
2077          srcs[num_src++] = ntt_get_src(c, instr->src[3]); /* data */
2078          if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap)
2079             srcs[num_src++] = ntt_get_src(c, instr->src[4]); /* data2 */
2080       }
2081    }
2082 
2083    switch (instr->intrinsic) {
2084    case nir_intrinsic_image_load:
2085       op = TGSI_OPCODE_LOAD;
2086       break;
2087    case nir_intrinsic_image_store:
2088       op = TGSI_OPCODE_STORE;
2089       break;
2090    case nir_intrinsic_image_size:
2091       op = TGSI_OPCODE_RESQ;
2092       break;
2093    case nir_intrinsic_image_samples:
2094       op = TGSI_OPCODE_RESQ;
2095       opcode_dst = ureg_writemask(ntt_temp(c), TGSI_WRITEMASK_W);
2096       break;
2097    case nir_intrinsic_image_atomic_add:
2098       op = TGSI_OPCODE_ATOMUADD;
2099       break;
2100    case nir_intrinsic_image_atomic_fadd:
2101       op = TGSI_OPCODE_ATOMFADD;
2102       break;
2103    case nir_intrinsic_image_atomic_imin:
2104       op = TGSI_OPCODE_ATOMIMIN;
2105       break;
2106    case nir_intrinsic_image_atomic_umin:
2107       op = TGSI_OPCODE_ATOMUMIN;
2108       break;
2109    case nir_intrinsic_image_atomic_imax:
2110       op = TGSI_OPCODE_ATOMIMAX;
2111       break;
2112    case nir_intrinsic_image_atomic_umax:
2113       op = TGSI_OPCODE_ATOMUMAX;
2114       break;
2115    case nir_intrinsic_image_atomic_and:
2116       op = TGSI_OPCODE_ATOMAND;
2117       break;
2118    case nir_intrinsic_image_atomic_or:
2119       op = TGSI_OPCODE_ATOMOR;
2120       break;
2121    case nir_intrinsic_image_atomic_xor:
2122       op = TGSI_OPCODE_ATOMXOR;
2123       break;
2124    case nir_intrinsic_image_atomic_exchange:
2125       op = TGSI_OPCODE_ATOMXCHG;
2126       break;
2127    case nir_intrinsic_image_atomic_comp_swap:
2128       op = TGSI_OPCODE_ATOMCAS;
2129       break;
2130    default:
2131       unreachable("bad op");
2132    }
2133 
2134    struct ntt_insn *insn = ntt_insn(c, op, opcode_dst, srcs[0], srcs[1], srcs[2], srcs[3]);
2135    insn->tex_target = target;
2136    insn->mem_qualifier = ntt_get_access_qualifier(instr);
2137    insn->mem_format = nir_intrinsic_format(instr);
2138    insn->is_mem = true;
2139 
2140    if (instr->intrinsic == nir_intrinsic_image_samples)
2141       ntt_MOV(c, dst, ureg_scalar(ureg_src(opcode_dst), 3));
2142 }
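
/* For nir_intrinsic_image_samples the code above builds, roughly:
 *
 *    RESQ TEMP[t].w, IMAGE[n]      (sample count lands in .w)
 *    MOV  dst, TEMP[t].wwww       (broadcast into the NIR dest)
 *
 * via the writemasked temporary chosen for opcode_dst.
 */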
2143 
2144 static void
2145 ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr)
2146 {
2147    uint32_t frac = nir_intrinsic_component(instr);
2148    uint32_t num_components = instr->num_components;
2149    unsigned base = nir_intrinsic_base(instr);
2150    struct ureg_src input;
2151    nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
2152    bool is_64 = nir_dest_bit_size(instr->dest) == 64;
2153 
2154    if (c->s->info.stage == MESA_SHADER_VERTEX) {
2155       input = ureg_DECL_vs_input(c->ureg, base);
2156       for (int i = 1; i < semantics.num_slots; i++)
2157          ureg_DECL_vs_input(c->ureg, base + i);
2158    } else if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
2159       unsigned semantic_name, semantic_index;
2160       ntt_get_gl_varying_semantic(c, semantics.location,
2161                                   &semantic_name, &semantic_index);
2162 
2163       /* XXX: ArrayID is used in r600 gs inputs */
2164       uint32_t array_id = 0;
2165 
2166       input = ureg_DECL_input_layout(c->ureg,
2167                                      semantic_name,
2168                                      semantic_index,
2169                                      base,
2170                                      ntt_tgsi_usage_mask(frac,
2171                                                          instr->num_components,
2172                                                          is_64),
2173                                      array_id,
2174                                      semantics.num_slots);
2175    } else {
2176       input = c->input_index_map[base];
2177    }
2178 
2179    if (is_64)
2180       num_components *= 2;
2181 
2182    input = ntt_shift_by_frac(input, frac, num_components);
2183 
2184    switch (instr->intrinsic) {
2185    case nir_intrinsic_load_input:
2186       input = ntt_ureg_src_indirect(c, input, instr->src[0], 0);
2187       ntt_store(c, &instr->dest, input);
2188       break;
2189 
2190    case nir_intrinsic_load_per_vertex_input:
2191       input = ntt_ureg_src_indirect(c, input, instr->src[1], 0);
2192       input = ntt_ureg_src_dimension_indirect(c, input, instr->src[0]);
2193       ntt_store(c, &instr->dest, input);
2194       break;
2195 
2196    case nir_intrinsic_load_interpolated_input: {
2197       input = ntt_ureg_src_indirect(c, input, instr->src[1], 0);
2198 
2199       nir_intrinsic_instr *bary_instr =
2200          nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
2201 
2202       switch (bary_instr->intrinsic) {
2203       case nir_intrinsic_load_barycentric_pixel:
2204       case nir_intrinsic_load_barycentric_sample:
2205          /* For these, we know that the barycentric load matches the
2206           * interpolation on the input declaration, so we can use it directly.
2207           */
2208          ntt_store(c, &instr->dest, input);
2209          break;
2210 
2211       case nir_intrinsic_load_barycentric_centroid:
2212          /* If the input was declared centroid, then there's no need to
2213           * emit the extra TGSI interp instruction, we can just read the
2214           * input.
2215           */
2216          if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) {
2217             ntt_store(c, &instr->dest, input);
2218          } else {
2219             ntt_INTERP_CENTROID(c, ntt_get_dest(c, &instr->dest), input);
2220          }
2221          break;
2222 
2223       case nir_intrinsic_load_barycentric_at_sample:
2224          /* We stored the sample in the fake "bary" dest. */
2225          ntt_INTERP_SAMPLE(c, ntt_get_dest(c, &instr->dest), input,
2226                             ntt_get_src(c, instr->src[0]));
2227          break;
2228 
2229       case nir_intrinsic_load_barycentric_at_offset:
2230          /* We stored the offset in the fake "bary" dest. */
2231          ntt_INTERP_OFFSET(c, ntt_get_dest(c, &instr->dest), input,
2232                             ntt_get_src(c, instr->src[0]));
2233          break;
2234 
2235       default:
2236          unreachable("bad barycentric interp intrinsic\n");
2237       }
2238       break;
2239    }
2240 
2241    default:
2242       unreachable("bad load input intrinsic\n");
2243    }
2244 }
2245 
2246 static void
2247 ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
2248 {
2249    struct ureg_src src = ntt_get_src(c, instr->src[0]);
2250 
2251    if (src.File == TGSI_FILE_OUTPUT) {
2252       /* If our src is the output file, that's an indication that we were able
2253        * to emit the output stores in the generating instructions and we have
2254        * nothing to do here.
2255        */
2256       return;
2257    }
2258 
2259    uint32_t frac;
2260    struct ureg_dst out = ntt_output_decl(c, instr, &frac);
2261 
2262    if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
2263       out = ntt_ureg_dst_indirect(c, out, instr->src[2]);
2264       out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[1]);
2265    } else {
2266       out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
2267    }
2268 
2269    uint8_t swizzle[4] = { 0, 0, 0, 0 };
2270    for (int i = frac; i < 4; i++) {
2271       if (out.WriteMask & (1 << i))
2272          swizzle[i] = i - frac;
2273    }
2274 
2275    src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2276 
2277    ntt_MOV(c, out, src);
2278 }
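
/* Worked example for the swizzle above: with frac = 2 and an output writemask
 * of .zw, the loop yields swizzle = {0, 0, 0, 1}, so the MOV reads src.xxxy
 * and the two live source components land in out.z and out.w.
 */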
2279 
2280 static void
2281 ntt_emit_load_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
2282 {
2283    nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
2284 
2285    /* ntt_try_store_in_tgsi_output() optimization is not valid if normal
2286     * load_output is present.
2287     */
2288    assert(c->s->info.stage != MESA_SHADER_VERTEX &&
2289           (c->s->info.stage != MESA_SHADER_FRAGMENT || semantics.fb_fetch_output));
2290 
2291    uint32_t frac;
2292    struct ureg_dst out = ntt_output_decl(c, instr, &frac);
2293 
2294    if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) {
2295       out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
2296       out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[0]);
2297    } else {
2298       out = ntt_ureg_dst_indirect(c, out, instr->src[0]);
2299    }
2300 
2301    struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
2302    struct ureg_src out_src = ureg_src(out);
2303 
2304    /* Don't swizzle unavailable channels of the output into the writemasked-out
2305     * components. Avoids compile failures in virglrenderer with
2306     * TESS_LEVEL_INNER.
2307     */
2308    int fill_channel = ffs(dst.WriteMask) - 1;
2309    uint8_t swizzles[4] = { 0, 1, 2, 3 };
2310    for (int i = 0; i < 4; i++)
2311       if (!(dst.WriteMask & (1 << i)))
2312          swizzles[i] = fill_channel;
2313    out_src = ureg_swizzle(out_src, swizzles[0], swizzles[1], swizzles[2], swizzles[3]);
2314 
2315    if (semantics.fb_fetch_output)
2316       ntt_FBFETCH(c, dst, out_src);
2317    else
2318       ntt_MOV(c, dst, out_src);
2319 }
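
/* Worked example: if dst.WriteMask is just .x (the TESS_LEVEL_INNER case),
 * fill_channel is 0 and swizzles become {0, 0, 0, 0}, so the masked-out .yzw
 * lanes read the defined .x channel instead of undeclared ones.
 */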
2320 
2321 static void
2322 ntt_emit_load_sysval(struct ntt_compile *c, nir_intrinsic_instr *instr)
2323 {
2324    gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
2325    enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
2326    struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0);
2327 
2328    /* virglrenderer doesn't like references to channels of the sysval that
2329     * aren't defined, even if they aren't really read.  (GLSL compile fails on
2330     * gl_NumWorkGroups.w, for example).
2331     */
2332    uint32_t write_mask = BITSET_MASK(nir_dest_num_components(instr->dest));
2333    sv = ntt_swizzle_for_write_mask(sv, write_mask);
2334 
2335    /* TGSI and NIR define these intrinsics as always loading ints, but they can
2336     * still appear on hardware with non-native-integers fragment shaders using
2337     * the draw path (i915g).  In that case, having called nir_lower_int_to_float
2338     * means that we actually want floats instead.
2339     */
2340    if (!c->native_integers) {
2341       switch (instr->intrinsic) {
2342       case nir_intrinsic_load_vertex_id:
2343       case nir_intrinsic_load_instance_id:
2344          ntt_U2F(c, ntt_get_dest(c, &instr->dest), sv);
2345          return;
2346 
2347       default:
2348          break;
2349       }
2350    }
2351 
2352    ntt_store(c, &instr->dest, sv);
2353 }
2354 
2355 static void
2356 ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
2357 {
2358    switch (instr->intrinsic) {
2359    case nir_intrinsic_load_ubo:
2360    case nir_intrinsic_load_ubo_vec4:
2361       ntt_emit_load_ubo(c, instr);
2362       break;
2363 
2364       /* Vertex */
2365    case nir_intrinsic_load_vertex_id:
2366    case nir_intrinsic_load_vertex_id_zero_base:
2367    case nir_intrinsic_load_base_vertex:
2368    case nir_intrinsic_load_base_instance:
2369    case nir_intrinsic_load_instance_id:
2370    case nir_intrinsic_load_draw_id:
2371    case nir_intrinsic_load_invocation_id:
2372    case nir_intrinsic_load_frag_coord:
2373    case nir_intrinsic_load_point_coord:
2374    case nir_intrinsic_load_front_face:
2375    case nir_intrinsic_load_sample_id:
2376    case nir_intrinsic_load_sample_pos:
2377    case nir_intrinsic_load_sample_mask_in:
2378    case nir_intrinsic_load_helper_invocation:
2379    case nir_intrinsic_load_tess_coord:
2380    case nir_intrinsic_load_patch_vertices_in:
2381    case nir_intrinsic_load_primitive_id:
2382    case nir_intrinsic_load_tess_level_outer:
2383    case nir_intrinsic_load_tess_level_inner:
2384    case nir_intrinsic_load_local_invocation_id:
2385    case nir_intrinsic_load_workgroup_id:
2386    case nir_intrinsic_load_num_workgroups:
2387    case nir_intrinsic_load_workgroup_size:
2388    case nir_intrinsic_load_subgroup_size:
2389    case nir_intrinsic_load_subgroup_invocation:
2390    case nir_intrinsic_load_subgroup_eq_mask:
2391    case nir_intrinsic_load_subgroup_ge_mask:
2392    case nir_intrinsic_load_subgroup_gt_mask:
2393    case nir_intrinsic_load_subgroup_lt_mask:
2394       ntt_emit_load_sysval(c, instr);
2395       break;
2396 
2397    case nir_intrinsic_load_input:
2398    case nir_intrinsic_load_per_vertex_input:
2399    case nir_intrinsic_load_interpolated_input:
2400       ntt_emit_load_input(c, instr);
2401       break;
2402 
2403    case nir_intrinsic_store_output:
2404    case nir_intrinsic_store_per_vertex_output:
2405       ntt_emit_store_output(c, instr);
2406       break;
2407 
2408    case nir_intrinsic_load_output:
2409    case nir_intrinsic_load_per_vertex_output:
2410       ntt_emit_load_output(c, instr);
2411       break;
2412 
2413    case nir_intrinsic_discard:
2414       ntt_KILL(c);
2415       break;
2416 
2417    case nir_intrinsic_discard_if: {
2418       struct ureg_src cond = ureg_scalar(ntt_get_src(c, instr->src[0]), 0);
2419 
2420       if (c->native_integers) {
2421          struct ureg_dst temp = ureg_writemask(ntt_temp(c), 1);
2422          ntt_AND(c, temp, cond, ureg_imm1f(c->ureg, 1.0));
2423          ntt_KILL_IF(c, ureg_scalar(ureg_negate(ureg_src(temp)), 0));
2424       } else {
2425          /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
2426          ntt_KILL_IF(c, ureg_negate(cond));
2427       }
2428       break;
2429    }
2430 
2431    case nir_intrinsic_load_ssbo:
2432    case nir_intrinsic_store_ssbo:
2433    case nir_intrinsic_ssbo_atomic_add:
2434    case nir_intrinsic_ssbo_atomic_fadd:
2435    case nir_intrinsic_ssbo_atomic_imin:
2436    case nir_intrinsic_ssbo_atomic_imax:
2437    case nir_intrinsic_ssbo_atomic_umin:
2438    case nir_intrinsic_ssbo_atomic_umax:
2439    case nir_intrinsic_ssbo_atomic_and:
2440    case nir_intrinsic_ssbo_atomic_or:
2441    case nir_intrinsic_ssbo_atomic_xor:
2442    case nir_intrinsic_ssbo_atomic_exchange:
2443    case nir_intrinsic_ssbo_atomic_comp_swap:
2444    case nir_intrinsic_get_ssbo_size:
2445       ntt_emit_mem(c, instr, nir_var_mem_ssbo);
2446       break;
2447 
2448    case nir_intrinsic_load_shared:
2449    case nir_intrinsic_store_shared:
2450    case nir_intrinsic_shared_atomic_add:
2451    case nir_intrinsic_shared_atomic_fadd:
2452    case nir_intrinsic_shared_atomic_imin:
2453    case nir_intrinsic_shared_atomic_imax:
2454    case nir_intrinsic_shared_atomic_umin:
2455    case nir_intrinsic_shared_atomic_umax:
2456    case nir_intrinsic_shared_atomic_and:
2457    case nir_intrinsic_shared_atomic_or:
2458    case nir_intrinsic_shared_atomic_xor:
2459    case nir_intrinsic_shared_atomic_exchange:
2460    case nir_intrinsic_shared_atomic_comp_swap:
2461       ntt_emit_mem(c, instr, nir_var_mem_shared);
2462       break;
2463 
2464    case nir_intrinsic_atomic_counter_read:
2465    case nir_intrinsic_atomic_counter_add:
2466    case nir_intrinsic_atomic_counter_inc:
2467    case nir_intrinsic_atomic_counter_post_dec:
2468    case nir_intrinsic_atomic_counter_min:
2469    case nir_intrinsic_atomic_counter_max:
2470    case nir_intrinsic_atomic_counter_and:
2471    case nir_intrinsic_atomic_counter_or:
2472    case nir_intrinsic_atomic_counter_xor:
2473    case nir_intrinsic_atomic_counter_exchange:
2474    case nir_intrinsic_atomic_counter_comp_swap:
2475       ntt_emit_mem(c, instr, nir_var_uniform);
2476       break;
2477    case nir_intrinsic_atomic_counter_pre_dec:
2478       unreachable("Should be lowered by ntt_lower_atomic_pre_dec()");
2479       break;
2480 
2481    case nir_intrinsic_image_load:
2482    case nir_intrinsic_image_store:
2483    case nir_intrinsic_image_size:
2484    case nir_intrinsic_image_samples:
2485    case nir_intrinsic_image_atomic_add:
2486    case nir_intrinsic_image_atomic_fadd:
2487    case nir_intrinsic_image_atomic_imin:
2488    case nir_intrinsic_image_atomic_umin:
2489    case nir_intrinsic_image_atomic_imax:
2490    case nir_intrinsic_image_atomic_umax:
2491    case nir_intrinsic_image_atomic_and:
2492    case nir_intrinsic_image_atomic_or:
2493    case nir_intrinsic_image_atomic_xor:
2494    case nir_intrinsic_image_atomic_exchange:
2495    case nir_intrinsic_image_atomic_comp_swap:
2496       ntt_emit_image_load_store(c, instr);
2497       break;
2498 
2499    case nir_intrinsic_control_barrier:
2500    case nir_intrinsic_memory_barrier_tcs_patch:
2501       ntt_BARRIER(c);
2502       break;
2503 
2504    case nir_intrinsic_memory_barrier:
2505       ntt_MEMBAR(c, ureg_imm1u(c->ureg,
2506                                TGSI_MEMBAR_SHADER_BUFFER |
2507                                TGSI_MEMBAR_ATOMIC_BUFFER |
2508                                TGSI_MEMBAR_SHADER_IMAGE |
2509                                TGSI_MEMBAR_SHARED));
2510       break;
2511 
2512    case nir_intrinsic_memory_barrier_atomic_counter:
2513       ntt_MEMBAR(c, ureg_imm1u(c->ureg, TGSI_MEMBAR_ATOMIC_BUFFER));
2514       break;
2515 
2516    case nir_intrinsic_memory_barrier_buffer:
2517       ntt_MEMBAR(c, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_BUFFER));
2518       break;
2519 
2520    case nir_intrinsic_memory_barrier_image:
2521       ntt_MEMBAR(c, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_IMAGE));
2522       break;
2523 
2524    case nir_intrinsic_memory_barrier_shared:
2525       ntt_MEMBAR(c, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHARED));
2526       break;
2527 
2528    case nir_intrinsic_group_memory_barrier:
2529       ntt_MEMBAR(c, ureg_imm1u(c->ureg,
2530                                TGSI_MEMBAR_SHADER_BUFFER |
2531                                TGSI_MEMBAR_ATOMIC_BUFFER |
2532                                TGSI_MEMBAR_SHADER_IMAGE |
2533                                TGSI_MEMBAR_SHARED |
2534                                TGSI_MEMBAR_THREAD_GROUP));
2535       break;
2536 
2537    case nir_intrinsic_end_primitive:
2538       ntt_ENDPRIM(c, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
2539       break;
2540 
2541    case nir_intrinsic_emit_vertex:
2542       ntt_EMIT(c, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
2543       break;
2544 
2545       /* In TGSI we don't actually generate the barycentric coords, and emit
2546        * interp intrinsics later.  However, we do need to store the
2547        * load_barycentric_at_* argument so that we can use it at that point.
2548        */
2549    case nir_intrinsic_load_barycentric_pixel:
2550    case nir_intrinsic_load_barycentric_centroid:
2551    case nir_intrinsic_load_barycentric_sample:
2552       break;
2553    case nir_intrinsic_load_barycentric_at_sample:
2554    case nir_intrinsic_load_barycentric_at_offset:
2555       ntt_store(c, &instr->dest, ntt_get_src(c, instr->src[0]));
2556       break;
2557 
2558    case nir_intrinsic_shader_clock:
2559       ntt_CLOCK(c, ntt_get_dest(c, &instr->dest));
2560       break;
2561 
2562    default:
2563       fprintf(stderr, "Unknown intrinsic: ");
2564       nir_print_instr(&instr->instr, stderr);
2565       fprintf(stderr, "\n");
2566       break;
2567    }
2568 }
2569 
2570 struct ntt_tex_operand_state {
2571    struct ureg_src srcs[4];
2572    unsigned i;
2573 };
2574 
2575 static void
2576 ntt_push_tex_arg(struct ntt_compile *c,
2577                  nir_tex_instr *instr,
2578                  nir_tex_src_type tex_src_type,
2579                  struct ntt_tex_operand_state *s)
2580 {
2581    int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
2582    if (tex_src < 0)
2583       return;
2584 
2585    nir_src *src = &instr->src[tex_src].src;
2586 
2587    /* virglrenderer workaround that's hard to do in tgsi_translate: Make sure
2588     * that TG4's immediate offset arg is float-typed.
2589     */
2590    if (instr->op == nir_texop_tg4 && tex_src_type == nir_tex_src_backend2 &&
2591        nir_src_is_const(*src)) {
2592       nir_const_value *consts = nir_src_as_const_value(*src);
2593       s->srcs[s->i++] = ureg_imm4f(c->ureg,
2594                                    consts[0].f32,
2595                                    consts[1].f32,
2596                                    consts[2].f32,
2597                                    consts[3].f32);
2598       return;
2599    }
2600 
2601    s->srcs[s->i++] = ntt_get_src(c, *src);
2602 }
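
/* Note on the workaround above: it only fires for tg4's backend2 operand when
 * it is constant, pushing the values as a float immediate (ureg_imm4f) rather
 * than going through ntt_get_src(), since virglrenderer expects TG4's
 * immediate offset to be float-typed.
 */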
2603 
2604 static void
2605 ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
2606 {
2607    struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
2608    enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array, instr->is_shadow);
2609    unsigned tex_opcode;
2610 
2611    struct ureg_src sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
2612    int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
2613    if (sampler_src >= 0) {
2614       struct ureg_src reladdr = ntt_get_src(c, instr->src[sampler_src].src);
2615       sampler = ureg_src_indirect(sampler, ntt_reladdr(c, reladdr, 2));
2616    }
2617 
2618    switch (instr->op) {
2619    case nir_texop_tex:
2620       if (nir_tex_instr_src_size(instr, nir_tex_instr_src_index(instr, nir_tex_src_backend1)) >
2621          MAX2(instr->coord_components, 2) + instr->is_shadow)
2622          tex_opcode = TGSI_OPCODE_TXP;
2623       else
2624          tex_opcode = TGSI_OPCODE_TEX;
2625       break;
2626    case nir_texop_txf:
2627    case nir_texop_txf_ms:
2628       tex_opcode = TGSI_OPCODE_TXF;
2629 
2630       if (c->has_txf_lz) {
2631          int lod_src = nir_tex_instr_src_index(instr, nir_tex_src_lod);
2632          if (lod_src >= 0 &&
2633              nir_src_is_const(instr->src[lod_src].src) &&
2634              ntt_src_as_uint(c, instr->src[lod_src].src) == 0) {
2635             tex_opcode = TGSI_OPCODE_TXF_LZ;
2636          }
2637       }
2638       break;
2639    case nir_texop_txl:
2640       tex_opcode = TGSI_OPCODE_TXL;
2641       break;
2642    case nir_texop_txb:
2643       tex_opcode = TGSI_OPCODE_TXB;
2644       break;
2645    case nir_texop_txd:
2646       tex_opcode = TGSI_OPCODE_TXD;
2647       break;
2648    case nir_texop_txs:
2649       tex_opcode = TGSI_OPCODE_TXQ;
2650       break;
2651    case nir_texop_tg4:
2652       tex_opcode = TGSI_OPCODE_TG4;
2653       break;
2654    case nir_texop_query_levels:
2655       tex_opcode = TGSI_OPCODE_TXQ;
2656       break;
2657    case nir_texop_lod:
2658       tex_opcode = TGSI_OPCODE_LODQ;
2659       break;
2660    case nir_texop_texture_samples:
2661       tex_opcode = TGSI_OPCODE_TXQS;
2662       break;
2663    default:
2664       unreachable("unsupported tex op");
2665    }
2666 
2667    struct ntt_tex_operand_state s = { .i = 0 };
2668    ntt_push_tex_arg(c, instr, nir_tex_src_backend1, &s);
2669    ntt_push_tex_arg(c, instr, nir_tex_src_backend2, &s);
2670 
2671    /* non-coord arg for TXQ */
2672    if (tex_opcode == TGSI_OPCODE_TXQ) {
2673       ntt_push_tex_arg(c, instr, nir_tex_src_lod, &s);
2674       /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's
2675        * scalar
2676        */
2677       s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0);
2678    }
2679 
2680    if (s.i > 1) {
2681       if (tex_opcode == TGSI_OPCODE_TEX)
2682          tex_opcode = TGSI_OPCODE_TEX2;
2683       if (tex_opcode == TGSI_OPCODE_TXB)
2684          tex_opcode = TGSI_OPCODE_TXB2;
2685       if (tex_opcode == TGSI_OPCODE_TXL)
2686          tex_opcode = TGSI_OPCODE_TXL2;
2687    }
2688 
2689    if (instr->op == nir_texop_txd) {
2690       /* Derivs appear in their own src args */
2691       int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
2692       int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
2693       s.srcs[s.i++] = ntt_get_src(c, instr->src[ddx].src);
2694       s.srcs[s.i++] = ntt_get_src(c, instr->src[ddy].src);
2695    }
2696 
2697    if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
2698       if (c->screen->get_param(c->screen,
2699                                PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) {
2700          sampler = ureg_scalar(sampler, instr->component);
2701          s.srcs[s.i++] = ureg_src_undef();
2702       } else {
2703          s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
2704       }
2705    }
2706 
2707    s.srcs[s.i++] = sampler;
2708 
2709    enum tgsi_return_type tex_type;
2710    switch (instr->dest_type) {
2711    case nir_type_float32:
2712       tex_type = TGSI_RETURN_TYPE_FLOAT;
2713       break;
2714    case nir_type_int32:
2715       tex_type = TGSI_RETURN_TYPE_SINT;
2716       break;
2717    case nir_type_uint32:
2718       tex_type = TGSI_RETURN_TYPE_UINT;
2719       break;
2720    default:
2721       unreachable("unknown texture type");
2722    }
2723 
2724    struct tgsi_texture_offset tex_offset = {
2725       .File = TGSI_FILE_NULL
2726    };
2727    int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
2728    if (tex_offset_src >= 0) {
2729       struct ureg_src offset = ntt_get_src(c, instr->src[tex_offset_src].src);
2730 
2731       tex_offset.File = offset.File;
2732       tex_offset.Index = offset.Index;
2733       tex_offset.SwizzleX = offset.SwizzleX;
2734       tex_offset.SwizzleY = offset.SwizzleY;
2735       tex_offset.SwizzleZ = offset.SwizzleZ;
2736       tex_offset.Padding = 0;
2737    }
2738 
2739    struct ureg_dst tex_dst;
2740    if (instr->op == nir_texop_query_levels)
2741       tex_dst = ureg_writemask(ntt_temp(c), TGSI_WRITEMASK_W);
2742    else
2743       tex_dst = dst;
2744 
2745    while (s.i < 4)
2746       s.srcs[s.i++] = ureg_src_undef();
2747 
2748    struct ntt_insn *insn = ntt_insn(c, tex_opcode, tex_dst, s.srcs[0], s.srcs[1], s.srcs[2], s.srcs[3]);
2749    insn->tex_target = target;
2750    insn->tex_return_type = tex_type;
2751    insn->tex_offset = tex_offset;
2752    insn->is_tex = true;
2753 
2754    if (instr->op == nir_texop_query_levels)
2755       ntt_MOV(c, dst, ureg_scalar(ureg_src(tex_dst), 3));
2756 }
2757 
2758 static void
2759 ntt_emit_jump(struct ntt_compile *c, nir_jump_instr *jump)
2760 {
2761    switch (jump->type) {
2762    case nir_jump_break:
2763       ntt_BRK(c);
2764       break;
2765 
2766    case nir_jump_continue:
2767       ntt_CONT(c);
2768       break;
2769 
2770    default:
2771       fprintf(stderr, "Unknown jump instruction: ");
2772       nir_print_instr(&jump->instr, stderr);
2773       fprintf(stderr, "\n");
2774       abort();
2775    }
2776 }
2777 
2778 static void
2779 ntt_emit_ssa_undef(struct ntt_compile *c, nir_ssa_undef_instr *instr)
2780 {
2781    /* Nothing to do but make sure that we have some storage to deref. */
2782    (void)ntt_get_ssa_def_decl(c, &instr->def);
2783 }
2784 
2785 static void
2786 ntt_emit_instr(struct ntt_compile *c, nir_instr *instr)
2787 {
2788    switch (instr->type) {
2789    case nir_instr_type_deref:
2790       /* ignored, will be walked by nir_intrinsic_image_*_deref. */
2791       break;
2792 
2793    case nir_instr_type_alu:
2794       ntt_emit_alu(c, nir_instr_as_alu(instr));
2795       break;
2796 
2797    case nir_instr_type_intrinsic:
2798       ntt_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
2799       break;
2800 
2801    case nir_instr_type_load_const:
2802       /* Nothing to do here, as load consts are done directly from
2803        * ntt_get_src() (since many constant NIR srcs will often get folded
2804        * directly into a register file index instead of as a TGSI src).
2805        */
2806       break;
2807 
2808    case nir_instr_type_tex:
2809       ntt_emit_texture(c, nir_instr_as_tex(instr));
2810       break;
2811 
2812    case nir_instr_type_jump:
2813       ntt_emit_jump(c, nir_instr_as_jump(instr));
2814       break;
2815 
2816    case nir_instr_type_ssa_undef:
2817       ntt_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
2818       break;
2819 
2820    default:
2821       fprintf(stderr, "Unknown NIR instr type: ");
2822       nir_print_instr(instr, stderr);
2823       fprintf(stderr, "\n");
2824       abort();
2825    }
2826 }
2827 
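/* TGSI's IF takes a float condition (true if != 0.0f) while UIF takes an
 * unsigned integer one, so choose based on native integer support.
 */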
2828 static void
2829 ntt_emit_if(struct ntt_compile *c, nir_if *if_stmt)
2830 {
2831    if (c->native_integers)
2832       ntt_UIF(c, c->if_cond);
2833    else
2834       ntt_IF(c, c->if_cond);
2835 
2836    ntt_emit_cf_list(c, &if_stmt->then_list);
2837 
2838    if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
2839       ntt_ELSE(c);
2840       ntt_emit_cf_list(c, &if_stmt->else_list);
2841    }
2842 
2843    ntt_ENDIF(c);
2844 }
2845 
2846 static void
2847 ntt_emit_loop(struct ntt_compile *c, nir_loop *loop)
2848 {
2849    ntt_BGNLOOP(c);
2850    ntt_emit_cf_list(c, &loop->body);
2851    ntt_ENDLOOP(c);
2852 }
2853 
2854 static void
2855 ntt_emit_block(struct ntt_compile *c, nir_block *block)
2856 {
2857    struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
2858    c->cur_block = ntt_block;
2859 
2860    nir_foreach_instr(instr, block) {
2861       ntt_emit_instr(c, instr);
2862 
2863       /* Sanity check that we didn't accidentally ureg_OPCODE() instead of ntt_OPCODE(). */
2864       if (ureg_get_instruction_number(c->ureg) != 0) {
2865          fprintf(stderr, "Emitted ureg insn during: ");
2866          nir_print_instr(instr, stderr);
2867          fprintf(stderr, "\n");
2868          unreachable("emitted ureg insn");
2869       }
2870    }
2871 
2872    /* Set up the if condition for ntt_emit_if(), which we have to do before
2873     * freeing up the temps (the "if" is treated as inside the block for liveness
2874     * purposes, despite not being an instruction)
2875     *
2876     * Note that, while IF and UIF are supposed to look at only .x, virglrenderer
2877     * looks at all of .xyzw.  No harm in working around the bug.
2878     */
2879    nir_if *nif = nir_block_get_following_if(block);
2880    if (nif)
2881       c->if_cond = ureg_scalar(ntt_get_src(c, nif->condition), TGSI_SWIZZLE_X);
2882 }
2883 
2884 static void
2885 ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list)
2886 {
2887    foreach_list_typed(nir_cf_node, node, node, list) {
2888       switch (node->type) {
2889       case nir_cf_node_block:
2890          ntt_emit_block(c, nir_cf_node_as_block(node));
2891          break;
2892 
2893       case nir_cf_node_if:
2894          ntt_emit_if(c, nir_cf_node_as_if(node));
2895          break;
2896 
2897       case nir_cf_node_loop:
2898          ntt_emit_loop(c, nir_cf_node_as_loop(node));
2899          break;
2900 
2901       default:
2902          unreachable("unknown CF type");
2903       }
2904    }
2905 }
2906 
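/* Second pass: the ntt_emit_cf_list() walk above only accumulated ntt_insns
 * per block so that liveness analysis and register allocation could run over
 * the whole impl first.  These functions replay the insns into ureg, fixing
 * up IF/ELSE jump labels as real instruction numbers become known.
 */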
2907 static void
2908 ntt_emit_block_ureg(struct ntt_compile *c, struct nir_block *block)
2909 {
2910    struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
2911 
2912    /* Emit the ntt insns to tgsi_ureg. */
2913    util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
2914       const struct tgsi_opcode_info *opcode_info =
2915          tgsi_get_opcode_info(insn->opcode);
2916 
2917       switch (insn->opcode) {
2918       case TGSI_OPCODE_UIF:
2919          ureg_UIF(c->ureg, insn->src[0], &c->cf_label);
2920          break;
2921 
2922       case TGSI_OPCODE_IF:
2923          ureg_IF(c->ureg, insn->src[0], &c->cf_label);
2924          break;
2925 
2926       case TGSI_OPCODE_ELSE:
2927          ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
2928          ureg_ELSE(c->ureg, &c->cf_label);
2929          c->current_if_else = c->cf_label;
2930          break;
2931 
2932       case TGSI_OPCODE_ENDIF:
2933          ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
2934          ureg_ENDIF(c->ureg);
2935          break;
2936 
2937       case TGSI_OPCODE_BGNLOOP:
2938          /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
2939           * does reference BGNLOOP's.  Follow the former behavior unless something comes up
2940           * with a need.
2941           */
2942          ureg_BGNLOOP(c->ureg, &c->cf_label);
2943          break;
2944 
2945       case TGSI_OPCODE_ENDLOOP:
2946          ureg_ENDLOOP(c->ureg, &c->cf_label);
2947          break;
2948 
2949       default:
2950          if (insn->is_tex) {
2951             ureg_tex_insn(c->ureg, insn->opcode,
2952                           insn->dst, opcode_info->num_dst,
2953                           insn->tex_target, insn->tex_return_type,
2954                           &insn->tex_offset,
2955                           insn->tex_offset.File != TGSI_FILE_NULL ? 1 : 0,
2956                           insn->src, opcode_info->num_src);
2957          } else if (insn->is_mem) {
2958             ureg_memory_insn(c->ureg, insn->opcode,
2959                              insn->dst, opcode_info->num_dst,
2960                              insn->src, opcode_info->num_src,
2961                              insn->mem_qualifier,
2962                              insn->tex_target,
2963                              insn->mem_format);
2964          } else {
2965             ureg_insn(c->ureg, insn->opcode,
2966                      insn->dst, opcode_info->num_dst,
2967                      insn->src, opcode_info->num_src,
2968                      insn->precise);
2969          }
2970       }
2971    }
2972 }
2973 
2974 static void
2975 ntt_emit_if_ureg(struct ntt_compile *c, nir_if *if_stmt)
2976 {
2977    /* Note: the last block emitted our IF opcode. */
2978 
2979    int if_stack = c->current_if_else;
2980    c->current_if_else = c->cf_label;
2981 
2982    /* Either the then or else block includes the ENDIF, which will fix up the
2983     * IF(/ELSE)'s label for jumping
2984     */
2985    ntt_emit_cf_list_ureg(c, &if_stmt->then_list);
2986    ntt_emit_cf_list_ureg(c, &if_stmt->else_list);
2987 
2988    c->current_if_else = if_stack;
2989 }
2990 
2991 static void
2992 ntt_emit_cf_list_ureg(struct ntt_compile *c, struct exec_list *list)
2993 {
2994    foreach_list_typed(nir_cf_node, node, node, list) {
2995       switch (node->type) {
2996       case nir_cf_node_block:
2997          ntt_emit_block_ureg(c, nir_cf_node_as_block(node));
2998          break;
2999 
3000       case nir_cf_node_if:
3001          ntt_emit_if_ureg(c, nir_cf_node_as_if(node));
3002          break;
3003 
3004       case nir_cf_node_loop:
3005          /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
3006           * does reference BGNLOOP's.  Follow the former behavior unless something comes up
3007           * with a need.
3008           */
3009          ntt_emit_cf_list_ureg(c, &nir_cf_node_as_loop(node)->body);
3010          break;
3011 
3012       default:
3013          unreachable("unknown CF type");
3014       }
3015    }
3016 }
3017 
3018 static void
3019 ntt_emit_impl(struct ntt_compile *c, nir_function_impl *impl)
3020 {
3021    c->impl = impl;
3022 
3023    c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc);
3024    c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->reg_alloc);
3025 
3026    /* Set up the struct ntt_blocks to put insns in */
3027    c->blocks = _mesa_pointer_hash_table_create(c);
3028    nir_foreach_block(block, impl) {
3029       struct ntt_block *ntt_block = rzalloc(c->blocks, struct ntt_block);
3030       util_dynarray_init(&ntt_block->insns, ntt_block);
3031       _mesa_hash_table_insert(c->blocks, block, ntt_block);
3032    }
3033 
3034 
3035    ntt_setup_registers(c, &impl->registers);
3036 
3037    c->cur_block = ntt_block_from_nir(c, nir_start_block(impl));
3038    ntt_setup_inputs(c);
3039    ntt_setup_outputs(c);
3040    ntt_setup_uniforms(c);
3041 
3042    /* Emit the ntt insns */
3043    ntt_emit_cf_list(c, &impl->body);
3044 
3045    /* Don't do optimized RA if the driver requests it, unless the number of
3046     * temps is too large to be covered by the 16 bit signed int that TGSI
3047     * allocates for the register index */
3048    if (!c->options->unoptimized_ra || c->num_temps > 0x7fff)
3049       ntt_allocate_regs(c, impl);
3050    else
3051       ntt_allocate_regs_unoptimized(c, impl);
3052 
3053    /* Turn the ntt insns into actual TGSI tokens */
3054    ntt_emit_cf_list_ureg(c, &impl->body);
3055 
3056    ralloc_free(c->liveness);
3057    c->liveness = NULL;
3058 
3059 }
3060 
3061 static int
3062 type_size(const struct glsl_type *type, bool bindless)
3063 {
3064    return glsl_count_attribute_slots(type, false);
3065 }
3066 
3067 /* Allow vectorizing of ALU instructions, but avoid vectorizing past what we
3068  * can handle for 64-bit values in TGSI.
3069  */
3070 static uint8_t
3071 ntt_should_vectorize_instr(const nir_instr *instr, const void *data)
3072 {
3073    if (instr->type != nir_instr_type_alu)
3074       return 0;
3075 
3076    nir_alu_instr *alu = nir_instr_as_alu(instr);
3077 
3078    switch (alu->op) {
3079    case nir_op_ibitfield_extract:
3080    case nir_op_ubitfield_extract:
3081    case nir_op_bitfield_insert:
3082       /* virglrenderer only looks at the .x channel of the offset/bits operands
3083        * when translating to GLSL.  tgsi.rst doesn't seem to require scalar
3084        * offset/bits operands.
3085        *
3086        * https://gitlab.freedesktop.org/virgl/virglrenderer/-/issues/195
3087        */
3088       return 1;
3089 
3090    default:
3091       break;
3092    }
3093 
3094    int src_bit_size = nir_src_bit_size(alu->src[0].src);
3095    int dst_bit_size = nir_dest_bit_size(alu->dest.dest);
3096 
3097    if (src_bit_size == 64 || dst_bit_size == 64) {
3098       /* Avoid vectorizing 64-bit instructions at all.  Despite tgsi.rst
3099        * claiming support, virglrenderer generates bad shaders on the host when
3100        * presented with them.  Maybe we can make virgl avoid tickling the
3101        * virglrenderer bugs, but given that glsl-to-TGSI didn't generate vector
3102        * 64-bit instrs in the first place, I don't see much reason to care about
3103        * this.
3104        */
3105       return 1;
3106    }
3107 
3108    return 4;
3109 }
3110 
3111 static bool
3112 ntt_should_vectorize_io(unsigned align, unsigned bit_size,
3113                         unsigned num_components, unsigned high_offset,
3114                         nir_intrinsic_instr *low, nir_intrinsic_instr *high,
3115                         void *data)
3116 {
3117    if (bit_size != 32)
3118       return false;
3119 
3120    /* Our offset alignment should always be at least 4 bytes */
3121    if (align < 4)
3122       return false;
3123 
3124    /* No wrapping off the end of a TGSI reg.  We could do a bit better by
3125     * looking at low's actual offset.  XXX: With LOAD_CONSTBUF maybe we don't
3126     * need this restriction.
3127     */
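   /* E.g. align == 4 means the combined load could start at any component
    * (worst case .w), so only one component is safe; align == 8 could start
    * at .z, leaving room for two.
    */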
3128    unsigned worst_start_component = align == 4 ? 3 : align / 4;
3129    if (worst_start_component + num_components > 4)
3130       return false;
3131 
3132    return true;
3133 }
3134 
3135 static nir_variable_mode
3136 ntt_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
3137 {
3138    unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
3139    unsigned indirect_mask = 0;
3140 
3141    if (!screen->get_shader_param(screen, pipe_stage,
3142                                  PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) {
3143       indirect_mask |= nir_var_shader_in;
3144    }
3145 
3146    if (!screen->get_shader_param(screen, pipe_stage,
3147                                  PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) {
3148       indirect_mask |= nir_var_shader_out;
3149    }
3150 
3151    if (!screen->get_shader_param(screen, pipe_stage,
3152                                  PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
3153       indirect_mask |= nir_var_function_temp;
3154    }
3155 
3156    return indirect_mask;
3157 }
3158 
3159 static void
3160 ntt_optimize_nir(struct nir_shader *s, struct pipe_screen *screen)
3161 {
3162    bool progress;
3163    unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
3164    unsigned control_flow_depth =
3165       screen->get_shader_param(screen, pipe_stage,
3166                                PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
3167    do {
3168       progress = false;
3169 
3170       NIR_PASS_V(s, nir_lower_vars_to_ssa);
3171       NIR_PASS_V(s, nir_split_64bit_vec3_and_vec4);
3172 
3173       NIR_PASS(progress, s, nir_copy_prop);
3174       NIR_PASS(progress, s, nir_opt_algebraic);
3175       NIR_PASS(progress, s, nir_opt_constant_folding);
3176       NIR_PASS(progress, s, nir_opt_remove_phis);
3177       NIR_PASS(progress, s, nir_opt_conditional_discard);
3178       NIR_PASS(progress, s, nir_opt_dce);
3179       NIR_PASS(progress, s, nir_opt_dead_cf);
3180       NIR_PASS(progress, s, nir_opt_cse);
3181       NIR_PASS(progress, s, nir_opt_find_array_copies);
3182       NIR_PASS(progress, s, nir_opt_copy_prop_vars);
3183       NIR_PASS(progress, s, nir_opt_dead_write_vars);
3184 
3185       NIR_PASS(progress, s, nir_opt_if, nir_opt_if_aggressive_last_continue | nir_opt_if_optimize_phi_true_false);
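      /* A max control flow depth of 0 means the driver flattens all control
       * flow, so let peephole_select flatten arbitrarily large ifs;
       * otherwise keep the usual conservative limit.
       */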
3186       NIR_PASS(progress, s, nir_opt_peephole_select,
3187                control_flow_depth == 0 ? ~0 : 8, true, true);
3188       NIR_PASS(progress, s, nir_opt_algebraic);
3189       NIR_PASS(progress, s, nir_opt_constant_folding);
3190       nir_load_store_vectorize_options vectorize_opts = {
3191          .modes = nir_var_mem_ubo,
3192          .callback = ntt_should_vectorize_io,
3193          .robust_modes = 0,
3194       };
3195       NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
3196       NIR_PASS(progress, s, nir_opt_shrink_stores, true);
3197       NIR_PASS(progress, s, nir_opt_shrink_vectors);
3198       NIR_PASS(progress, s, nir_opt_trivial_continues);
3199       NIR_PASS(progress, s, nir_opt_vectorize, ntt_should_vectorize_instr, NULL);
3200       NIR_PASS(progress, s, nir_opt_undef);
3201       NIR_PASS(progress, s, nir_opt_loop_unroll);
3202 
3203       /* Try to fold addressing math into ubo_vec4's base to avoid load_consts
3204        * and ALU ops for it.
3205        */
3206       static const nir_opt_offsets_options offset_options = {
3207          .ubo_vec4_max = ~0,
3208 
3209          /* No const offset in TGSI for shared accesses. */
3210          .shared_max = 0,
3211 
3212          /* unused intrinsics */
3213          .uniform_max = 0,
3214          .buffer_max = 0,
3215       };
3216       NIR_PASS(progress, s, nir_opt_offsets, &offset_options);
3217    } while (progress);
3218 
3219    NIR_PASS_V(s, nir_lower_var_copies);
3220 }
3221 
3222 /* Scalarizes all 64-bit ALU ops.  Note that we only actually need to
3223  * scalarize vec3/vec4s, should probably fix that.
3224  */
3225 static bool
3226 scalarize_64bit(const nir_instr *instr, const void *data)
3227 {
3228    const nir_alu_instr *alu = nir_instr_as_alu(instr);
3229 
3230    return (nir_dest_bit_size(alu->dest.dest) == 64 ||
3231            nir_src_bit_size(alu->src[0].src) == 64);
3232 }
3233 
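/* Splits a 64-bit vec3/vec4 load/store intrinsic into a vec2 half plus a
 * one- or two-component half, since a TGSI register only holds a dvec2.
 * Roughly:
 *
 *    dvec4 = load_ubo(block, offset)
 * becomes
 *    dvec2 = load_ubo(block, offset)
 *    dvec2 = load_ubo(block, offset + 16)
 *
 * with the results re-vec'd (or a store's src and writemask split) below.
 */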
3234 static bool
3235 nir_to_tgsi_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
3236 {
3237    b->cursor = nir_after_instr(&instr->instr);
3238 
3239    switch (instr->intrinsic) {
3240    case nir_intrinsic_load_ubo:
3241    case nir_intrinsic_load_ubo_vec4:
3242    case nir_intrinsic_load_ssbo:
3243    case nir_intrinsic_load_input:
3244    case nir_intrinsic_load_interpolated_input:
3245    case nir_intrinsic_load_per_vertex_input:
3246    case nir_intrinsic_store_output:
3247    case nir_intrinsic_store_per_vertex_output:
3248    case nir_intrinsic_store_ssbo:
3249       break;
3250    default:
3251       return false;
3252    }
3253 
3254    if (instr->num_components <= 2)
3255       return false;
3256 
3257    bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
3258    if (has_dest) {
3259       if (nir_dest_bit_size(instr->dest) != 64)
3260          return false;
3261    } else {
3262       if (nir_src_bit_size(instr->src[0]) != 64)
3263          return false;
3264    }
3265 
3266    nir_intrinsic_instr *first =
3267       nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
3268    nir_intrinsic_instr *second =
3269       nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
3270 
3271    switch (instr->intrinsic) {
3272    case nir_intrinsic_load_ubo:
3273    case nir_intrinsic_load_ubo_vec4:
3274    case nir_intrinsic_load_ssbo:
3275    case nir_intrinsic_store_ssbo:
3276       break;
3277 
3278    default: {
3279       nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
3280       semantics.location++;
3281       semantics.num_slots--;
3282       nir_intrinsic_set_io_semantics(second, semantics);
3283 
3284       nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
3285       break;
3286    }
3287    }
3288 
3289    first->num_components = 2;
3290    second->num_components -= 2;
3291    if (has_dest) {
3292       first->dest.ssa.num_components = 2;
3293       second->dest.ssa.num_components -= 2;
3294    }
3295 
3296    nir_builder_instr_insert(b, &first->instr);
3297    nir_builder_instr_insert(b, &second->instr);
3298 
3299    if (has_dest) {
3300       /* Merge the two loads' results back into a vector. */
3301       nir_ssa_scalar channels[4] = {
3302          nir_get_ssa_scalar(&first->dest.ssa, 0),
3303          nir_get_ssa_scalar(&first->dest.ssa, 1),
3304          nir_get_ssa_scalar(&second->dest.ssa, 0),
3305          nir_get_ssa_scalar(&second->dest.ssa, second->num_components > 1 ? 1 : 0),
3306       };
3307       nir_ssa_def *new = nir_vec_scalars(b, channels, instr->num_components);
3308       nir_ssa_def_rewrite_uses(&instr->dest.ssa, new);
3309    } else {
3310       /* Split the src value across the two stores. */
3311       b->cursor = nir_before_instr(&instr->instr);
3312 
3313       nir_ssa_def *src0 = instr->src[0].ssa;
3314       nir_ssa_scalar channels[4] = { 0 };
3315       for (int i = 0; i < instr->num_components; i++)
3316          channels[i] = nir_get_ssa_scalar(src0, i);
3317 
3318       nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
3319       nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);
3320 
3321       nir_instr_rewrite_src(&first->instr, &first->src[0],
3322                             nir_src_for_ssa(nir_vec_scalars(b, channels, 2)));
3323       nir_instr_rewrite_src(&second->instr, &second->src[0],
3324                             nir_src_for_ssa(nir_vec_scalars(b, &channels[2],
3325                                                            second->num_components)));
3326    }
3327 
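   /* Rebase the second half's offset to land on the second vec2: 16 bytes
    * for byte-addressed intrinsics, one slot for vec4-addressed
    * load_ubo_vec4.  I/O intrinsics were already retargeted above through
    * their base/io_semantics.
    */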
3328    int offset_src = -1;
3329    uint32_t offset_amount = 16;
3330 
3331    switch (instr->intrinsic) {
3332    case nir_intrinsic_load_ssbo:
3333    case nir_intrinsic_load_ubo:
3334       offset_src = 1;
3335       break;
3336    case nir_intrinsic_load_ubo_vec4:
3337       offset_src = 1;
3338       offset_amount = 1;
3339       break;
3340    case nir_intrinsic_store_ssbo:
3341       offset_src = 2;
3342       break;
3343    default:
3344       break;
3345    }
3346    if (offset_src != -1) {
3347       b->cursor = nir_before_instr(&second->instr);
3348       nir_ssa_def *second_offset =
3349          nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
3350       nir_instr_rewrite_src(&second->instr, &second->src[offset_src],
3351                             nir_src_for_ssa(second_offset));
3352    }
3353 
3354    /* DCE stores we generated with no writemask (nothing else does this
3355     * currently).
3356     */
3357    if (!has_dest) {
3358       if (nir_intrinsic_write_mask(first) == 0)
3359          nir_instr_remove(&first->instr);
3360       if (nir_intrinsic_write_mask(second) == 0)
3361          nir_instr_remove(&second->instr);
3362    }
3363 
3364    nir_instr_remove(&instr->instr);
3365 
3366    return true;
3367 }
3368 
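/* Same dvec2 splitting for 64-bit vec3/vec4 load_const: emit two smaller
 * constants and re-vec their channels.
 */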
3369 static bool
3370 nir_to_tgsi_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
3371 {
3372    int num_components = instr->def.num_components;
3373 
3374    if (instr->def.bit_size != 64 || num_components <= 2)
3375       return false;
3376 
3377    b->cursor = nir_before_instr(&instr->instr);
3378 
3379    nir_load_const_instr *first =
3380       nir_load_const_instr_create(b->shader, 2, 64);
3381    nir_load_const_instr *second =
3382       nir_load_const_instr_create(b->shader, num_components - 2, 64);
3383 
3384    first->value[0] = instr->value[0];
3385    first->value[1] = instr->value[1];
3386    second->value[0] = instr->value[2];
3387    if (num_components == 4)
3388       second->value[1] = instr->value[3];
3389 
3390    nir_builder_instr_insert(b, &first->instr);
3391    nir_builder_instr_insert(b, &second->instr);
3392 
3393    nir_ssa_def *channels[4] = {
3394       nir_channel(b, &first->def, 0),
3395       nir_channel(b, &first->def, 1),
3396       nir_channel(b, &second->def, 0),
3397       num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
3398    };
3399    nir_ssa_def *new = nir_vec(b, channels, num_components);
3400    nir_ssa_def_rewrite_uses(&instr->def, new);
3401    nir_instr_remove(&instr->instr);
3402 
3403    return true;
3404 }
3405 
3406 static bool
3407 nir_to_tgsi_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr,
3408                                       void *data)
3409 {
3410    switch (instr->type) {
3411    case nir_instr_type_load_const:
3412       return nir_to_tgsi_lower_64bit_load_const(b, nir_instr_as_load_const(instr));
3413 
3414    case nir_instr_type_intrinsic:
3415       return nir_to_tgsi_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
3416    default:
3417       return false;
3418    }
3419 }
3420 
3421 static bool
3422 nir_to_tgsi_lower_64bit_to_vec2(nir_shader *s)
3423 {
3424    return nir_shader_instructions_pass(s,
3425                                        nir_to_tgsi_lower_64bit_to_vec2_instr,
3426                                        nir_metadata_block_index |
3427                                        nir_metadata_dominance,
3428                                        NULL);
3429 }
3430 
3431 struct ntt_lower_tex_state {
3432    nir_ssa_scalar channels[8];
3433    unsigned i;
3434 };
3435 
3436 static void
3437 nir_to_tgsi_lower_tex_instr_arg(nir_builder *b,
3438                                 nir_tex_instr *instr,
3439                                 nir_tex_src_type tex_src_type,
3440                                 struct ntt_lower_tex_state *s)
3441 {
3442    int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
3443    if (tex_src < 0)
3444       return;
3445 
3446    assert(instr->src[tex_src].src.is_ssa);
3447 
3448    nir_ssa_def *def = instr->src[tex_src].src.ssa;
3449    for (int i = 0; i < def->num_components; i++) {
3450       s->channels[s->i++] = nir_get_ssa_scalar(def, i);
3451    }
3452 
3453    nir_tex_instr_remove_src(instr, tex_src);
3454 }
3455 
3456 /**
3457  * Merges together a vec4 of tex coordinate/compare/bias/lod into a backend tex
3458  * src.  This lets NIR handle the coalescing of the vec4 rather than trying to
3459  * manage it on our own, and may lead to more vectorization.
3460  */
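/* For example, a 2D shadow sample with a bias ends up packed by the calls
 * below as backend1 = vec4(coord.x, coord.y, comparator, bias).
 */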
3461 static bool
3462 nir_to_tgsi_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data)
3463 {
3464    if (instr->type != nir_instr_type_tex)
3465       return false;
3466 
3467    nir_tex_instr *tex = nir_instr_as_tex(instr);
3468 
3469    if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0)
3470       return false;
3471 
3472    b->cursor = nir_before_instr(instr);
3473 
3474    struct ntt_lower_tex_state s = {0};
3475 
3476    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_coord, &s);
3477    /* We always have at least two slots for the coordinate, even on 1D. */
3478    s.i = MAX2(s.i, 2);
3479 
3480    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_comparator, &s);
3481    s.i = MAX2(s.i, 3);
3482 
3483    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_bias, &s);
3484 
3485    /* XXX: LZ */
3486    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s);
3487    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s);
3488    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s);
3489 
3490    /* No need to pack undefs in unused channels of the tex instr */
3491    while (!s.channels[s.i - 1].def)
3492       s.i--;
3493 
3494    /* Instead of putting undefs in the unused slots of the vecs, just put in
3495     * another used channel.  Otherwise, we'll get unnecessary moves into
3496     * registers.
3497     */
3498    assert(s.channels[0].def != NULL);
3499    for (int i = 1; i < s.i; i++) {
3500       if (!s.channels[i].def)
3501          s.channels[i] = s.channels[0];
3502    }
3503 
3504    nir_tex_instr_add_src(tex, nir_tex_src_backend1, nir_src_for_ssa(nir_vec_scalars(b, s.channels, MIN2(s.i, 4))));
3505    if (s.i > 4)
3506       nir_tex_instr_add_src(tex, nir_tex_src_backend2, nir_src_for_ssa(nir_vec_scalars(b, &s.channels[4], s.i - 4)));
3507 
3508    return true;
3509 }
3510 
3511 static bool
3512 nir_to_tgsi_lower_tex(nir_shader *s)
3513 {
3514    return nir_shader_instructions_pass(s,
3515                                        nir_to_tgsi_lower_tex_instr,
3516                                        nir_metadata_block_index |
3517                                        nir_metadata_dominance,
3518                                        NULL);
3519 }
3520 
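/* NTT assumes a particular set of NIR lowering flags.  If the driver's
 * compiler options disagree (e.g. a driver that also consumes NIR directly),
 * install a corrected copy so the passes below lower what the translation
 * expects.
 */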
3521 static void
3522 ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s,
3523                     const struct nir_to_tgsi_options *ntt_options)
3524 {
3525    const struct nir_shader_compiler_options *options = s->options;
3526    bool lower_fsqrt =
3527       !screen->get_shader_param(screen, pipe_shader_type_from_mesa(s->info.stage),
3528                                 PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED);
3529 
3530    bool force_indirect_unrolling_sampler =
3531       screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL) < 400;
3532 
3533    nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
3534 
3535    if (!options->lower_extract_byte ||
3536        !options->lower_extract_word ||
3537        !options->lower_insert_byte ||
3538        !options->lower_insert_word ||
3539        !options->lower_fdph ||
3540        !options->lower_flrp64 ||
3541        !options->lower_fmod ||
3542        !options->lower_rotate ||
3543        !options->lower_uadd_sat ||
3544        !options->lower_usub_sat ||
3545        !options->lower_uniforms_to_ubo ||
3546        !options->lower_vector_cmp ||
3547        options->lower_fsqrt != lower_fsqrt ||
3548        options->force_indirect_unrolling != no_indirects_mask ||
3549        force_indirect_unrolling_sampler) {
3550       nir_shader_compiler_options *new_options = ralloc(s, nir_shader_compiler_options);
3551       *new_options = *s->options;
3552 
3553       new_options->lower_extract_byte = true;
3554       new_options->lower_extract_word = true;
3555       new_options->lower_insert_byte = true;
3556       new_options->lower_insert_word = true;
3557       new_options->lower_fdph = true;
3558       new_options->lower_flrp64 = true;
3559       new_options->lower_fmod = true;
3560       new_options->lower_rotate = true;
3561       new_options->lower_uadd_sat = true;
3562       new_options->lower_usub_sat = true;
3563       new_options->lower_uniforms_to_ubo = true;
3564       new_options->lower_vector_cmp = true;
3565       new_options->lower_fsqrt = lower_fsqrt;
3566       new_options->force_indirect_unrolling = no_indirects_mask;
3567       new_options->force_indirect_unrolling_sampler = force_indirect_unrolling_sampler;
3568 
3569       s->options = new_options;
3570    }
3571 }
3572 
3573 static bool
3574 ntt_lower_atomic_pre_dec_filter(const nir_instr *instr, const void *_data)
3575 {
3576    return (instr->type == nir_instr_type_intrinsic &&
3577            nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_atomic_counter_pre_dec);
3578 }
3579 
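/* nir_intrinsic_atomic_counter_pre_dec returns the freshly decremented
 * value, which is post_dec's result (the old value) minus one.  The TGSI
 * path only handles the post-decrement flavor, so rewrite the intrinsic and
 * patch up its result.
 */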
3580 static nir_ssa_def *
3581 ntt_lower_atomic_pre_dec_lower(nir_builder *b, nir_instr *instr, void *_data)
3582 {
3583    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3584 
3585    nir_ssa_def *old_result = &intr->dest.ssa;
3586    intr->intrinsic = nir_intrinsic_atomic_counter_post_dec;
3587 
3588    return nir_iadd_imm(b, old_result, -1);
3589 }
3590 
3591 static bool
3592 ntt_lower_atomic_pre_dec(nir_shader *s)
3593 {
3594    return nir_shader_lower_instructions(s,
3595                                         ntt_lower_atomic_pre_dec_filter,
3596                                         ntt_lower_atomic_pre_dec_lower, NULL);
3597 }
3598 
3599 /* Lowers texture projectors if we can't do them as TGSI_OPCODE_TXP. */
3600 static void
3601 nir_to_tgsi_lower_txp(nir_shader *s)
3602 {
3603    nir_lower_tex_options lower_tex_options = {
3604        .lower_txp = 0,
3605    };
3606 
3607    nir_foreach_block(block, nir_shader_get_entrypoint(s)) {
3608       nir_foreach_instr(instr, block) {
3609          if (instr->type != nir_instr_type_tex)
3610             continue;
3611          nir_tex_instr *tex = nir_instr_as_tex(instr);
3612 
3613          if (nir_tex_instr_src_index(tex, nir_tex_src_projector) < 0)
3614             continue;
3615 
3616          bool has_compare = nir_tex_instr_src_index(tex, nir_tex_src_comparator) >= 0;
3617          bool has_lod = nir_tex_instr_src_index(tex, nir_tex_src_lod) >= 0 || s->info.stage != MESA_SHADER_FRAGMENT;
3618          bool has_offset = nir_tex_instr_src_index(tex, nir_tex_src_offset) >= 0;
3619 
3620          /* We can do TXP for any tex (not txg) where we can fit all the
3621           * coordinates and comparator and projector in one vec4 without any
3622           * other modifiers to add on.
3623           *
3624           * nir_lower_tex() only handles the lowering on a sampler-dim basis, so
3625           * if we get any funny projectors then we just blow them all away.
3626           */
3627          if (tex->op != nir_texop_tex || has_lod || has_offset || (tex->coord_components >= 3 && has_compare))
3628             lower_tex_options.lower_txp |= 1 << tex->sampler_dim;
3629       }
3630    }
3631 
3632    /* nir_lower_tex must be run even if no options are set, because we need the
3633     * LOD to be set for query_levels and for non-fragment shaders.
3634     */
3635    NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
3636 }
3637 
3638 static bool
3639 nir_lower_primid_sysval_to_input_filter(const nir_instr *instr, const void *_data)
3640 {
3641    return (instr->type == nir_instr_type_intrinsic &&
3642            nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_primitive_id);
3643 }
3644 
3645 static nir_ssa_def *
3646 nir_lower_primid_sysval_to_input_lower(nir_builder *b, nir_instr *instr, void *data)
3647 {
3648    nir_variable *var = *(nir_variable **)data;
3649    if (!var) {
3650       var = nir_variable_create(b->shader, nir_var_shader_in, glsl_uint_type(), "gl_PrimitiveID");
3651       var->data.location = VARYING_SLOT_PRIMITIVE_ID;
3652       b->shader->info.inputs_read |= VARYING_BIT_PRIMITIVE_ID;
3653       var->data.driver_location = b->shader->num_inputs++;
3654 
3655       *(nir_variable **)data = var;
3656    }
3657 
3658    nir_io_semantics semantics = {
3659       .location = var->data.location,
3660       .num_slots = 1
3661    };
3662    return nir_load_input(b, 1, 32, nir_imm_int(b, 0),
3663                          .base = var->data.driver_location,
3664                          .io_semantics = semantics);
3665 }
3666 
3667 static bool
3668 nir_lower_primid_sysval_to_input(nir_shader *s)
3669 {
3670    nir_variable *input = NULL;
3671 
3672    return nir_shader_lower_instructions(s,
3673                                         nir_lower_primid_sysval_to_input_filter,
3674                                         nir_lower_primid_sysval_to_input_lower, &input);
3675 }
3676 
3677 const void *
3678 nir_to_tgsi(struct nir_shader *s,
3679             struct pipe_screen *screen)
3680 {
3681    static const struct nir_to_tgsi_options default_ntt_options = {0};
3682    return nir_to_tgsi_options(s, screen, &default_ntt_options);
3683 }
3684 
3685 /* Prevent lower_vec_to_mov from coalescing 64-to-32 conversions and comparisons
3686  * into unsupported channels of registers.
3687  */
3688 static bool
3689 ntt_vec_to_mov_writemask_cb(const nir_instr *instr, unsigned writemask, UNUSED const void *_data)
3690 {
3691    if (instr->type != nir_instr_type_alu)
3692       return false;
3693 
3694    nir_alu_instr *alu = nir_instr_as_alu(instr);
3695    int dst_32 = nir_dest_bit_size(alu->dest.dest) == 32;
3696    int src_64 = nir_src_bit_size(alu->src[0].src) == 64;
3697 
3698    if (src_64 && dst_32) {
3699       int num_srcs = nir_op_infos[alu->op].num_inputs;
3700 
3701       if (num_srcs == 2 || nir_op_infos[alu->op].output_type == nir_type_bool32) {
3702          /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz
3703           * instead of .xy.  Just support scalar compares storing to .x,
3704           * GLSL-to-TGSI only ever emitted scalar ops anyway.
3705           */
3706         if (writemask != TGSI_WRITEMASK_X)
3707            return false;
3708       } else {
3709          /* TGSI's 64-to-32-bit conversions can only store to .xy (since a TGSI
3710           * register can only store a dvec2).  Don't try to coalesce to write to
3711           * .zw.
3712           */
3713          if (writemask & ~(TGSI_WRITEMASK_XY))
3714             return false;
3715       }
3716    }
3717 
3718    return true;
3719 }
3720 
3721 /**
3722  * Translates the NIR shader to TGSI.
3723  *
3724  * This requires some lowering of the NIR shader to prepare it for translation.
3725  * We take ownership of the NIR shader passed, returning a reference to the new
3726  * TGSI tokens instead.  If you need to keep the NIR, then pass us a clone.
3727  */
3728 const void *nir_to_tgsi_options(struct nir_shader *s,
3729                                 struct pipe_screen *screen,
3730                                 const struct nir_to_tgsi_options *options)
3731 {
3732    struct ntt_compile *c;
3733    const void *tgsi_tokens;
3734    nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
3735    bool native_integers = screen->get_shader_param(screen,
3736                                                    pipe_shader_type_from_mesa(s->info.stage),
3737                                                    PIPE_SHADER_CAP_INTEGERS);
3738    const struct nir_shader_compiler_options *original_options = s->options;
3739 
3740    ntt_fix_nir_options(screen, s, options);
3741 
3742    /* Lower array indexing on FS inputs.  Since we don't set
3743     * ureg->supports_any_inout_decl_range, the TGSI input decls will be split to
3744     * elements by ureg, and so dynamically indexing them would be invalid.
3745     * Ideally we would set that ureg flag based on
3746     * PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE, but can't due to mesa/st
3747     * splitting NIR VS outputs to elements even if the FS doesn't get the
3748     * corresponding splitting, and virgl depends on TGSI across link boundaries
3749     * having matching declarations.
3750     */
3751    if (s->info.stage == MESA_SHADER_FRAGMENT) {
3752       NIR_PASS_V(s, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX);
3753       NIR_PASS_V(s, nir_remove_dead_variables, nir_var_shader_in, NULL);
3754    }
3755 
3756    NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
3757               type_size, (nir_lower_io_options)0);
3758    NIR_PASS_V(s, nir_lower_regs_to_ssa);
3759 
3760    nir_to_tgsi_lower_txp(s);
3761    NIR_PASS_V(s, nir_to_tgsi_lower_tex);
3762 
3763    /* While TGSI can represent PRIMID as either an input or a system value,
3764     * glsl-to-tgsi had the GS (not TCS or TES) primid as an input, and drivers
3765     * depend on that.
3766     */
3767    if (s->info.stage == MESA_SHADER_GEOMETRY)
3768       NIR_PASS_V(s, nir_lower_primid_sysval_to_input);
3769 
3770    if (s->info.num_abos)
3771       NIR_PASS_V(s, ntt_lower_atomic_pre_dec);
3772 
3773    if (!original_options->lower_uniforms_to_ubo) {
3774       NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
3775                  screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
3776                  !native_integers);
3777    }
3778 
3779    /* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --
3780     * TGSI stores up to a vec2 in each slot, so to avoid a whole bunch of op
3781     * duplication logic we just make it so that we only see vec2s.
3782     */
3783    NIR_PASS_V(s, nir_lower_alu_to_scalar, scalarize_64bit, NULL);
3784    NIR_PASS_V(s, nir_to_tgsi_lower_64bit_to_vec2);
3785 
3786    if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
3787       NIR_PASS_V(s, nir_lower_ubo_vec4);
3788 
3789    ntt_optimize_nir(s, screen);
3790 
3791    NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);
3792 
3793    bool progress;
3794    do {
3795       progress = false;
3796       NIR_PASS(progress, s, nir_opt_algebraic_late);
3797       if (progress) {
3798          NIR_PASS_V(s, nir_copy_prop);
3799          NIR_PASS_V(s, nir_opt_dce);
3800          NIR_PASS_V(s, nir_opt_cse);
3801       }
3802    } while (progress);
3803 
3804    if (screen->get_shader_param(screen,
3805                                 pipe_shader_type_from_mesa(s->info.stage),
3806                                 PIPE_SHADER_CAP_INTEGERS)) {
3807       NIR_PASS_V(s, nir_lower_bool_to_int32);
3808    } else {
3809       NIR_PASS_V(s, nir_lower_int_to_float);
3810       NIR_PASS_V(s, nir_lower_bool_to_float);
3811       /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
3812       NIR_PASS_V(s, nir_copy_prop);
3813       NIR_PASS_V(s, nir_opt_dce);
3814    }
3815 
3816    nir_move_options move_all =
3817        nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
3818        nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
3819 
3820    NIR_PASS_V(s, nir_opt_move, move_all);
3821 
3822    /* Only lower 32-bit floats.  The only other modifier type officially
3823     * supported by TGSI is 32-bit integer negates, but even those are broken on
3824     * virglrenderer, so skip lowering all integer and f64 float mods.
3825     *
3826     * The options->lower_fabs requests that we not have native source modifiers
3827     * for fabs, and instead emit MAX(a,-a) for nir_op_fabs.
3828     */
3829    nir_lower_to_source_mods_flags source_mods = nir_lower_fneg_source_mods;
3830    if (!options->lower_fabs)
3831       source_mods |= nir_lower_fabs_source_mods;
3832    NIR_PASS_V(s, nir_lower_to_source_mods, source_mods);
3833 
3834    NIR_PASS_V(s, nir_convert_from_ssa, true);
3835    NIR_PASS_V(s, nir_lower_vec_to_movs, ntt_vec_to_mov_writemask_cb, NULL);
3836 
3837    /* locals_to_regs will leave dead derefs that are good to clean up. */
3838    NIR_PASS_V(s, nir_lower_locals_to_regs);
3839    NIR_PASS_V(s, nir_opt_dce);
3840 
3841    if (NIR_DEBUG(TGSI)) {
3842       fprintf(stderr, "NIR before translation to TGSI:\n");
3843       nir_print_shader(s, stderr);
3844    }
3845 
3846    c = rzalloc(NULL, struct ntt_compile);
3847    c->screen = screen;
3848    c->options = options;
3849 
3850    c->needs_texcoord_semantic =
3851       screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD);
3852    c->has_txf_lz =
3853       screen->get_param(screen, PIPE_CAP_TGSI_TEX_TXF_LZ);
3854 
3855    c->s = s;
3856    c->native_integers = native_integers;
3857    c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
3858    ureg_setup_shader_info(c->ureg, &s->info);
3859    if (s->info.use_legacy_math_rules && screen->get_param(screen, PIPE_CAP_LEGACY_MATH_RULES))
3860       ureg_property(c->ureg, TGSI_PROPERTY_LEGACY_MATH_RULES, 1);
3861 
3862    if (s->info.stage == MESA_SHADER_FRAGMENT) {
3863       /* The draw module's polygon stipple layer doesn't respect the chosen
3864        * coordinate mode, so leave it as unspecified unless we're actually
3865        * reading the position in the shader already.  See
3866        * gl-2.1-polygon-stipple-fs on softpipe.
3867        */
3868       if ((s->info.inputs_read & VARYING_BIT_POS) ||
3869           BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
3870          ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
3871                        s->info.fs.origin_upper_left ?
3872                        TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
3873                        TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
3874 
3875          ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
3876                        s->info.fs.pixel_center_integer ?
3877                        TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
3878                        TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
3879       }
3880    }
3881    /* Emit the main function */
3882    nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
3883    ntt_emit_impl(c, impl);
3884    ureg_END(c->ureg);
3885 
3886    tgsi_tokens = ureg_get_tokens(c->ureg, NULL);
3887 
3888    if (NIR_DEBUG(TGSI)) {
3889       fprintf(stderr, "TGSI after translation from NIR:\n");
3890       tgsi_dump(tgsi_tokens, 0);
3891    }
3892 
3893    ureg_destroy(c->ureg);
3894 
3895    ralloc_free(c);
3896    ralloc_free(s);
3897 
3898    return tgsi_tokens;
3899 }
3900 
3901 static const nir_shader_compiler_options nir_to_tgsi_compiler_options = {
3902    .fdot_replicates = true,
3903    .fuse_ffma32 = true,
3904    .fuse_ffma64 = true,
3905    .lower_extract_byte = true,
3906    .lower_extract_word = true,
3907    .lower_insert_byte = true,
3908    .lower_insert_word = true,
3909    .lower_fdph = true,
3910    .lower_flrp64 = true,
3911    .lower_fmod = true,
3912    .lower_rotate = true,
3913    .lower_uniforms_to_ubo = true,
3914    .lower_uadd_sat = true,
3915    .lower_usub_sat = true,
3916    .lower_vector_cmp = true,
3917    .lower_int64_options = nir_lower_imul_2x32_64,
3918    .use_interpolated_input_intrinsics = true,
3919 };
3920 
3921 /* Returns default compiler options for drivers with only nir-to-tgsi-based
3922  * NIR support.
3923  */
3924 const void *
3925 nir_to_tgsi_get_compiler_options(struct pipe_screen *pscreen,
3926                                  enum pipe_shader_ir ir,
3927                                  unsigned shader)
3928 {
3929    assert(ir == PIPE_SHADER_IR_NIR);
3930    return &nir_to_tgsi_compiler_options;
3931 }
3932 
3933 /** Helper for getting TGSI tokens to store for a pipe_shader_state CSO. */
3934 const void *
3935 pipe_shader_state_to_tgsi_tokens(struct pipe_screen *screen,
3936                                  const struct pipe_shader_state *cso)
3937 {
3938    if (cso->type == PIPE_SHADER_IR_NIR) {
3939       return nir_to_tgsi((nir_shader *)cso->ir.nir, screen);
3940    } else {
3941       assert(cso->type == PIPE_SHADER_IR_TGSI);
3942       /* we need to keep a local copy of the tokens */
3943       return tgsi_dup_tokens(cso->tokens);
3944    }
3945 }
3946