1 /*
2  * Copyright © 2014-2015 Broadcom
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "nir_to_rc.h"
7 #include "compiler/nir/nir.h"
8 #include "compiler/nir/nir_deref.h"
9 #include "compiler/nir/nir_legacy.h"
10 #include "compiler/nir/nir_worklist.h"
11 #include "compiler/radeon_code.h"
12 #include "compiler/radeon_program_constants.h"
13 #include "r300_nir.h"
14 #include "r300_screen.h"
15 #include "pipe/p_screen.h"
16 #include "pipe/p_state.h"
17 #include "tgsi/tgsi_dump.h"
18 #include "tgsi/tgsi_from_mesa.h"
19 #include "tgsi/tgsi_info.h"
20 #include "tgsi/tgsi_parse.h"
21 #include "tgsi/tgsi_ureg.h"
22 #include "tgsi/tgsi_util.h"
23 #include "util/u_debug.h"
24 #include "util/u_dynarray.h"
25 #include "util/u_math.h"
26 #include "util/u_memory.h"
27 #include "r300_nir.h"
28 #include "r300_screen.h"
29 
30 struct ntr_insn {
31    enum tgsi_opcode opcode;
32    struct ureg_dst dst[2];
33    struct ureg_src src[4];
34    enum tgsi_texture_type tex_target;
35    enum tgsi_return_type tex_return_type;
36    struct tgsi_texture_offset tex_offset[4];
37 
38    unsigned mem_qualifier;
39    enum pipe_format mem_format;
40 
41    bool is_tex : 1;
42    bool precise : 1;
43 };
44 
45 struct ntr_block {
46    /* Array of struct ntr_insn */
47    struct util_dynarray insns;
48    int start_ip;
49    int end_ip;
50 };
51 
52 struct ntr_reg_interval {
53    uint32_t start, end;
54 };
55 
56 struct ntr_compile {
57    nir_shader *s;
58    nir_function_impl *impl;
59    struct pipe_screen *screen;
60    struct ureg_program *ureg;
61 
62    /* Options */
63    bool lower_fabs;
64 
65    bool addr_declared[3];
66    struct ureg_dst addr_reg[3];
67 
68    /* if condition set up at the end of a block, for ntr_emit_if(). */
69    struct ureg_src if_cond;
70 
71    /* TGSI temps for our NIR SSA and register values. */
72    struct ureg_dst *reg_temp;
73    struct ureg_src *ssa_temp;
74 
75    struct ntr_reg_interval *liveness;
76 
77    /* Map from nir_block to ntr_block */
78    struct hash_table *blocks;
79    struct ntr_block *cur_block;
80    unsigned current_if_else;
81    unsigned cf_label;
82 
83    /* Whether we're currently emitting instructions for a precise NIR instruction. */
84    bool precise;
85 
86    unsigned num_temps;
87 
88    /* Mappings from driver_location to TGSI input/output number.
89     *
90     * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
91     * their numbers assigned incrementally, unlike inputs or constants.
92     */
93    struct ureg_src *input_index_map;
94    uint64_t centroid_inputs;
95 
96    uint32_t first_ubo;
97 };
98 
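/* Allocates a fresh temporary in the compiler's virtual TGSI temp space; the
 * actual TGSI temp declarations are emitted later (see
 * ntr_allocate_regs_unoptimized()).
 */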
99 static struct ureg_dst
100 ntr_temp(struct ntr_compile *c)
101 {
102    return ureg_dst_register(TGSI_FILE_TEMPORARY, c->num_temps++);
103 }
104 
105 static struct ntr_block *
106 ntr_block_from_nir(struct ntr_compile *c, struct nir_block *block)
107 {
108    struct hash_entry *entry = _mesa_hash_table_search(c->blocks, block);
109    return entry->data;
110 }
111 
112 static void ntr_emit_cf_list(struct ntr_compile *c, struct exec_list *list);
113 static void ntr_emit_cf_list_ureg(struct ntr_compile *c, struct exec_list *list);
114 
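/* Appends an instruction to the current ntr_block and returns a pointer to it
 * so callers (e.g. texture emission) can fill in extra fields afterwards.
 */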
115 static struct ntr_insn *
116 ntr_insn(struct ntr_compile *c, enum tgsi_opcode opcode, struct ureg_dst dst, struct ureg_src src0,
117          struct ureg_src src1, struct ureg_src src2, struct ureg_src src3)
118 {
119    struct ntr_insn insn = {
120       .opcode = opcode,
121       .dst = {dst, ureg_dst_undef()},
122       .src = {src0, src1, src2, src3},
123       .precise = c->precise,
124    };
125    util_dynarray_append(&c->cur_block->insns, struct ntr_insn, insn);
126    return util_dynarray_top_ptr(&c->cur_block->insns, struct ntr_insn);
127 }
128 
129 #define OP00(op)                                                                                   \
130    static inline void ntr_##op(struct ntr_compile *c)                                              \
131    {                                                                                               \
132       ntr_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), ureg_src_undef(), ureg_src_undef(),          \
133                ureg_src_undef(), ureg_src_undef());                                                \
134    }
135 
136 #define OP01(op)                                                                                   \
137    static inline void ntr_##op(struct ntr_compile *c, struct ureg_src src0)                        \
138    {                                                                                               \
139       ntr_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), src0, ureg_src_undef(), ureg_src_undef(),    \
140                ureg_src_undef());                                                                  \
141    }
142 
143 #define OP10(op)                                                                                   \
144    static inline void ntr_##op(struct ntr_compile *c, struct ureg_dst dst)                         \
145    {                                                                                               \
146       ntr_insn(c, TGSI_OPCODE_##op, dst, ureg_src_undef(), ureg_src_undef(), ureg_src_undef(),     \
147                ureg_src_undef());                                                                  \
148    }
149 
150 #define OP11(op)                                                                                   \
151    static inline void ntr_##op(struct ntr_compile *c, struct ureg_dst dst, struct ureg_src src0)   \
152    {                                                                                               \
153       ntr_insn(c, TGSI_OPCODE_##op, dst, src0, ureg_src_undef(), ureg_src_undef(),                 \
154                ureg_src_undef());                                                                  \
155    }
156 
157 #define OP12(op)                                                                                   \
158    static inline void ntr_##op(struct ntr_compile *c, struct ureg_dst dst, struct ureg_src src0,   \
159                                struct ureg_src src1)                                               \
160    {                                                                                               \
161       ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, ureg_src_undef(), ureg_src_undef());          \
162    }
163 
164 #define OP13(op)                                                                                   \
165    static inline void ntr_##op(struct ntr_compile *c, struct ureg_dst dst, struct ureg_src src0,   \
166                                struct ureg_src src1, struct ureg_src src2)                         \
167    {                                                                                               \
168       ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, ureg_src_undef());                      \
169    }
170 
171 #define OP14(op)                                                                                   \
172    static inline void ntr_##op(struct ntr_compile *c, struct ureg_dst dst, struct ureg_src src0,   \
173                                struct ureg_src src1, struct ureg_src src2, struct ureg_src src3)   \
174    {                                                                                               \
175       ntr_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, src3);                                  \
176    }
177 
178 /* We hand-craft our tex instructions */
179 #define OP12_TEX(op)
180 #define OP14_TEX(op)
181 
182 /* Use a template include to generate a correctly-typed ntr_OP()
183  * function for each TGSI opcode:
184  */
185 #include "gallium/auxiliary/tgsi/tgsi_opcode_tmp.h"
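/* e.g. a 1-dst/2-src opcode listed as OP12(ADD) in that header becomes
 * ntr_ADD(c, dst, src0, src1), which queues a TGSI_OPCODE_ADD via ntr_insn().
 */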
186 
187 /**
188  * Interprets a nir_load_const used as a NIR src as a uint.
189  *
190  * For non-native-integers drivers, nir_load_const_instrs used by an integer ALU
191  * instruction (or in a phi-web used by an integer ALU instruction) were
192  * converted to floats and the ALU instruction swapped to the float equivalent.
193  * However, this means that integer load_consts used by intrinsics (which don't
194  * normally get that conversion) may have been reformatted to be floats.  Given
195  * that all of our intrinsic nir_src_as_uint() calls are expected to be small,
196  * we can just look and see if they look like floats and convert them back to
197  * ints.
198  */
199 static uint32_t
200 ntr_src_as_uint(struct ntr_compile *c, nir_src src)
201 {
202    uint32_t val = nir_src_as_uint(src);
203    if (val >= fui(1.0))
204       val = (uint32_t)uif(val);
205    return val;
206 }
207 
208 /* Per-channel masks of def/use within the block, and the per-channel
209  * livein/liveout for the block as a whole.
210  */
211 struct ntr_live_reg_block_state {
212    uint8_t *def, *use, *livein, *liveout, *defin, *defout;
213 };
214 
215 struct ntr_live_reg_state {
216    unsigned bitset_words;
217 
218    struct ntr_reg_interval *regs;
219 
220    /* Used in propagate_across_edge() */
221    BITSET_WORD *tmp_live;
222 
223    struct ntr_live_reg_block_state *blocks;
224 
225    nir_block_worklist worklist;
226 };
227 
228 static void
229 ntr_allocate_regs_unoptimized(struct ntr_compile *c, nir_function_impl *impl)
230 {
231    for (int i = 0; i < c->num_temps; i++)
232       ureg_DECL_temporary(c->ureg);
233 }
234 
235 /* TGSI varying declarations have a component usage mask associated (used by
236  * r600 and svga).
237  */
238 static uint32_t
239 ntr_tgsi_var_usage_mask(const struct nir_variable *var)
240 {
241    const struct glsl_type *type_without_array = glsl_without_array(var->type);
242    unsigned num_components = glsl_get_vector_elements(type_without_array);
243    if (num_components == 0) /* structs */
244       num_components = 4;
245 
246    return u_bit_consecutive(var->data.location_frac, num_components);
247 }
248 
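/* Declares the TGSI output (fragment result or varying) for a
 * store_output/load_output intrinsic and returns it with the appropriate
 * writemask.  The intrinsic's component offset is returned through *frac.
 */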
249 static struct ureg_dst
250 ntr_output_decl(struct ntr_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
251 {
252    nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
253    int base = nir_intrinsic_base(instr);
254    *frac = nir_intrinsic_component(instr);
255 
256    struct ureg_dst out;
257    if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
258       unsigned semantic_name, semantic_index;
259       tgsi_get_gl_frag_result_semantic(semantics.location, &semantic_name, &semantic_index);
260       semantic_index += semantics.dual_source_blend_index;
261 
262       switch (semantics.location) {
263       case FRAG_RESULT_DEPTH:
264          *frac = 2; /* z write is to the .z channel in TGSI */
265          break;
266       case FRAG_RESULT_STENCIL:
267          *frac = 1;
268          break;
269       default:
270          break;
271       }
272 
273       out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
274    } else {
275       unsigned semantic_name, semantic_index;
276 
277       tgsi_get_gl_varying_semantic(semantics.location, true, &semantic_name, &semantic_index);
278 
279       uint32_t usage_mask = u_bit_consecutive(*frac, instr->num_components);
280       uint32_t gs_streams = semantics.gs_streams;
281       for (int i = 0; i < 4; i++) {
282          if (!(usage_mask & (1 << i)))
283             gs_streams &= ~(0x3 << 2 * i);
284       }
285 
286       /* No driver appears to use array_id of outputs. */
287       unsigned array_id = 0;
288 
289       /* This bit is lost in the i/o semantics, but it's unused in in-tree
290        * drivers.
291        */
292       bool invariant = semantics.invariant;
293 
294       out = ureg_DECL_output_layout(c->ureg, semantic_name, semantic_index, gs_streams, base,
295                                     usage_mask, array_id, semantics.num_slots, invariant);
296    }
297 
298    unsigned write_mask;
299    if (nir_intrinsic_has_write_mask(instr))
300       write_mask = nir_intrinsic_write_mask(instr);
301    else
302       write_mask = ((1 << instr->num_components) - 1) << *frac;
303 
304    write_mask = write_mask << *frac;
305    return ureg_writemask(out, write_mask);
306 }
307 
308 static bool
309 ntr_try_store_in_tgsi_output_with_use(struct ntr_compile *c, struct ureg_dst *dst, nir_src *src)
310 {
311    *dst = ureg_dst_undef();
312 
313    if (nir_src_is_if(src))
314       return false;
315 
316    if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
317       return false;
318 
319    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(nir_src_parent_instr(src));
320    if (intr->intrinsic != nir_intrinsic_store_output || !nir_src_is_const(intr->src[1])) {
321       return false;
322    }
323 
324    uint32_t frac;
325    *dst = ntr_output_decl(c, intr, &frac);
326    dst->Index += ntr_src_as_uint(c, intr->src[1]);
327 
328    return frac == 0;
329 }
330 
331 /* If this reg is used only for storing an output, then in the simple
332  * cases we can write directly to the TGSI output instead of having
333  * store_output emit its own MOV.
334  */
335 static bool
336 ntr_try_store_reg_in_tgsi_output(struct ntr_compile *c, struct ureg_dst *dst,
337                                  nir_intrinsic_instr *reg_decl)
338 {
339    assert(reg_decl->intrinsic == nir_intrinsic_decl_reg);
340 
341    *dst = ureg_dst_undef();
342 
343    /* Look for a single use for try_store_in_tgsi_output */
344    nir_src *use = NULL;
345    nir_foreach_reg_load (src, reg_decl) {
346       nir_intrinsic_instr *load = nir_instr_as_intrinsic(nir_src_parent_instr(src));
347       nir_foreach_use_including_if (load_use, &load->def) {
348          /* We can only have one use */
349          if (use != NULL)
350             return false;
351 
352          use = load_use;
353       }
354    }
355 
356    if (use == NULL)
357       return false;
358 
359    return ntr_try_store_in_tgsi_output_with_use(c, dst, use);
360 }
361 
362 /* If this SSA def is used only for storing an output, then in the simple
363  * cases we can write directly to the TGSI output instead of having
364  * store_output emit its own MOV.
365  */
366 static bool
367 ntr_try_store_ssa_in_tgsi_output(struct ntr_compile *c, struct ureg_dst *dst, nir_def *def)
368 {
369    *dst = ureg_dst_undef();
370 
371    if (!list_is_singular(&def->uses))
372       return false;
373 
374    nir_foreach_use_including_if (use, def) {
375       return ntr_try_store_in_tgsi_output_with_use(c, dst, use);
376    }
377    unreachable("We have one use");
378 }
379 
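/* Declares TGSI fragment shader inputs for the NIR shader-in variables and
 * fills c->input_index_map so loads can look up the ureg_src by
 * driver_location.
 */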
380 static void
381 ntr_setup_inputs(struct ntr_compile *c)
382 {
383    if (c->s->info.stage != MESA_SHADER_FRAGMENT)
384       return;
385 
386    unsigned num_inputs = 0;
387    int num_input_arrays = 0;
388 
389    nir_foreach_shader_in_variable (var, c->s) {
390       const struct glsl_type *type = var->type;
391       unsigned array_len = glsl_count_attribute_slots(type, false);
392 
393       num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
394    }
395 
396    c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);
397 
398    nir_foreach_shader_in_variable (var, c->s) {
399       const struct glsl_type *type = var->type;
400       unsigned array_len = glsl_count_attribute_slots(type, false);
401 
402       unsigned interpolation = TGSI_INTERPOLATE_CONSTANT;
403       unsigned sample_loc;
404       struct ureg_src decl;
405 
406       if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
407          interpolation = tgsi_get_interp_mode(
408             var->data.interpolation,
409             var->data.location == VARYING_SLOT_COL0 || var->data.location == VARYING_SLOT_COL1);
410 
411          if (var->data.location == VARYING_SLOT_POS)
412             interpolation = TGSI_INTERPOLATE_LINEAR;
413       }
414 
415       unsigned semantic_name, semantic_index;
416       tgsi_get_gl_varying_semantic(var->data.location, true, &semantic_name, &semantic_index);
417 
418       if (var->data.sample) {
419          sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
420       } else if (var->data.centroid) {
421          sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
422          c->centroid_inputs |= (BITSET_MASK(array_len) << var->data.driver_location);
423       } else {
424          sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
425       }
426 
427       unsigned array_id = 0;
428       if (glsl_type_is_array(type))
429          array_id = ++num_input_arrays;
430 
431       uint32_t usage_mask = ntr_tgsi_var_usage_mask(var);
432 
433       decl = ureg_DECL_fs_input_centroid_layout(
434          c->ureg, semantic_name, semantic_index, interpolation, sample_loc,
435          var->data.driver_location, usage_mask, array_id, array_len);
436 
437       if (semantic_name == TGSI_SEMANTIC_FACE) {
438          struct ureg_dst temp = ntr_temp(c);
439          /* tgsi docs say that floating point FACE will be positive for
440           * frontface and negative for backface, but realistically
441           * GLSL-to-TGSI had been doing MOV_SAT to turn it into 0.0 vs 1.0.
442           * Copy that behavior, since some drivers (r300) have been doing a
443           * 0.0 vs 1.0 backface (and I don't think anybody has a non-1.0
444           * front face).
445           */
446          temp.Saturate = true;
447          ntr_MOV(c, temp, decl);
448          decl = ureg_src(temp);
449       }
450 
451       for (unsigned i = 0; i < array_len; i++) {
452          c->input_index_map[var->data.driver_location + i] = decl;
453          c->input_index_map[var->data.driver_location + i].Index += i;
454       }
455    }
456 }
457 
458 static int
459 ntr_sort_by_location(const nir_variable *a, const nir_variable *b)
460 {
461    return a->data.location - b->data.location;
462 }
463 
464 /**
465  * Workaround for virglrenderer requiring that TGSI FS output color variables
466  * are declared in order.  Besides, it's a lot nicer to read the TGSI this way.
467  */
468 static void
469 ntr_setup_outputs(struct ntr_compile *c)
470 {
471    if (c->s->info.stage != MESA_SHADER_FRAGMENT)
472       return;
473 
474    nir_sort_variables_with_modes(c->s, ntr_sort_by_location, nir_var_shader_out);
475 
476    nir_foreach_shader_out_variable (var, c->s) {
477       if (var->data.location == FRAG_RESULT_COLOR)
478          ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);
479 
480       unsigned semantic_name, semantic_index;
481       tgsi_get_gl_frag_result_semantic(var->data.location, &semantic_name, &semantic_index);
482 
483       (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index);
484    }
485 }
486 
487 static enum tgsi_texture_type
488 tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array)
489 {
490    switch (dim) {
491    case GLSL_SAMPLER_DIM_1D:
492       return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
493    case GLSL_SAMPLER_DIM_2D:
494    case GLSL_SAMPLER_DIM_EXTERNAL:
495       return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
496    case GLSL_SAMPLER_DIM_3D:
497       return TGSI_TEXTURE_3D;
498    case GLSL_SAMPLER_DIM_CUBE:
499       return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
500    case GLSL_SAMPLER_DIM_RECT:
501       return TGSI_TEXTURE_RECT;
502    case GLSL_SAMPLER_DIM_MS:
503       return is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
504    case GLSL_SAMPLER_DIM_BUF:
505       return TGSI_TEXTURE_BUFFER;
506    default:
507       unreachable("unknown sampler dim");
508    }
509 }
510 
511 static enum tgsi_return_type
512 tgsi_return_type_from_base_type(enum glsl_base_type type)
513 {
514    switch (type) {
515    case GLSL_TYPE_INT:
516       return TGSI_RETURN_TYPE_SINT;
517    case GLSL_TYPE_UINT:
518       return TGSI_RETURN_TYPE_UINT;
519    case GLSL_TYPE_FLOAT:
520       return TGSI_RETURN_TYPE_FLOAT;
521    default:
522       unreachable("unexpected texture type");
523    }
524 }
525 
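/* Declares sampler views and samplers for the sampler/texture uniforms, and
 * constant buffers sized from the UBO variables (non-sampler uniforms were
 * already lowered to a UBO).
 */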
526 static void
527 ntr_setup_uniforms(struct ntr_compile *c)
528 {
529    nir_foreach_uniform_variable (var, c->s) {
530       if (glsl_type_is_sampler(glsl_without_array(var->type)) ||
531           glsl_type_is_texture(glsl_without_array(var->type))) {
532          /* Don't use this size for the check for samplers -- arrays of structs
533           * containing samplers should be ignored, and just the separate lowered
534           * sampler uniform decl used.
535           */
536          int size = glsl_type_get_sampler_count(var->type) + glsl_type_get_texture_count(var->type);
537 
538          const struct glsl_type *stype = glsl_without_array(var->type);
539          enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(
540             glsl_get_sampler_dim(stype), glsl_sampler_type_is_array(stype));
541          enum tgsi_return_type ret_type =
542             tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype));
543          for (int i = 0; i < size; i++) {
544             ureg_DECL_sampler_view(c->ureg, var->data.binding + i, target, ret_type, ret_type,
545                                    ret_type, ret_type);
546             ureg_DECL_sampler(c->ureg, var->data.binding + i);
547          }
548 
549          /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0
550           * size declaration happens with other UBOs below.
551           */
552       }
553    }
554 
555    c->first_ubo = ~0;
556 
557    unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0};
558    nir_foreach_variable_with_modes (var, c->s, nir_var_mem_ubo) {
559       int ubo = var->data.driver_location;
560       if (ubo == -1)
561          continue;
562 
563       if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo))
564          c->first_ubo = MIN2(c->first_ubo, ubo);
565 
566       unsigned size = glsl_get_explicit_size(var->interface_type, false);
567       ubo_sizes[ubo] = size;
568    }
569 
570    for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) {
571       if (ubo_sizes[i])
572          ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i);
573    }
574 }
575 
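/* Allocates a virtual TGSI temp for each NIR register declaration, or aliases
 * it directly to a TGSI output when the register's only use is to feed a
 * single store_output.
 */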
576 static void
577 ntr_setup_registers(struct ntr_compile *c)
578 {
579    assert(c->num_temps == 0);
580 
581    /* After that, allocate non-array regs in our virtual space that we'll
582     * register-allocate before ureg emit.
583     */
584    nir_foreach_reg_decl_safe (nir_reg, nir_shader_get_entrypoint(c->s)) {
585       assert(nir_intrinsic_num_array_elems(nir_reg) == 0);
586       unsigned num_components = nir_intrinsic_num_components(nir_reg);
587       unsigned index = nir_reg->def.index;
588 
589       struct ureg_dst decl;
590       uint32_t write_mask = BITFIELD_MASK(num_components);
591 
592       if (!ntr_try_store_reg_in_tgsi_output(c, &decl, nir_reg)) {
593          decl = ureg_writemask(ntr_temp(c), write_mask);
594       }
595       c->reg_temp[index] = decl;
596    }
597 }
598 
599 static struct ureg_src
600 ntr_get_load_const_src(struct ntr_compile *c, nir_load_const_instr *instr)
601 {
602    int num_components = instr->def.num_components;
603 
604    float values[4];
605    assert(instr->def.bit_size == 32);
606    for (int i = 0; i < num_components; i++)
607       values[i] = uif(instr->value[i].u32);
608 
609    return ureg_DECL_immediate(c->ureg, values, num_components);
610 }
611 
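/* Loads an indirect-addressing value into one of the TGSI address registers
 * (declaring it on first use) and returns its .x scalar for use as a
 * relative-addressing source.
 */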
612 static struct ureg_src
613 ntr_reladdr(struct ntr_compile *c, struct ureg_src addr, int addr_index)
614 {
615    assert(addr_index < ARRAY_SIZE(c->addr_reg));
616 
617    for (int i = 0; i <= addr_index; i++) {
618       if (!c->addr_declared[i]) {
619          c->addr_reg[i] = ureg_writemask(ureg_DECL_address(c->ureg), TGSI_WRITEMASK_X);
620          c->addr_declared[i] = true;
621       }
622    }
623 
624    ntr_ARL(c, c->addr_reg[addr_index], addr);
625    return ureg_scalar(ureg_src(c->addr_reg[addr_index]), 0);
626 }
627 
628 /* Forward declare for recursion with indirects */
629 static struct ureg_src ntr_get_src(struct ntr_compile *c, nir_src src);
630 
631 static struct ureg_src
632 ntr_get_chased_src(struct ntr_compile *c, nir_legacy_src *src)
633 {
634    if (src->is_ssa) {
635       if (src->ssa->parent_instr->type == nir_instr_type_load_const)
636          return ntr_get_load_const_src(c, nir_instr_as_load_const(src->ssa->parent_instr));
637 
638       return c->ssa_temp[src->ssa->index];
639    } else {
640       struct ureg_dst reg_temp = c->reg_temp[src->reg.handle->index];
641       reg_temp.Index += src->reg.base_offset;
642 
643       if (src->reg.indirect) {
644          struct ureg_src offset = ntr_get_src(c, nir_src_for_ssa(src->reg.indirect));
645          return ureg_src_indirect(ureg_src(reg_temp), ntr_reladdr(c, offset, 0));
646       } else {
647          return ureg_src(reg_temp);
648       }
649    }
650 }
651 
652 static struct ureg_src
653 ntr_get_src(struct ntr_compile *c, nir_src src)
654 {
655    nir_legacy_src chased = nir_legacy_chase_src(&src);
656    return ntr_get_chased_src(c, &chased);
657 }
658 
659 static struct ureg_src
660 ntr_get_alu_src(struct ntr_compile *c, nir_alu_instr *instr, int i)
661 {
662    /* We only support 32-bit float modifiers.  The only other modifier type
663     * officially supported by TGSI is 32-bit integer negates, but even those are
664     * broken on virglrenderer, so skip lowering all integer and f64 float mods.
665     *
666     * The lower_fabs requests that we not have native source modifiers
667     * for fabs, and instead emit MAX(a,-a) for nir_op_fabs.
668     */
669    nir_legacy_alu_src src = nir_legacy_chase_alu_src(&instr->src[i], !c->lower_fabs);
670    struct ureg_src usrc = ntr_get_chased_src(c, &src.src);
671 
672    usrc = ureg_swizzle(usrc, src.swizzle[0], src.swizzle[1], src.swizzle[2], src.swizzle[3]);
673 
674    if (src.fabs)
675       usrc = ureg_abs(usrc);
676    if (src.fneg)
677       usrc = ureg_negate(usrc);
678 
679    return usrc;
680 }
681 
682 /* Reswizzles a source so that the unset channels in the write mask still refer
683  * to one of the channels present in the write mask.
684  */
685 static struct ureg_src
686 ntr_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
687 {
688    assert(write_mask);
689    int first_chan = ffs(write_mask) - 1;
690    return ureg_swizzle(src, (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : first_chan,
691                        (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : first_chan,
692                        (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : first_chan,
693                        (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : first_chan);
694 }
695 
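/* Picks the TGSI storage for an SSA def (a fresh temp, or directly a TGSI
 * output when possible), records the swizzled source for later reads in
 * c->ssa_temp[], and returns the writemasked destination.
 */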
696 static struct ureg_dst
697 ntr_get_ssa_def_decl(struct ntr_compile *c, nir_def *ssa)
698 {
699    uint32_t writemask;
700    /* Fix writemask for nir_intrinsic_load_ubo_vec4 according to uses. */
701    if (ssa->parent_instr->type == nir_instr_type_intrinsic &&
702        nir_instr_as_intrinsic(ssa->parent_instr)->intrinsic == nir_intrinsic_load_ubo_vec4)
703       writemask = nir_def_components_read(ssa);
704    else
705       writemask = BITSET_MASK(ssa->num_components);
706 
707    struct ureg_dst dst;
708    if (!ntr_try_store_ssa_in_tgsi_output(c, &dst, ssa))
709       dst = ntr_temp(c);
710 
711    c->ssa_temp[ssa->index] = ntr_swizzle_for_write_mask(ureg_src(dst), writemask);
712 
713    return ureg_writemask(dst, writemask);
714 }
715 
716 static struct ureg_dst
717 ntr_get_chased_dest_decl(struct ntr_compile *c, nir_legacy_dest *dest)
718 {
719    if (dest->is_ssa)
720       return ntr_get_ssa_def_decl(c, dest->ssa);
721    else
722       return c->reg_temp[dest->reg.handle->index];
723 }
724 
725 static struct ureg_dst
726 ntr_get_chased_dest(struct ntr_compile *c, nir_legacy_dest *dest)
727 {
728    struct ureg_dst dst = ntr_get_chased_dest_decl(c, dest);
729 
730    if (!dest->is_ssa) {
731       dst.Index += dest->reg.base_offset;
732 
733       if (dest->reg.indirect) {
734          struct ureg_src offset = ntr_get_src(c, nir_src_for_ssa(dest->reg.indirect));
735          dst = ureg_dst_indirect(dst, ntr_reladdr(c, offset, 0));
736       }
737    }
738 
739    return dst;
740 }
741 
742 static struct ureg_dst
743 ntr_get_dest(struct ntr_compile *c, nir_def *def)
744 {
745    nir_legacy_dest chased = nir_legacy_chase_dest(def);
746    return ntr_get_chased_dest(c, &chased);
747 }
748 
749 static struct ureg_dst
750 ntr_get_alu_dest(struct ntr_compile *c, nir_def *def)
751 {
752    nir_legacy_alu_dest chased = nir_legacy_chase_alu_dest(def);
753    struct ureg_dst dst = ntr_get_chased_dest(c, &chased.dest);
754 
755    if (chased.fsat)
756       dst.Saturate = true;
757 
758    /* Only registers get write masks */
759    if (chased.dest.is_ssa)
760       return dst;
761 
762    return ureg_writemask(dst, chased.write_mask);
763 }
764 
765 /* For an SSA dest being populated by a constant src, replace the storage with
766  * a copy of the ureg_src.
767  */
768 static void
769 ntr_store_def(struct ntr_compile *c, nir_def *def, struct ureg_src src)
770 {
771    if (!src.Indirect && !src.DimIndirect) {
772       switch (src.File) {
773       case TGSI_FILE_IMMEDIATE:
774       case TGSI_FILE_INPUT:
775       case TGSI_FILE_CONSTANT:
776       case TGSI_FILE_SYSTEM_VALUE:
777          c->ssa_temp[def->index] = src;
778          return;
779       }
780    }
781 
782    ntr_MOV(c, ntr_get_ssa_def_decl(c, def), src);
783 }
784 
785 static void
786 ntr_store(struct ntr_compile *c, nir_def *def, struct ureg_src src)
787 {
788    nir_legacy_dest chased = nir_legacy_chase_dest(def);
789 
790    if (chased.is_ssa)
791       ntr_store_def(c, chased.ssa, src);
792    else {
793       struct ureg_dst dst = ntr_get_chased_dest(c, &chased);
794       ntr_MOV(c, dst, src);
795    }
796 }
797 
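/* TGSI scalar ops (RCP, RSQ, EX2, ...) replicate a single source channel to
 * all destination channels, so emit one instruction per enabled writemask
 * channel.  Roughly, an RCP with a .xy writemask becomes:
 *
 *    RCP dst.x, src.xxxx
 *    RCP dst.y, src.yyyy
 */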
798 static void
799 ntr_emit_scalar(struct ntr_compile *c, unsigned tgsi_op, struct ureg_dst dst, struct ureg_src src0,
800                 struct ureg_src src1)
801 {
802    unsigned i;
803 
804    /* POW is the only 2-operand scalar op. */
805    if (tgsi_op != TGSI_OPCODE_POW)
806       src1 = src0;
807 
808    for (i = 0; i < 4; i++) {
809       if (dst.WriteMask & (1 << i)) {
810          ntr_insn(c, tgsi_op, ureg_writemask(dst, 1 << i), ureg_scalar(src0, i),
811                   ureg_scalar(src1, i), ureg_src_undef(), ureg_src_undef());
812       }
813    }
814 }
815 
816 static void
817 ntr_emit_alu(struct ntr_compile *c, nir_alu_instr *instr)
818 {
819    struct ureg_src src[4];
820    struct ureg_dst dst;
821    unsigned i;
822    int num_srcs = nir_op_infos[instr->op].num_inputs;
823 
824    /* Don't try to translate folded fsat since their source won't be valid */
825    if (instr->op == nir_op_fsat && nir_legacy_fsat_folds(instr))
826       return;
827 
828    c->precise = instr->exact;
829 
830    assert(num_srcs <= ARRAY_SIZE(src));
831    for (i = 0; i < num_srcs; i++)
832       src[i] = ntr_get_alu_src(c, instr, i);
833    for (; i < ARRAY_SIZE(src); i++)
834       src[i] = ureg_src_undef();
835 
836    dst = ntr_get_alu_dest(c, &instr->def);
837 
838    static enum tgsi_opcode op_map[] = {
839       [nir_op_mov] = TGSI_OPCODE_MOV,
840 
841       [nir_op_fdot2_replicated] = TGSI_OPCODE_DP2,
842       [nir_op_fdot3_replicated] = TGSI_OPCODE_DP3,
843       [nir_op_fdot4_replicated] = TGSI_OPCODE_DP4,
844       [nir_op_ffloor] = TGSI_OPCODE_FLR,
845       [nir_op_ffract] = TGSI_OPCODE_FRC,
846       [nir_op_fceil] = TGSI_OPCODE_CEIL,
847       [nir_op_fround_even] = TGSI_OPCODE_ROUND,
848 
849       [nir_op_slt] = TGSI_OPCODE_SLT,
850       [nir_op_sge] = TGSI_OPCODE_SGE,
851       [nir_op_seq] = TGSI_OPCODE_SEQ,
852       [nir_op_sne] = TGSI_OPCODE_SNE,
853 
854       [nir_op_ftrunc] = TGSI_OPCODE_TRUNC,
855       [nir_op_fadd] = TGSI_OPCODE_ADD,
856       [nir_op_fmul] = TGSI_OPCODE_MUL,
857 
858       [nir_op_fmin] = TGSI_OPCODE_MIN,
859       [nir_op_fmax] = TGSI_OPCODE_MAX,
860       [nir_op_ffma] = TGSI_OPCODE_MAD,
861    };
862 
863    if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op] > 0) {
864       /* The normal path for NIR to TGSI ALU op translation */
865       ntr_insn(c, op_map[instr->op], dst, src[0], src[1], src[2], src[3]);
866    } else {
867       /* Special cases for NIR to TGSI ALU op translation. */
868 
869       /* TODO: Use something like the ntr_store() path for the MOV calls so we
870        * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
871        */
872 
873       switch (instr->op) {
874       case nir_op_fabs:
875          /* Try to eliminate */
876          if (!c->lower_fabs && nir_legacy_float_mod_folds(instr))
877             break;
878 
879          if (c->lower_fabs)
880             ntr_MAX(c, dst, src[0], ureg_negate(src[0]));
881          else
882             ntr_MOV(c, dst, ureg_abs(src[0]));
883          break;
884 
885       case nir_op_fsat:
886          ntr_MOV(c, ureg_saturate(dst), src[0]);
887          break;
888 
889       case nir_op_fneg:
890          /* Try to eliminate */
891          if (nir_legacy_float_mod_folds(instr))
892             break;
893 
894          ntr_MOV(c, dst, ureg_negate(src[0]));
895          break;
896 
897          /* NOTE: TGSI 32-bit math ops have the old "one source channel
898           * replicated to all dst channels" behavior, while 64 is normal mapping
899           * of src channels to dst.
900           */
901       case nir_op_frcp:
902          ntr_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], ureg_src_undef());
903          break;
904 
905       case nir_op_frsq:
906          ntr_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], ureg_src_undef());
907          break;
908 
909       case nir_op_fexp2:
910          ntr_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], ureg_src_undef());
911          break;
912 
913       case nir_op_flog2:
914          ntr_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], ureg_src_undef());
915          break;
916 
917       case nir_op_fsin:
918          ntr_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], ureg_src_undef());
919          break;
920 
921       case nir_op_fcos:
922          ntr_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], ureg_src_undef());
923          break;
924 
925       case nir_op_fsub:
926          ntr_ADD(c, dst, src[0], ureg_negate(src[1]));
927          break;
928 
929       case nir_op_fmod:
930          unreachable("should be handled by .lower_fmod = true");
931          break;
932 
933       case nir_op_fpow:
934          ntr_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
935          break;
936 
937       case nir_op_flrp:
938          ntr_LRP(c, dst, src[2], src[1], src[0]);
939          break;
940 
941       case nir_op_fcsel:
942          /* Implement this as CMP(-abs(src0), src1, src2). */
943          ntr_CMP(c, dst, ureg_negate(ureg_abs(src[0])), src[1], src[2]);
944          break;
945 
946       case nir_op_fcsel_gt:
947          ntr_CMP(c, dst, ureg_negate(src[0]), src[1], src[2]);
948          break;
949 
950       case nir_op_fcsel_ge:
951          /* Implement this as if !(src0 < 0.0) was identical to src0 >= 0.0. */
952          ntr_CMP(c, dst, src[0], src[2], src[1]);
953          break;
954 
955       case nir_op_vec4:
956       case nir_op_vec3:
957       case nir_op_vec2:
958          unreachable("covered by nir_lower_vec_to_movs()");
959 
960       default:
961          fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
962          unreachable("Unknown NIR opcode");
963       }
964    }
965 
966    c->precise = false;
967 }
968 
969 static struct ureg_src
970 ntr_ureg_src_indirect(struct ntr_compile *c, struct ureg_src usrc, nir_src src, int addr_reg)
971 {
972    if (nir_src_is_const(src)) {
973       usrc.Index += ntr_src_as_uint(c, src);
974       return usrc;
975    } else {
976       return ureg_src_indirect(usrc, ntr_reladdr(c, ntr_get_src(c, src), addr_reg));
977    }
978 }
979 
980 static struct ureg_dst
981 ntr_ureg_dst_indirect(struct ntr_compile *c, struct ureg_dst dst, nir_src src)
982 {
983    if (nir_src_is_const(src)) {
984       dst.Index += ntr_src_as_uint(c, src);
985       return dst;
986    } else {
987       return ureg_dst_indirect(dst, ntr_reladdr(c, ntr_get_src(c, src), 0));
988    }
989 }
990 
991 static struct ureg_dst
992 ntr_ureg_dst_dimension_indirect(struct ntr_compile *c, struct ureg_dst udst, nir_src src)
993 {
994    if (nir_src_is_const(src)) {
995       return ureg_dst_dimension(udst, ntr_src_as_uint(c, src));
996    } else {
997       return ureg_dst_dimension_indirect(udst, ntr_reladdr(c, ntr_get_src(c, src), 1), 0);
998    }
999 }
1000 /* Some load operations in NIR will have a fractional offset that we need to
1001  * swizzle down before storing to the result register.
1002  */
1003 static struct ureg_src
1004 ntr_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
1005 {
1006    return ureg_swizzle(src, frac, frac + MIN2(num_components - 1, 1),
1007                        frac + MIN2(num_components - 1, 2), frac + MIN2(num_components - 1, 3));
1008 }
1009 
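/* Translates load_ubo/load_ubo_vec4 into a constant-file reference: the
 * buffer index goes in the 2D dimension (possibly indirect through an address
 * register), and the vec4 offset goes in the Index.
 */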
1010 static void
1011 ntr_emit_load_ubo(struct ntr_compile *c, nir_intrinsic_instr *instr)
1012 {
1013    struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);
1014 
1015    struct ureg_dst addr_temp = ureg_dst_undef();
1016 
1017    if (nir_src_is_const(instr->src[0])) {
1018       src = ureg_src_dimension(src, ntr_src_as_uint(c, instr->src[0]));
1019    } else {
1020       /* virglrenderer requires that indirect UBO references have the UBO
1021        * array's base index in the Index field, not added to the indirect
1022        * address.
1023        *
1024        * Many nir intrinsics have a base address const value for the start of
1025        * their array indirection, but load_ubo doesn't.  We fake it by
1026        * subtracting it off here.
1027        */
1028       addr_temp = ntr_temp(c);
1029       ntr_UADD(c, addr_temp, ntr_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo));
1030       src = ureg_src_dimension_indirect(src, ntr_reladdr(c, ureg_src(addr_temp), 1), c->first_ubo);
1031    }
1032 
1033    /* !pipe_caps.load_constbuf: Just emit it as a vec4 reference to the const
1034     * file.
1035     */
1036    src.Index = nir_intrinsic_base(instr);
1037 
1038    if (nir_src_is_const(instr->src[1])) {
1039       src.Index += ntr_src_as_uint(c, instr->src[1]);
1040    } else {
1041       src = ureg_src_indirect(src, ntr_reladdr(c, ntr_get_src(c, instr->src[1]), 0));
1042    }
1043 
1044    int start_component = nir_intrinsic_component(instr);
1045 
1046    src = ntr_shift_by_frac(src, start_component, instr->num_components);
1047 
1048    ntr_store(c, &instr->def, src);
1049 }
1050 
1051 static void
1052 ntr_emit_load_input(struct ntr_compile *c, nir_intrinsic_instr *instr)
1053 {
1054    uint32_t frac = nir_intrinsic_component(instr);
1055    uint32_t num_components = instr->num_components;
1056    unsigned base = nir_intrinsic_base(instr);
1057    struct ureg_src input;
1058    nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
1059 
1060    if (c->s->info.stage == MESA_SHADER_VERTEX) {
1061       input = ureg_DECL_vs_input(c->ureg, base);
1062       for (int i = 1; i < semantics.num_slots; i++)
1063          ureg_DECL_vs_input(c->ureg, base + i);
1064    } else {
1065       input = c->input_index_map[base];
1066    }
1067 
1068    input = ntr_shift_by_frac(input, frac, num_components);
1069 
1070    switch (instr->intrinsic) {
1071    case nir_intrinsic_load_input:
1072       input = ntr_ureg_src_indirect(c, input, instr->src[0], 0);
1073       ntr_store(c, &instr->def, input);
1074       break;
1075 
1076    case nir_intrinsic_load_interpolated_input: {
1077       input = ntr_ureg_src_indirect(c, input, instr->src[1], 0);
1078 
1079       nir_intrinsic_instr *bary_instr = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
1080 
1081       switch (bary_instr->intrinsic) {
1082       case nir_intrinsic_load_barycentric_pixel:
1083       case nir_intrinsic_load_barycentric_sample:
1084          /* For these, we know that the barycentric load matches the
1085           * interpolation on the input declaration, so we can use it directly.
1086           */
1087          ntr_store(c, &instr->def, input);
1088          break;
1089 
1090       case nir_intrinsic_load_barycentric_centroid:
1091          /* If the input was declared centroid, then there's no need to
1092           * emit the extra TGSI interp instruction, we can just read the
1093           * input.
1094           */
1095          if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) {
1096             ntr_store(c, &instr->def, input);
1097          } else {
1098             ntr_INTERP_CENTROID(c, ntr_get_dest(c, &instr->def), input);
1099          }
1100          break;
1101 
1102       case nir_intrinsic_load_barycentric_at_sample:
1103          /* We stored the sample in the fake "bary" dest. */
1104          ntr_INTERP_SAMPLE(c, ntr_get_dest(c, &instr->def), input, ntr_get_src(c, instr->src[0]));
1105          break;
1106 
1107       case nir_intrinsic_load_barycentric_at_offset:
1108          /* We stored the offset in the fake "bary" dest. */
1109          ntr_INTERP_OFFSET(c, ntr_get_dest(c, &instr->def), input, ntr_get_src(c, instr->src[0]));
1110          break;
1111 
1112       default:
1113          unreachable("bad barycentric interp intrinsic\n");
1114       }
1115       break;
1116    }
1117 
1118    default:
1119       unreachable("bad load input intrinsic\n");
1120    }
1121 }
1122 
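/* Moves the value into its declared TGSI output, swizzled for the component
 * offset, unless the producing instruction already wrote the output file
 * directly.
 */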
1123 static void
1124 ntr_emit_store_output(struct ntr_compile *c, nir_intrinsic_instr *instr)
1125 {
1126    struct ureg_src src = ntr_get_src(c, instr->src[0]);
1127 
1128    if (src.File == TGSI_FILE_OUTPUT) {
1129       /* If our src is the output file, that's an indication that we were able
1130        * to emit the output stores in the generating instructions and we have
1131        * nothing to do here.
1132        */
1133       return;
1134    }
1135 
1136    uint32_t frac;
1137    struct ureg_dst out = ntr_output_decl(c, instr, &frac);
1138 
1139    if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
1140       out = ntr_ureg_dst_indirect(c, out, instr->src[2]);
1141       out = ntr_ureg_dst_dimension_indirect(c, out, instr->src[1]);
1142    } else {
1143       out = ntr_ureg_dst_indirect(c, out, instr->src[1]);
1144    }
1145 
1146    uint8_t swizzle[4] = {0, 0, 0, 0};
1147    for (int i = frac; i < 4; i++) {
1148       if (out.WriteMask & (1 << i))
1149          swizzle[i] = i - frac;
1150    }
1151 
1152    src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1153 
1154    ntr_MOV(c, out, src);
1155 }
1156 
1157 static void
1158 ntr_emit_load_output(struct ntr_compile *c, nir_intrinsic_instr *instr)
1159 {
1160    nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
1161 
1162    /* ntr_try_store_in_tgsi_output() optimization is not valid if normal
1163     * load_output is present.
1164     */
1165    assert(c->s->info.stage != MESA_SHADER_VERTEX &&
1166           (c->s->info.stage != MESA_SHADER_FRAGMENT || semantics.fb_fetch_output));
1167 
1168    uint32_t frac;
1169    struct ureg_dst out = ntr_output_decl(c, instr, &frac);
1170 
1171    if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) {
1172       out = ntr_ureg_dst_indirect(c, out, instr->src[1]);
1173       out = ntr_ureg_dst_dimension_indirect(c, out, instr->src[0]);
1174    } else {
1175       out = ntr_ureg_dst_indirect(c, out, instr->src[0]);
1176    }
1177 
1178    struct ureg_dst dst = ntr_get_dest(c, &instr->def);
1179    struct ureg_src out_src = ureg_src(out);
1180 
1181    /* Don't swizzle in unavailable channels of the output for the writemasked-out
1182     * components. Avoids compile failures in virglrenderer with
1183     * TESS_LEVEL_INNER.
1184     */
1185    int fill_channel = ffs(dst.WriteMask) - 1;
1186    uint8_t swizzles[4] = {0, 1, 2, 3};
1187    for (int i = 0; i < 4; i++)
1188       if (!(dst.WriteMask & (1 << i)))
1189          swizzles[i] = fill_channel;
1190    out_src = ureg_swizzle(out_src, swizzles[0], swizzles[1], swizzles[2], swizzles[3]);
1191 
1192    if (semantics.fb_fetch_output)
1193       ntr_FBFETCH(c, dst, out_src);
1194    else
1195       ntr_MOV(c, dst, out_src);
1196 }
1197 
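/* Declares the TGSI system value for the NIR sysval intrinsic and stores it,
 * referencing only the channels the def actually has.
 */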
1198 static void
1199 ntr_emit_load_sysval(struct ntr_compile *c, nir_intrinsic_instr *instr)
1200 {
1201    gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
1202    enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
1203    struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0);
1204 
1205    /* virglrenderer doesn't like references to channels of the sysval that
1206     * aren't defined, even if they aren't really read.  (GLSL compile fails on
1207     * gl_NumWorkGroups.w, for example).
1208     */
1209    uint32_t write_mask = BITSET_MASK(instr->def.num_components);
1210    sv = ntr_swizzle_for_write_mask(sv, write_mask);
1211 
1212    /* TGSI and NIR define these intrinsics as always loading ints, but they can
1213     * still appear on hardware with non-native-integers fragment shaders using
1214     * the draw path (i915g).  In that case, having called nir_lower_int_to_float
1215     * means that we actually want floats instead.
1216     */
1217    switch (instr->intrinsic) {
1218    case nir_intrinsic_load_vertex_id:
1219    case nir_intrinsic_load_instance_id:
1220       ntr_U2F(c, ntr_get_dest(c, &instr->def), sv);
1221       return;
1222 
1223    default:
1224       break;
1225    }
1226 
1227    ntr_store(c, &instr->def, sv);
1228 }
1229 
1230 static void
1231 ntr_emit_intrinsic(struct ntr_compile *c, nir_intrinsic_instr *instr)
1232 {
1233    switch (instr->intrinsic) {
1234    case nir_intrinsic_load_ubo:
1235    case nir_intrinsic_load_ubo_vec4:
1236       ntr_emit_load_ubo(c, instr);
1237       break;
1238 
1239       /* Vertex */
1240    case nir_intrinsic_load_draw_id:
1241    case nir_intrinsic_load_invocation_id:
1242    case nir_intrinsic_load_frag_coord:
1243    case nir_intrinsic_load_point_coord:
1244    case nir_intrinsic_load_front_face:
1245       ntr_emit_load_sysval(c, instr);
1246       break;
1247 
1248    case nir_intrinsic_load_input:
1249    case nir_intrinsic_load_per_vertex_input:
1250    case nir_intrinsic_load_interpolated_input:
1251       ntr_emit_load_input(c, instr);
1252       break;
1253 
1254    case nir_intrinsic_store_output:
1255    case nir_intrinsic_store_per_vertex_output:
1256       ntr_emit_store_output(c, instr);
1257       break;
1258 
1259    case nir_intrinsic_load_output:
1260    case nir_intrinsic_load_per_vertex_output:
1261       ntr_emit_load_output(c, instr);
1262       break;
1263 
1264    case nir_intrinsic_terminate:
1265       ntr_KILL(c);
1266       break;
1267 
1268    case nir_intrinsic_terminate_if: {
1269       struct ureg_src cond = ureg_scalar(ntr_get_src(c, instr->src[0]), 0);
1270       /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
1271       ntr_KILL_IF(c, ureg_negate(cond));
1272       break;
1273    }
1274       /* In TGSI we don't actually generate the barycentric coords, and emit
1275        * interp intrinsics later.  However, we do need to store the
1276        * load_barycentric_at_* argument so that we can use it at that point.
1277        */
1278    case nir_intrinsic_load_barycentric_pixel:
1279    case nir_intrinsic_load_barycentric_centroid:
1280    case nir_intrinsic_load_barycentric_sample:
1281       break;
1282    case nir_intrinsic_load_barycentric_at_sample:
1283    case nir_intrinsic_load_barycentric_at_offset:
1284       ntr_store(c, &instr->def, ntr_get_src(c, instr->src[0]));
1285       break;
1286 
1287    case nir_intrinsic_ddx:
1288    case nir_intrinsic_ddx_coarse:
1289       ntr_DDX(c, ntr_get_dest(c, &instr->def), ntr_get_src(c, instr->src[0]));
1290       return;
1291    case nir_intrinsic_ddy:
1292    case nir_intrinsic_ddy_coarse:
1293       ntr_DDY(c, ntr_get_dest(c, &instr->def), ntr_get_src(c, instr->src[0]));
1294       return;
1295 
1296    case nir_intrinsic_decl_reg:
1297    case nir_intrinsic_load_reg:
1298    case nir_intrinsic_load_reg_indirect:
1299    case nir_intrinsic_store_reg:
1300    case nir_intrinsic_store_reg_indirect:
1301       /* fully consumed */
1302       break;
1303 
1304    default:
1305       fprintf(stderr, "Unknown intrinsic: ");
1306       nir_print_instr(&instr->instr, stderr);
1307       fprintf(stderr, "\n");
1308       break;
1309    }
1310 }
1311 
1312 struct ntr_tex_operand_state {
1313    struct ureg_src srcs[4];
1314    unsigned i;
1315 };
1316 
1317 static void
1318 ntr_push_tex_arg(struct ntr_compile *c, nir_tex_instr *instr, nir_tex_src_type tex_src_type,
1319                  struct ntr_tex_operand_state *s)
1320 {
1321    int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
1322    if (tex_src < 0)
1323       return;
1324 
1325    nir_src *src = &instr->src[tex_src].src;
1326    s->srcs[s->i++] = ntr_get_src(c, *src);
1327 }
1328 
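/* Maps a nir_tex_instr onto a TGSI texture opcode, packing the coordinate,
 * LOD/bias/derivative arguments and the sampler into the up-to-four TGSI
 * sources, and recording the texture target, return type and offsets on the
 * ntr_insn.
 */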
1329 static void
1330 ntr_emit_texture(struct ntr_compile *c, nir_tex_instr *instr)
1331 {
1332    struct ureg_dst dst = ntr_get_dest(c, &instr->def);
1333    assert(!instr->is_shadow);
1334    enum tgsi_texture_type target =
1335       tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array);
1336    unsigned tex_opcode;
1337 
1338    int tex_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_texture_handle);
1339    int sampler_handle_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_handle);
1340 
1341    struct ureg_src sampler;
1342    if (tex_handle_src >= 0 && sampler_handle_src >= 0) {
1343       /* It seems we can't get separate tex/sampler on GL, just use one of the handles */
1344       sampler = ntr_get_src(c, instr->src[tex_handle_src].src);
1345       assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
1346    } else {
1347       assert(tex_handle_src == -1 && sampler_handle_src == -1);
1348       sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
1349       int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
1350       if (sampler_src >= 0) {
1351          struct ureg_src reladdr = ntr_get_src(c, instr->src[sampler_src].src);
1352          sampler = ureg_src_indirect(sampler, ntr_reladdr(c, reladdr, 2));
1353       }
1354    }
1355 
1356    switch (instr->op) {
1357    case nir_texop_tex:
1358       if (nir_tex_instr_src_size(instr, nir_tex_instr_src_index(instr, nir_tex_src_backend1)) >
1359           MAX2(instr->coord_components, 2) + instr->is_shadow)
1360          tex_opcode = TGSI_OPCODE_TXP;
1361       else
1362          tex_opcode = TGSI_OPCODE_TEX;
1363       break;
1364    case nir_texop_txl:
1365       tex_opcode = TGSI_OPCODE_TXL;
1366       break;
1367    case nir_texop_txb:
1368       tex_opcode = TGSI_OPCODE_TXB;
1369       break;
1370    case nir_texop_txd:
1371       tex_opcode = TGSI_OPCODE_TXD;
1372       break;
1373    case nir_texop_txs:
1374       tex_opcode = TGSI_OPCODE_TXQ;
1375       break;
1376    case nir_texop_tg4:
1377       tex_opcode = TGSI_OPCODE_TG4;
1378       break;
1379    case nir_texop_query_levels:
1380       tex_opcode = TGSI_OPCODE_TXQ;
1381       break;
1382    case nir_texop_lod:
1383       tex_opcode = TGSI_OPCODE_LODQ;
1384       break;
1385    case nir_texop_texture_samples:
1386       tex_opcode = TGSI_OPCODE_TXQS;
1387       break;
1388    default:
1389       unreachable("unsupported tex op");
1390    }
1391 
1392    struct ntr_tex_operand_state s = {.i = 0};
1393    ntr_push_tex_arg(c, instr, nir_tex_src_backend1, &s);
1394    ntr_push_tex_arg(c, instr, nir_tex_src_backend2, &s);
1395 
1396    /* non-coord arg for TXQ */
1397    if (tex_opcode == TGSI_OPCODE_TXQ) {
1398       ntr_push_tex_arg(c, instr, nir_tex_src_lod, &s);
1399       /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's
1400        * scalar
1401        */
1402       s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0);
1403    }
1404 
1405    if (s.i > 1) {
1406       if (tex_opcode == TGSI_OPCODE_TEX)
1407          tex_opcode = TGSI_OPCODE_TEX2;
1408       if (tex_opcode == TGSI_OPCODE_TXB)
1409          tex_opcode = TGSI_OPCODE_TXB2;
1410       if (tex_opcode == TGSI_OPCODE_TXL)
1411          tex_opcode = TGSI_OPCODE_TXL2;
1412    }
1413 
1414    if (instr->op == nir_texop_txd) {
1415       /* Derivs appear in their own src args */
1416       int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
1417       int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
1418       s.srcs[s.i++] = ntr_get_src(c, instr->src[ddx].src);
1419       s.srcs[s.i++] = ntr_get_src(c, instr->src[ddy].src);
1420    }
1421 
1422    if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
1423       if (c->screen->caps.tgsi_tg4_component_in_swizzle) {
1424          sampler = ureg_scalar(sampler, instr->component);
1425          s.srcs[s.i++] = ureg_src_undef();
1426       } else {
1427          s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
1428       }
1429    }
1430 
1431    s.srcs[s.i++] = sampler;
1432 
1433    enum tgsi_return_type tex_type;
1434    switch (instr->dest_type) {
1435    case nir_type_float32:
1436       tex_type = TGSI_RETURN_TYPE_FLOAT;
1437       break;
1438    case nir_type_int32:
1439       tex_type = TGSI_RETURN_TYPE_SINT;
1440       break;
1441    case nir_type_uint32:
1442       tex_type = TGSI_RETURN_TYPE_UINT;
1443       break;
1444    default:
1445       unreachable("unknown texture type");
1446    }
1447 
1448    struct ureg_dst tex_dst;
1449    if (instr->op == nir_texop_query_levels)
1450       tex_dst = ureg_writemask(ntr_temp(c), TGSI_WRITEMASK_W);
1451    else
1452       tex_dst = dst;
1453 
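        /* ntr_insn() always takes four srcs; pad the unused slots with undefs. */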
1454    while (s.i < 4)
1455       s.srcs[s.i++] = ureg_src_undef();
1456 
1457    struct ntr_insn *insn =
1458       ntr_insn(c, tex_opcode, tex_dst, s.srcs[0], s.srcs[1], s.srcs[2], s.srcs[3]);
1459    insn->tex_target = target;
1460    insn->tex_return_type = tex_type;
1461    insn->is_tex = true;
1462 
1463    int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
1464    if (tex_offset_src >= 0) {
1465       struct ureg_src offset = ntr_get_src(c, instr->src[tex_offset_src].src);
1466 
1467       insn->tex_offset[0].File = offset.File;
1468       insn->tex_offset[0].Index = offset.Index;
1469       insn->tex_offset[0].SwizzleX = offset.SwizzleX;
1470       insn->tex_offset[0].SwizzleY = offset.SwizzleY;
1471       insn->tex_offset[0].SwizzleZ = offset.SwizzleZ;
1472       insn->tex_offset[0].Padding = 0;
1473    }
1474 
1475    if (nir_tex_instr_has_explicit_tg4_offsets(instr)) {
1476       for (uint8_t i = 0; i < 4; ++i) {
1477          struct ureg_src imm =
1478             ureg_imm2i(c->ureg, instr->tg4_offsets[i][0], instr->tg4_offsets[i][1]);
1479          insn->tex_offset[i].File = imm.File;
1480          insn->tex_offset[i].Index = imm.Index;
1481          insn->tex_offset[i].SwizzleX = imm.SwizzleX;
1482          insn->tex_offset[i].SwizzleY = imm.SwizzleY;
1483          insn->tex_offset[i].SwizzleZ = imm.SwizzleZ;
1484       }
1485    }
1486 
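        /* For query_levels, the TXQ above wrote only .w of the temp (the level
         * count); broadcast it into the real destination.
         */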
1487    if (instr->op == nir_texop_query_levels)
1488       ntr_MOV(c, dst, ureg_scalar(ureg_src(tex_dst), 3));
1489 }
1490 
1491 static void
1492 ntr_emit_jump(struct ntr_compile *c, nir_jump_instr *jump)
1493 {
1494    switch (jump->type) {
1495    case nir_jump_break:
1496       ntr_BRK(c);
1497       break;
1498 
1499    case nir_jump_continue:
1500       ntr_CONT(c);
1501       break;
1502 
1503    default:
1504       fprintf(stderr, "Unknown jump instruction: ");
1505       nir_print_instr(&jump->instr, stderr);
1506       fprintf(stderr, "\n");
1507       abort();
1508    }
1509 }
1510 
1511 static void
1512 ntr_emit_ssa_undef(struct ntr_compile *c, nir_undef_instr *instr)
1513 {
1514    /* Nothing to do but make sure that we have some storage to deref. */
1515    (void)ntr_get_ssa_def_decl(c, &instr->def);
1516 }
1517 
1518 static void
1519 ntr_emit_instr(struct ntr_compile *c, nir_instr *instr)
1520 {
1521    switch (instr->type) {
1522    case nir_instr_type_deref:
1523       /* ignored, will be walked by nir_intrinsic_image_*_deref. */
1524       break;
1525 
1526    case nir_instr_type_alu:
1527       ntr_emit_alu(c, nir_instr_as_alu(instr));
1528       break;
1529 
1530    case nir_instr_type_intrinsic:
1531       ntr_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
1532       break;
1533 
1534    case nir_instr_type_load_const:
1535       /* Nothing to do here, as load consts are done directly from
1536        * ntr_get_src() (since many constant NIR srcs will often get folded
1537        * directly into a register file index instead of as a TGSI src).
1538        */
1539       break;
1540 
1541    case nir_instr_type_tex:
1542       ntr_emit_texture(c, nir_instr_as_tex(instr));
1543       break;
1544 
1545    case nir_instr_type_jump:
1546       ntr_emit_jump(c, nir_instr_as_jump(instr));
1547       break;
1548 
1549    case nir_instr_type_undef:
1550       ntr_emit_ssa_undef(c, nir_instr_as_undef(instr));
1551       break;
1552 
1553    default:
1554       fprintf(stderr, "Unknown NIR instr type: ");
1555       nir_print_instr(instr, stderr);
1556       fprintf(stderr, "\n");
1557       abort();
1558    }
1559 }
1560 
1561 static void
1562 ntr_emit_if(struct ntr_compile *c, nir_if *if_stmt)
1563 {
1564    ntr_IF(c, c->if_cond);
1565 
1566    ntr_emit_cf_list(c, &if_stmt->then_list);
1567 
1568    if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
1569       ntr_ELSE(c);
1570       ntr_emit_cf_list(c, &if_stmt->else_list);
1571    }
1572 
1573    ntr_ENDIF(c);
1574 }
1575 
1576 static void
1577 ntr_emit_loop(struct ntr_compile *c, nir_loop *loop)
1578 {
1579    assert(!nir_loop_has_continue_construct(loop));
1580    ntr_BGNLOOP(c);
1581    ntr_emit_cf_list(c, &loop->body);
1582    ntr_ENDLOOP(c);
1583 }
1584 
1585 static void
1586 ntr_emit_block(struct ntr_compile *c, nir_block *block)
1587 {
1588    struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
1589    c->cur_block = ntr_block;
1590 
1591    nir_foreach_instr (instr, block) {
1592       ntr_emit_instr(c, instr);
1593 
1594       /* Sanity check that we didn't accidentally ureg_OPCODE() instead of ntr_OPCODE(). */
1595       if (ureg_get_instruction_number(c->ureg) != 0) {
1596          fprintf(stderr, "Emitted ureg insn during: ");
1597          nir_print_instr(instr, stderr);
1598          fprintf(stderr, "\n");
1599          unreachable("emitted ureg insn");
1600       }
1601    }
1602 
1603    /* Set up the if condition for ntr_emit_if(), which we have to do before
1604     * freeing up the temps (the "if" is treated as inside the block for liveness
1605     * purposes, despite not being an instruction)
1606     *
1607     * Note that, while IF and UIF are supposed to look at only .x, virglrenderer
1608     * looks at all of .xyzw.  No harm in working around the bug.
1609     */
1610    nir_if *nif = nir_block_get_following_if(block);
1611    if (nif)
1612       c->if_cond = ureg_scalar(ntr_get_src(c, nif->condition), TGSI_SWIZZLE_X);
1613 }
1614 
1615 static void
1616 ntr_emit_cf_list(struct ntr_compile *c, struct exec_list *list)
1617 {
1618    foreach_list_typed (nir_cf_node, node, node, list) {
1619       switch (node->type) {
1620       case nir_cf_node_block:
1621          ntr_emit_block(c, nir_cf_node_as_block(node));
1622          break;
1623 
1624       case nir_cf_node_if:
1625          ntr_emit_if(c, nir_cf_node_as_if(node));
1626          break;
1627 
1628       case nir_cf_node_loop:
1629          ntr_emit_loop(c, nir_cf_node_as_loop(node));
1630          break;
1631 
1632       default:
1633          unreachable("unknown CF type");
1634       }
1635    }
1636 }
1637 
1638 static void
1639 ntr_emit_block_ureg(struct ntr_compile *c, struct nir_block *block)
1640 {
1641    struct ntr_block *ntr_block = ntr_block_from_nir(c, block);
1642 
1643    /* Emit the ntr insns to tgsi_ureg. */
1644    util_dynarray_foreach (&ntr_block->insns, struct ntr_insn, insn) {
1645       const struct tgsi_opcode_info *opcode_info = tgsi_get_opcode_info(insn->opcode);
1646 
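           /* Control-flow opcodes go through their dedicated ureg helpers so that
            * their jump labels can be patched to real instruction numbers.
            */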
1647       switch (insn->opcode) {
1648       case TGSI_OPCODE_IF:
1649          ureg_IF(c->ureg, insn->src[0], &c->cf_label);
1650          break;
1651 
1652       case TGSI_OPCODE_ELSE:
1653          ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
1654          ureg_ELSE(c->ureg, &c->cf_label);
1655          c->current_if_else = c->cf_label;
1656          break;
1657 
1658       case TGSI_OPCODE_ENDIF:
1659          ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
1660          ureg_ENDIF(c->ureg);
1661          break;
1662 
1663       case TGSI_OPCODE_BGNLOOP:
1664          /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
1665           * does reference BGNLOOPs.  Follow the former behavior unless something comes up
1666           * with a need.
1667           */
1668          ureg_BGNLOOP(c->ureg, &c->cf_label);
1669          break;
1670 
1671       case TGSI_OPCODE_ENDLOOP:
1672          ureg_ENDLOOP(c->ureg, &c->cf_label);
1673          break;
1674 
1675       default:
1676          if (insn->is_tex) {
1677             int num_offsets = 0;
1678             for (int i = 0; i < ARRAY_SIZE(insn->tex_offset); i++) {
1679                if (insn->tex_offset[i].File != TGSI_FILE_NULL)
1680                   num_offsets = i + 1;
1681             }
1682             ureg_tex_insn(c->ureg, insn->opcode, insn->dst, opcode_info->num_dst, insn->tex_target,
1683                           insn->tex_return_type, insn->tex_offset, num_offsets, insn->src,
1684                           opcode_info->num_src);
1685          } else {
1686             ureg_insn(c->ureg, insn->opcode, insn->dst, opcode_info->num_dst, insn->src,
1687                       opcode_info->num_src, insn->precise);
1688          }
1689       }
1690    }
1691 }
1692 
1693 static void
1694 ntr_emit_if_ureg(struct ntr_compile *c, nir_if *if_stmt)
1695 {
1696    /* Note: the last block emitted our IF opcode. */
1697 
1698    int if_stack = c->current_if_else;
1699    c->current_if_else = c->cf_label;
1700 
1701    /* Either the then or else block includes the ENDIF, which will fix up the
1702     * IF(/ELSE)'s label for jumping
1703     */
1704    ntr_emit_cf_list_ureg(c, &if_stmt->then_list);
1705    ntr_emit_cf_list_ureg(c, &if_stmt->else_list);
1706 
1707    c->current_if_else = if_stack;
1708 }
1709 
1710 static void
1711 ntr_emit_cf_list_ureg(struct ntr_compile *c, struct exec_list *list)
1712 {
1713    foreach_list_typed (nir_cf_node, node, node, list) {
1714       switch (node->type) {
1715       case nir_cf_node_block:
1716          ntr_emit_block_ureg(c, nir_cf_node_as_block(node));
1717          break;
1718 
1719       case nir_cf_node_if:
1720          ntr_emit_if_ureg(c, nir_cf_node_as_if(node));
1721          break;
1722 
1723       case nir_cf_node_loop:
1724          /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
1725           * does reference BGNLOOPs.  Follow the former behavior unless something comes up
1726           * with a need.
1727           */
1728          ntr_emit_cf_list_ureg(c, &nir_cf_node_as_loop(node)->body);
1729          break;
1730 
1731       default:
1732          unreachable("unknown CF type");
1733       }
1734    }
1735 }
1736 
1737 static void
1738 ntr_emit_impl(struct ntr_compile *c, nir_function_impl *impl)
1739 {
1740    c->impl = impl;
1741 
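        /* Per-impl maps from NIR SSA defs and registers to their TGSI temporaries. */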
1742    c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc);
1743    c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->ssa_alloc);
1744 
1745    /* Set up the struct ntr_blocks to put insns in */
1746    c->blocks = _mesa_pointer_hash_table_create(c);
1747    nir_foreach_block (block, impl) {
1748       struct ntr_block *ntr_block = rzalloc(c->blocks, struct ntr_block);
1749       util_dynarray_init(&ntr_block->insns, ntr_block);
1750       _mesa_hash_table_insert(c->blocks, block, ntr_block);
1751    }
1752 
1753    ntr_setup_registers(c);
1754 
1755    c->cur_block = ntr_block_from_nir(c, nir_start_block(impl));
1756    ntr_setup_inputs(c);
1757    ntr_setup_outputs(c);
1758    ntr_setup_uniforms(c);
1759 
1760    /* Emit the ntr insns */
1761    ntr_emit_cf_list(c, &impl->body);
1762 
1763    ntr_allocate_regs_unoptimized(c, impl);
1764 
1765    /* Turn the ntr insns into actual TGSI tokens */
1766    ntr_emit_cf_list_ureg(c, &impl->body);
1767 
1768    ralloc_free(c->liveness);
1769    c->liveness = NULL;
1770 }
1771 
1772 static int
1773 type_size(const struct glsl_type *type, bool bindless)
1774 {
1775    return glsl_count_attribute_slots(type, false);
1776 }
1777 
1778 /* Allow vectorizing of ALU instructions.
1779  */
1780 static uint8_t
1781 ntr_should_vectorize_instr(const nir_instr *instr, const void *data)
1782 {
1783    if (instr->type != nir_instr_type_alu)
1784       return 0;
1785 
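        /* TGSI operates on vec4s, so allow vectorizing ALU ops up to 4 components. */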
1786    return 4;
1787 }
1788 
1789 static bool
1790 ntr_should_vectorize_io(unsigned align, unsigned bit_size, unsigned num_components,
1791                         unsigned high_offset, nir_intrinsic_instr *low, nir_intrinsic_instr *high,
1792                         void *data)
1793 {
1794    if (bit_size != 32)
1795       return false;
1796 
1797    /* Our offset alignment should always be at least 4 bytes */
1798    if (align < 4)
1799       return false;
1800 
1801    /* No wrapping off the end of a TGSI reg.  We could do a bit better by
1802     * looking at low's actual offset.  XXX: With LOAD_CONSTBUF maybe we don't
1803     * need this restriction.
1804     */
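        /* e.g. a 4-byte-aligned load could start as high as .w, while an
         * 8-byte-aligned one starts at .x or .z at worst.
         */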
1805    unsigned worst_start_component = align == 4 ? 3 : align / 4;
1806    if (worst_start_component + num_components > 4)
1807       return false;
1808 
1809    return true;
1810 }
1811 
1812 static nir_variable_mode
1813 ntr_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
1814 {
1815    unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
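        /* Shader in/out indirects are always lowered since ins/outs are declared
         * as individual elements; temporaries are lowered too when the hardware
         * can't index the temporary file.
         */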
1816    unsigned indirect_mask = nir_var_shader_in | nir_var_shader_out;
1817 
1818    if (!screen->get_shader_param(screen, pipe_stage, PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
1819       indirect_mask |= nir_var_function_temp;
1820    }
1821 
1822    return indirect_mask;
1823 }
1824 
1825 struct ntr_lower_tex_state {
1826    nir_scalar channels[8];
1827    unsigned i;
1828 };
1829 
1830 static void
1831 nir_to_rc_lower_tex_instr_arg(nir_builder *b, nir_tex_instr *instr, nir_tex_src_type tex_src_type,
1832                               struct ntr_lower_tex_state *s)
1833 {
1834    int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
1835    if (tex_src < 0)
1836       return;
1837 
1838    nir_def *def = instr->src[tex_src].src.ssa;
1839    for (int i = 0; i < def->num_components; i++) {
1840       s->channels[s->i++] = nir_get_scalar(def, i);
1841    }
1842 
1843    nir_tex_instr_remove_src(instr, tex_src);
1844 }
1845 
1846 /**
1847  * Merges together a vec4 of tex coordinate/compare/bias/lod into a backend tex
1848  * src.  This lets NIR handle the coalescing of the vec4 rather than trying to
1849  * manage it on our own, and may lead to more vectorization.
1850  */
1851 static bool
1852 nir_to_rc_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data)
1853 {
1854    if (instr->type != nir_instr_type_tex)
1855       return false;
1856 
1857    nir_tex_instr *tex = nir_instr_as_tex(instr);
1858 
1859    if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0)
1860       return false;
1861 
1862    b->cursor = nir_before_instr(instr);
1863 
1864    struct ntr_lower_tex_state s = {0};
1865 
1866    nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_coord, &s);
1867    /* We always have at least two slots for the coordinate, even on 1D. */
1868    s.i = MAX2(s.i, 2);
1869 
1870    nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_comparator, &s);
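        /* TGSI's TEX variants expect bias/LOD/projector in .w, so skip ahead to
         * the fourth channel even when there is no comparator.
         */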
1871    s.i = MAX2(s.i, 3);
1872 
1873    nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_bias, &s);
1874 
1875    /* XXX: LZ */
1876    nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s);
1877    nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s);
1878    nir_to_rc_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s);
1879 
1880    /* No need to pack undefs in unused channels of the tex instr */
1881    while (!s.channels[s.i - 1].def)
1882       s.i--;
1883 
1884    /* Instead of putting undefs in the unused slots of the vecs, just put in
1885     * another used channel.  Otherwise, we'll get unnecessary moves into
1886     * registers.
1887     */
1888    assert(s.channels[0].def != NULL);
1889    for (int i = 1; i < s.i; i++) {
1890       if (!s.channels[i].def)
1891          s.channels[i] = s.channels[0];
1892    }
1893 
1894    nir_tex_instr_add_src(tex, nir_tex_src_backend1, nir_vec_scalars(b, s.channels, MIN2(s.i, 4)));
1895    if (s.i > 4)
1896       nir_tex_instr_add_src(tex, nir_tex_src_backend2, nir_vec_scalars(b, &s.channels[4], s.i - 4));
1897 
1898    return true;
1899 }
1900 
1901 static bool
1902 nir_to_rc_lower_tex(nir_shader *s)
1903 {
1904    return nir_shader_instructions_pass(s, nir_to_rc_lower_tex_instr, nir_metadata_control_flow,
1905                                        NULL);
1906 }
1907 
1908 /* Lowers texture projectors if we can't do them as TGSI_OPCODE_TXP. */
1909 static void
1910 nir_to_rc_lower_txp(nir_shader *s)
1911 {
1912    nir_lower_tex_options lower_tex_options = {
1913       .lower_txp = 0,
1914    };
1915 
1916    nir_foreach_block (block, nir_shader_get_entrypoint(s)) {
1917       nir_foreach_instr (instr, block) {
1918          if (instr->type != nir_instr_type_tex)
1919             continue;
1920          nir_tex_instr *tex = nir_instr_as_tex(instr);
1921 
1922          if (nir_tex_instr_src_index(tex, nir_tex_src_projector) < 0)
1923             continue;
1924 
1925          bool has_compare = nir_tex_instr_src_index(tex, nir_tex_src_comparator) >= 0;
1926          bool has_lod = nir_tex_instr_src_index(tex, nir_tex_src_lod) >= 0 ||
1927                         s->info.stage != MESA_SHADER_FRAGMENT;
1928          bool has_offset = nir_tex_instr_src_index(tex, nir_tex_src_offset) >= 0;
1929 
1930          /* We can do TXP for any tex (not txg) where we can fit all the
1931           * coordinates and comparator and projector in one vec4 without any
1932           * other modifiers to add on.
1933           *
1934           * nir_lower_tex() only handles the lowering on a sampler-dim basis, so
1935           * if we get any funny projectors then we just blow them all away.
1936           */
1937          if (tex->op != nir_texop_tex || has_lod || has_offset ||
1938              (tex->coord_components >= 3 && has_compare))
1939             lower_tex_options.lower_txp |= 1 << tex->sampler_dim;
1940       }
1941    }
1942 
1943    /* nir_lower_tex must be run even if no options are set, because we need the
1944     * LOD to be set for query_levels and for non-fragment shaders.
1945     */
1946    NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
1947 }
1948 
1949 /**
1950  * Translates the NIR shader to TGSI.
1951  *
1952  * This requires some lowering of the NIR shader to prepare it for translation.
1953  * We take ownership of the NIR shader passed, returning a reference to the new
1954  * TGSI tokens instead.  If you need to keep the NIR, then pass us a clone.
1955  */
1956 const void *
1957 nir_to_rc(struct nir_shader *s, struct pipe_screen *screen,
1958           struct r300_fragment_program_external_state state)
1959 {
1960    struct ntr_compile *c;
1961    const void *tgsi_tokens;
1962    bool is_r500 = r300_screen(screen)->caps.is_r500;
1963    c = rzalloc(NULL, struct ntr_compile);
1964    c->screen = screen;
1965    c->lower_fabs = !is_r500 && s->info.stage == MESA_SHADER_VERTEX;
1966 
1967    if (s->info.stage == MESA_SHADER_FRAGMENT) {
1968       if (is_r500) {
1969          NIR_PASS_V(s, r300_transform_fs_trig_input);
1970       }
1971    } else if (r300_screen(screen)->caps.has_tcl) {
1972       if (is_r500) {
1973          /* Only nine (the D3D9 state tracker) should set both the TTN
1974           * shader name and use_legacy_math_rules, and D3D9 already mandates
1975           * the proper range for the trigonometric inputs.
1976           */
1977          if (!s->info.use_legacy_math_rules || !(s->info.name && !strcmp("TTN", s->info.name))) {
1978             NIR_PASS_V(s, r300_transform_vs_trig_input);
1979          }
1980       } else {
1981          if (r300_screen(screen)->caps.is_r400) {
1982             NIR_PASS_V(s, r300_transform_vs_trig_input);
1983          }
1984       }
1985    }
1986 
1987    /* Lower array indexing on FS inputs.  Since we don't set
1988     * ureg->supports_any_inout_decl_range, the TGSI input decls will be split to
1989     * elements by ureg, and so dynamically indexing them would be invalid.
1990     * Ideally we would set that ureg flag based on
1991     * PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE, but can't due to mesa/st
1992     * splitting NIR VS outputs to elements even if the FS doesn't get the
1993     * corresponding splitting, and virgl depends on TGSI across link boundaries
1994     * having matching declarations.
1995     */
1996    if (s->info.stage == MESA_SHADER_FRAGMENT) {
1997       NIR_PASS_V(s, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX);
1998       NIR_PASS_V(s, nir_remove_dead_variables, nir_var_shader_in, NULL);
1999    }
2000 
2001    NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, type_size,
2002               nir_lower_io_use_interpolated_input_intrinsics);
2003 
2004    if (s->info.stage == MESA_SHADER_FRAGMENT) {
2005       /* Shadow lowering. */
2006       int num_texture_states = state.sampler_state_count;
2007       if (num_texture_states > 0) {
2008          nir_lower_tex_shadow_swizzle tex_swizzle[PIPE_MAX_SHADER_SAMPLER_VIEWS];
2009          enum compare_func tex_compare_func[PIPE_MAX_SHADER_SAMPLER_VIEWS];
2010 
2011          for (unsigned i = 0; i < num_texture_states; i++) {
2012             tex_compare_func[i] = state.unit[i].texture_compare_func;
2013             tex_swizzle[i].swizzle_r = GET_SWZ(state.unit[i].texture_swizzle, 0);
2014             tex_swizzle[i].swizzle_g = GET_SWZ(state.unit[i].texture_swizzle, 1);
2015             tex_swizzle[i].swizzle_b = GET_SWZ(state.unit[i].texture_swizzle, 2);
2016             tex_swizzle[i].swizzle_a = GET_SWZ(state.unit[i].texture_swizzle, 3);
2017          }
2018          NIR_PASS_V(s, nir_lower_tex_shadow, num_texture_states, tex_compare_func,
2019                     tex_swizzle, true);
2020       }
2021 
2022       nir_to_rc_lower_txp(s);
2023       NIR_PASS_V(s, nir_to_rc_lower_tex);
2024    }
2025 
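        /* Run the generic algebraic/constant-folding loops to a fixed point
         * before the backend-specific lowering below.
         */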
2026    bool progress;
2027    do {
2028       progress = false;
2029       NIR_PASS(progress, s, nir_opt_algebraic);
2030       NIR_PASS(progress, s, nir_opt_constant_folding);
2031    } while (progress);
2032 
2033    do {
2034       progress = false;
2035       NIR_PASS(progress, s, nir_opt_algebraic_late);
2036       if (progress) {
2037          NIR_PASS_V(s, nir_copy_prop);
2038          NIR_PASS_V(s, nir_opt_dce);
2039          NIR_PASS_V(s, nir_opt_cse);
2040       }
2041    } while (progress);
2042 
2043    if (s->info.stage == MESA_SHADER_FRAGMENT) {
2044       NIR_PASS_V(s, r300_nir_prepare_presubtract);
2045    }
2046 
2047    NIR_PASS_V(s, nir_lower_int_to_float);
2048    NIR_PASS_V(s, nir_copy_prop);
2049    NIR_PASS_V(s, r300_nir_post_integer_lowering);
2050    NIR_PASS_V(s, nir_lower_bool_to_float, is_r500 || s->info.stage == MESA_SHADER_FRAGMENT);
2051    /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
2052    NIR_PASS_V(s, nir_copy_prop);
2053    /* CSE cleanup after late ftrunc lowering. */
2054    NIR_PASS_V(s, nir_opt_cse);
2055    /* At this point we need to clean up:
2056     *  a) fcsel_gt instructions that come from the ftrunc lowering on R300,
2057     *  b) all flavours of fcsel that read three different temp sources on R500.
2058     */
2059    if (s->info.stage == MESA_SHADER_VERTEX) {
2060       if (is_r500)
2061          NIR_PASS_V(s, r300_nir_lower_fcsel_r500);
2062       else
2063          NIR_PASS_V(s, r300_nir_lower_fcsel_r300);
2064       NIR_PASS_V(s, r300_nir_lower_flrp);
2065    } else {
2066       NIR_PASS_V(s, r300_nir_lower_comparison_fs);
2067    }
2068    NIR_PASS_V(s, r300_nir_opt_algebraic_late);
2069    NIR_PASS_V(s, nir_opt_dce);
2070    NIR_PASS_V(s, nir_opt_shrink_vectors, false);
2071    NIR_PASS_V(s, nir_opt_dce);
2072 
2073    nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
2074                                nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
2075 
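        /* Sink loads, constants, and comparisons toward their uses to shorten
         * live ranges before leaving SSA.
         */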
2076    NIR_PASS_V(s, nir_opt_move, move_all);
2077    NIR_PASS_V(s, nir_move_vec_src_uses_to_dest, true);
2078    /* Late vectorizing after nir_move_vec_src_uses_to_dest reduces instruction
2079     * count but increases register usage. Testing shows this is beneficial only in VS.
2080     */
2081    if (s->info.stage == MESA_SHADER_VERTEX)
2082       NIR_PASS_V(s, nir_opt_vectorize, ntr_should_vectorize_instr, NULL);
2083 
2084    NIR_PASS_V(s, nir_convert_from_ssa, true, false);
2085    NIR_PASS_V(s, nir_lower_vec_to_regs, NULL, NULL);
2086 
2087    /* nir_lower_locals_to_regs will leave dead derefs that are good to clean up.
2088     */
2089    NIR_PASS_V(s, nir_lower_locals_to_regs, 32);
2090    NIR_PASS_V(s, nir_opt_dce);
2091 
2092    /* See comment in ntr_get_alu_src for supported modifiers */
2093    NIR_PASS_V(s, nir_legacy_trivialize, !c->lower_fabs);
2094 
2095    if (NIR_DEBUG(TGSI)) {
2096       fprintf(stderr, "NIR before translation to TGSI:\n");
2097       nir_print_shader(s, stderr);
2098    }
2099 
2100    c->s = s;
2101    c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
2102    ureg_setup_shader_info(c->ureg, &s->info);
2103    if (s->info.use_legacy_math_rules && screen->caps.legacy_math_rules)
2104       ureg_property(c->ureg, TGSI_PROPERTY_LEGACY_MATH_RULES, 1);
2105 
2106    if (s->info.stage == MESA_SHADER_FRAGMENT) {
2107       /* The draw module's polygon stipple layer doesn't respect the chosen
2108        * coordinate mode, so leave it as unspecified unless we're actually
2109        * reading the position in the shader already.  See
2110        * gl-2.1-polygon-stipple-fs on softpipe.
2111        */
2112       if ((s->info.inputs_read & VARYING_BIT_POS) ||
2113           BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
2114          ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
2115                        s->info.fs.origin_upper_left ? TGSI_FS_COORD_ORIGIN_UPPER_LEFT
2116                                                     : TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
2117 
2118          ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
2119                        s->info.fs.pixel_center_integer ? TGSI_FS_COORD_PIXEL_CENTER_INTEGER
2120                                                        : TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
2121       }
2122    }
2123    /* Emit the main function */
2124    nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
2125    ntr_emit_impl(c, impl);
2126    ureg_END(c->ureg);
2127 
2128    tgsi_tokens = ureg_get_tokens(c->ureg, NULL);
2129 
2130    if (NIR_DEBUG(TGSI)) {
2131       fprintf(stderr, "TGSI after translation from NIR:\n");
2132       tgsi_dump(tgsi_tokens, 0);
2133    }
2134 
2135    ureg_destroy(c->ureg);
2136 
2137    ralloc_free(c);
2138    ralloc_free(s);
2139 
2140    return tgsi_tokens;
2141 }
2142