1 /*
2  * Copyright (C) 2020 Collabora Ltd.
3  * Copyright (C) 2022 Alyssa Rosenzweig <alyssa@rosenzweig.io>
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  *
24  * Authors (Collabora):
25  *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
26  */
27 
28 #include "compiler/glsl/glsl_to_nir.h"
29 #include "compiler/glsl_types.h"
30 #include "compiler/nir/nir_builder.h"
31 #include "util/u_debug.h"
32 
33 #include "bifrost/disassemble.h"
34 #include "panfrost/lib/pan_props.h"
35 #include "valhall/disassemble.h"
36 #include "valhall/va_compiler.h"
37 #include "bi_builder.h"
38 #include "bi_quirks.h"
39 #include "bifrost_compile.h"
40 #include "bifrost_nir.h"
41 #include "compiler.h"
42 
43 /* clang-format off */
44 static const struct debug_named_value bifrost_debug_options[] = {
45    {"msgs",       BIFROST_DBG_MSGS,		   "Print debug messages"},
46    {"shaders",    BIFROST_DBG_SHADERS,	   "Dump shaders in NIR and MIR"},
47    {"shaderdb",   BIFROST_DBG_SHADERDB,	"Print statistics"},
48    {"verbose",    BIFROST_DBG_VERBOSE,	   "Disassemble verbosely"},
49    {"internal",   BIFROST_DBG_INTERNAL,	"Dump even internal shaders"},
50    {"nosched",    BIFROST_DBG_NOSCHED, 	"Force trivial bundling"},
51    {"nopsched",   BIFROST_DBG_NOPSCHED,   "Disable scheduling for pressure"},
52    {"inorder",    BIFROST_DBG_INORDER, 	"Force in-order bundling"},
53    {"novalidate", BIFROST_DBG_NOVALIDATE, "Skip IR validation"},
54    {"noopt",      BIFROST_DBG_NOOPT,      "Skip optimization passes"},
55    {"noidvs",     BIFROST_DBG_NOIDVS,     "Disable IDVS"},
56    {"nosb",       BIFROST_DBG_NOSB,       "Disable scoreboarding"},
57    {"nopreload",  BIFROST_DBG_NOPRELOAD,  "Disable message preloading"},
58    {"spill",      BIFROST_DBG_SPILL,      "Test register spilling"},
59    DEBUG_NAMED_VALUE_END
60 };
61 /* clang-format on */
62 
63 DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG",
64                             bifrost_debug_options, 0)
65 
66 /* How many bytes are prefetched by the Bifrost shader core. From the final
67  * clause of the shader, this range must be valid instructions or zero. */
68 #define BIFROST_SHADER_PREFETCH 128
69 
70 int bifrost_debug = 0;
71 
72 #define DBG(fmt, ...)                                                          \
73    do {                                                                        \
74       if (bifrost_debug & BIFROST_DBG_MSGS)                                    \
75          fprintf(stderr, "%s:%d: " fmt, __func__, __LINE__, ##__VA_ARGS__);    \
76    } while (0)
77 
78 static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);
79 
80 static bi_index
81 bi_preload(bi_builder *b, unsigned reg)
82 {
83    if (bi_is_null(b->shader->preloaded[reg])) {
84       /* Insert at the beginning of the shader */
85       bi_builder b_ = *b;
86       b_.cursor = bi_before_block(bi_start_block(&b->shader->blocks));
87 
88       /* Cache the result */
89       b->shader->preloaded[reg] = bi_mov_i32(&b_, bi_register(reg));
90    }
91 
92    return b->shader->preloaded[reg];
93 }
94 
95 static bi_index
96 bi_coverage(bi_builder *b)
97 {
98    if (bi_is_null(b->shader->coverage))
99       b->shader->coverage = bi_preload(b, 60);
100 
101    return b->shader->coverage;
102 }
103 
104 /*
105  * Vertex ID and Instance ID are preloaded registers. Where they are preloaded
106  * changed from Bifrost to Valhall. Provide helpers that smooth over the
107  * architectural difference.
108  */
109 static inline bi_index
110 bi_vertex_id(bi_builder *b)
111 {
112    return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61);
113 }
114 
115 static inline bi_index
116 bi_instance_id(bi_builder *b)
117 {
118    return bi_preload(b, (b->shader->arch >= 9) ? 61 : 62);
119 }
120 
121 static inline bi_index
122 bi_draw_id(bi_builder *b)
123 {
124    assert(b->shader->arch >= 9);
125    return bi_preload(b, 62);
126 }
127 
128 static void
129 bi_emit_jump(bi_builder *b, nir_jump_instr *instr)
130 {
131    bi_instr *branch = bi_jump(b, bi_zero());
132 
133    switch (instr->type) {
134    case nir_jump_break:
135       branch->branch_target = b->shader->break_block;
136       break;
137    case nir_jump_continue:
138       branch->branch_target = b->shader->continue_block;
139       break;
140    default:
141       unreachable("Unhandled jump type");
142    }
143 
144    bi_block_add_successor(b->shader->current_block, branch->branch_target);
145    b->shader->current_block->unconditional_jumps = true;
146 }
147 
148 /* Builds a 64-bit hash table key for an index */
149 static uint64_t
150 bi_index_to_key(bi_index idx)
151 {
152    static_assert(sizeof(idx) <= sizeof(uint64_t), "too much padding");
153 
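   /* Descriptive note: the zero-initialised key plus the memcpy below avoid
    * type punning and leave any bytes past sizeof(idx) deterministically
    * zero, so equal indices always map to equal keys. */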
154    uint64_t key = 0;
155    memcpy(&key, &idx, sizeof(idx));
156    return key;
157 }
158 
159 /*
160  * Extract a single channel out of a vector source. We split vectors with SPLIT
161  * so we can use the split components directly, without emitting an extract.
162  * This has advantages for RA, as the split can usually be optimized away.
163  */
164 static bi_index
165 bi_extract(bi_builder *b, bi_index vec, unsigned channel)
166 {
167    bi_index *components = _mesa_hash_table_u64_search(b->shader->allocated_vec,
168                                                       bi_index_to_key(vec));
169 
170    /* No extract needed for scalars.
171     *
172     * This is a bit imprecise, but actual bugs (missing splits for vectors)
173     * should be caught by the following assertion. It is too difficult to
174     * ensure bi_extract is only called for real vectors.
175     */
176    if (components == NULL && channel == 0)
177       return vec;
178 
179    assert(components != NULL && "missing bi_cache_collect()");
180    return components[channel];
181 }
182 
183 static void
184 bi_cache_collect(bi_builder *b, bi_index dst, bi_index *s, unsigned n)
185 {
186    /* Lifetime of a hash table entry has to be at least as long as the table */
187    bi_index *channels = ralloc_array(b->shader, bi_index, n);
188    memcpy(channels, s, sizeof(bi_index) * n);
189 
190    _mesa_hash_table_u64_insert(b->shader->allocated_vec, bi_index_to_key(dst),
191                                channels);
192 }
193 
194 /*
195  * Splits an n-component vector (vec) into n scalar destinations (dests) using a
196  * split pseudo-instruction.
197  *
198  * Pre-condition: dests is filled with bi_null().
199  */
200 static void
201 bi_emit_split_i32(bi_builder *b, bi_index dests[4], bi_index vec, unsigned n)
202 {
203    /* Setup the destinations */
204    for (unsigned i = 0; i < n; ++i) {
205       dests[i] = bi_temp(b->shader);
206    }
207 
208    /* Emit the split */
209    if (n == 1) {
210       bi_mov_i32_to(b, dests[0], vec);
211    } else {
212       bi_instr *I = bi_split_i32_to(b, n, vec);
213 
214       bi_foreach_dest(I, j)
215          I->dest[j] = dests[j];
216    }
217 }
218 
219 static void
220 bi_emit_cached_split_i32(bi_builder *b, bi_index vec, unsigned n)
221 {
222    bi_index dests[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
223    bi_emit_split_i32(b, dests, vec, n);
224    bi_cache_collect(b, vec, dests, n);
225 }
226 
227 /*
228  * Emit and cache a split for a vector of a given bitsize. The vector may not be
229  * composed of 32-bit words, but it will be split at 32-bit word boundaries.
230  */
231 static void
232 bi_emit_cached_split(bi_builder *b, bi_index vec, unsigned bits)
233 {
234    bi_emit_cached_split_i32(b, vec, DIV_ROUND_UP(bits, 32));
235 }
236 
237 static void
238 bi_split_def(bi_builder *b, nir_def *def)
239 {
240    bi_emit_cached_split(b, bi_def_index(def),
241                         def->bit_size * def->num_components);
242 }
243 
244 static bi_instr *
245 bi_emit_collect_to(bi_builder *b, bi_index dst, bi_index *chan, unsigned n)
246 {
247    /* Special case: COLLECT of a single value is a scalar move */
248    if (n == 1)
249       return bi_mov_i32_to(b, dst, chan[0]);
250 
251    bi_instr *I = bi_collect_i32_to(b, dst, n);
252 
253    bi_foreach_src(I, i)
254       I->src[i] = chan[i];
255 
256    bi_cache_collect(b, dst, chan, n);
257    return I;
258 }
259 
260 static bi_instr *
261 bi_collect_v2i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1)
262 {
263    return bi_emit_collect_to(b, dst, (bi_index[]){s0, s1}, 2);
264 }
265 
266 static bi_instr *
267 bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1,
268                     bi_index s2)
269 {
270    return bi_emit_collect_to(b, dst, (bi_index[]){s0, s1, s2}, 3);
271 }
272 
273 static bi_index
274 bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1)
275 {
276    bi_index dst = bi_temp(b->shader);
277    bi_collect_v2i32_to(b, dst, s0, s1);
278    return dst;
279 }
280 
281 static bi_index
282 bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
283 {
284    switch (intr->intrinsic) {
285    case nir_intrinsic_load_barycentric_centroid:
286    case nir_intrinsic_load_barycentric_sample:
287       return bi_preload(b, 61);
288 
289    /* Need to put the sample ID in the top 16-bits */
290    case nir_intrinsic_load_barycentric_at_sample:
291       return bi_mkvec_v2i16(b, bi_half(bi_dontcare(b), false),
292                             bi_half(bi_src_index(&intr->src[0]), false));
293 
294    /* Interpret as 8:8 signed fixed point positions in pixels along X and
295     * Y axes respectively, relative to top-left of pixel. In NIR, (0, 0)
296     * is the center of the pixel so we first fixup and then convert. For
297     * fp16 input:
298     *
299     * f2i16(((x, y) + (0.5, 0.5)) * 2**8) =
300     * f2i16((256 * (x, y)) + (128, 128)) =
301     * V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128))
302     *
303     * For fp32 input, that lacks enough precision for MSAA 16x, but the
304     * idea is the same. FIXME: still doesn't pass
305     */
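   /* Worked example (illustrative): an fp16 offset of (0.25, -0.25) becomes
    * FMA.v2f16((0.25, -0.25), #256, #128) = (192, 64), i.e. the 8:8 fixed
    * point encoding of (0.75, 0.25) pixels from the top-left corner. */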
306    case nir_intrinsic_load_barycentric_at_offset: {
307       bi_index offset = bi_src_index(&intr->src[0]);
308       bi_index f16 = bi_null();
309       unsigned sz = nir_src_bit_size(intr->src[0]);
310 
311       if (sz == 16) {
312          f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0), bi_imm_f16(128.0));
313       } else {
314          assert(sz == 32);
315          bi_index f[2];
316          for (unsigned i = 0; i < 2; ++i) {
317             f[i] =
318                bi_fadd_rscale_f32(b, bi_extract(b, offset, i), bi_imm_f32(0.5),
319                                   bi_imm_u32(8), BI_SPECIAL_NONE);
320          }
321 
322          f16 = bi_v2f32_to_v2f16(b, f[0], f[1]);
323       }
324 
325       return bi_v2f16_to_v2s16(b, f16);
326    }
327 
328    case nir_intrinsic_load_barycentric_pixel:
329    default:
330       return b->shader->arch >= 9 ? bi_preload(b, 61) : bi_dontcare(b);
331    }
332 }
333 
334 static enum bi_sample
335 bi_interp_for_intrinsic(nir_intrinsic_op op)
336 {
337    switch (op) {
338    case nir_intrinsic_load_barycentric_centroid:
339       return BI_SAMPLE_CENTROID;
340    case nir_intrinsic_load_barycentric_sample:
341    case nir_intrinsic_load_barycentric_at_sample:
342       return BI_SAMPLE_SAMPLE;
343    case nir_intrinsic_load_barycentric_at_offset:
344       return BI_SAMPLE_EXPLICIT;
345    case nir_intrinsic_load_barycentric_pixel:
346    default:
347       return BI_SAMPLE_CENTER;
348    }
349 }
350 
351 /* auto, 64-bit omitted */
352 static enum bi_register_format
353 bi_reg_fmt_for_nir(nir_alu_type T)
354 {
355    switch (T) {
356    case nir_type_float16:
357       return BI_REGISTER_FORMAT_F16;
358    case nir_type_float32:
359       return BI_REGISTER_FORMAT_F32;
360    case nir_type_int16:
361       return BI_REGISTER_FORMAT_S16;
362    case nir_type_uint16:
363       return BI_REGISTER_FORMAT_U16;
364    case nir_type_int32:
365       return BI_REGISTER_FORMAT_S32;
366    case nir_type_uint32:
367       return BI_REGISTER_FORMAT_U32;
368    default:
369       unreachable("Invalid type for register format");
370    }
371 }
372 
373 static bool
374 va_is_valid_const_narrow_index(bi_index idx)
375 {
376    if (idx.type != BI_INDEX_CONSTANT)
377       return false;
378 
379    unsigned index = pan_res_handle_get_index(idx.value);
380    unsigned table_index = pan_res_handle_get_table(idx.value);
381 
382    return index < 1024 && va_is_valid_const_table(table_index);
383 }
384 
385 /* Checks if the _IMM variant of an intrinsic can be used, returning in
386  * *immediate the value to be used (which applies even if _IMM can't be used) */
387 
388 static bool
389 bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate,
390                      unsigned max)
391 {
392    nir_src *offset = nir_get_io_offset_src(instr);
393 
394    if (!nir_src_is_const(*offset))
395       return false;
396 
397    *immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
398    return (*immediate) < max;
399 }
400 
401 static bool
402 bi_is_imm_desc_handle(bi_builder *b, nir_intrinsic_instr *instr,
403                       uint32_t *immediate, unsigned max)
404 {
405    nir_src *offset = nir_get_io_offset_src(instr);
406 
407    if (!nir_src_is_const(*offset))
408       return false;
409 
410    if (b->shader->arch >= 9) {
411       uint32_t res_handle =
412          nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
413       uint32_t table_index = pan_res_handle_get_table(res_handle);
414       uint32_t res_index = pan_res_handle_get_index(res_handle);
415 
416       if (!va_is_valid_const_table(table_index) || res_index >= max)
417          return false;
418 
419       *immediate = res_handle;
420       return true;
421    }
422 
423    return bi_is_intr_immediate(instr, immediate, max);
424 }
425 
426 static bool
427 bi_is_imm_var_desc_handle(bi_builder *b, nir_intrinsic_instr *instr,
428                           uint32_t *immediate)
429 {
430    unsigned max = b->shader->arch >= 9 ? 256 : 20;
431 
432    return bi_is_imm_desc_handle(b, instr, immediate, max);
433 }
434 
435 static void bi_make_vec_to(bi_builder *b, bi_index final_dst, bi_index *src,
436                            unsigned *channel, unsigned count, unsigned bitsize);
437 
438 /* Bifrost's load instructions lack a component offset despite operating in
439  * terms of vec4 slots. Usually I/O vectorization avoids nonzero components,
440  * but they may be unavoidable with separate shaders in use. To solve this, we
441  * lower to a larger load and an explicit copy of the desired components. */
442 
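/* For example (illustrative): a 2-component load with component == 2 becomes
 * a 4-component load into a temporary, whose third and fourth channels are
 * then copied to the NIR destination by bi_copy_component(). */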
443 static void
444 bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp)
445 {
446    unsigned component = nir_intrinsic_component(instr);
447    unsigned nr = instr->num_components;
448    unsigned total = nr + component;
449    unsigned bitsize = instr->def.bit_size;
450 
451    assert(total <= 4 && "should be vec4");
452    bi_emit_cached_split(b, tmp, total * bitsize);
453 
454    if (component == 0)
455       return;
456 
457    bi_index srcs[] = {tmp, tmp, tmp};
458    unsigned channels[] = {component, component + 1, component + 2};
459 
460    bi_make_vec_to(b, bi_def_index(&instr->def), srcs, channels, nr,
461                   instr->def.bit_size);
462 }
463 
464 static void
465 bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
466 {
467    bi_index vertex_id =
468       instr->intrinsic == nir_intrinsic_load_attribute_pan ?
469          bi_src_index(&instr->src[0]) :
470          bi_vertex_id(b);
471    bi_index instance_id =
472       instr->intrinsic == nir_intrinsic_load_attribute_pan ?
473          bi_src_index(&instr->src[1]) :
474          bi_instance_id(b);
475 
476    /* Disregard the signedness of an integer, since loading 32-bits into a
477     * 32-bit register should be bit exact so should not incur any clamping.
478     *
479     * If we are reading as a u32, then it must be paired with an integer (u32 or
480     * s32) source, so use .auto32 to disregard.
481     */
482    nir_alu_type T = nir_intrinsic_dest_type(instr);
483    assert(T == nir_type_uint32 || T == nir_type_int32 || T == nir_type_float32);
484    enum bi_register_format regfmt =
485       T == nir_type_float32 ? BI_REGISTER_FORMAT_F32 : BI_REGISTER_FORMAT_AUTO;
486 
487    nir_src *offset = nir_get_io_offset_src(instr);
488    unsigned component = nir_intrinsic_component(instr);
489    enum bi_vecsize vecsize = (instr->num_components + component - 1);
490    unsigned imm_index = 0;
491    unsigned base = nir_intrinsic_base(instr);
492    bool constant = nir_src_is_const(*offset);
493    bool immediate = bi_is_imm_desc_handle(b, instr, &imm_index, 16);
494    bi_index dest =
495       (component == 0) ? bi_def_index(&instr->def) : bi_temp(b->shader);
496    bi_instr *I;
497 
498    if (immediate) {
499       I = bi_ld_attr_imm_to(b, dest, vertex_id, instance_id, regfmt,
500                             vecsize, pan_res_handle_get_index(imm_index));
501 
502       if (b->shader->arch >= 9)
503          I->table = va_res_fold_table_idx(pan_res_handle_get_table(base));
504    } else {
505       bi_index idx = bi_src_index(&instr->src[0]);
506 
507       if (constant)
508          idx = bi_imm_u32(imm_index);
509       else if (base != 0)
510          idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
511 
512       I = bi_ld_attr_to(b, dest, vertex_id, instance_id, idx, regfmt, vecsize);
513    }
514 
515    bi_copy_component(b, instr, dest);
516 }
517 
518 /*
519  * ABI: Special (desktop GL) slots come first, tightly packed. General varyings
520  * come later, sparsely packed. This handles both linked and separable shaders
521  * with a common code path, with minimal keying only for desktop GL. Each slot
522  * consumes 16 bytes (TODO: fp16, partial vectors).
523  */
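/*
 * Worked example (illustrative): with two bits set in fixed_varying_mask, a
 * load of VARYING_SLOT_VAR3 gets a base of 16 * (2 + 3) = 80 bytes.
 */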
524 static unsigned
525 bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr)
526 {
527    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
528    uint32_t mask = ctx->inputs->fixed_varying_mask;
529 
530    if (sem.location >= VARYING_SLOT_VAR0) {
531       unsigned nr_special = util_bitcount(mask);
532       unsigned general_index = (sem.location - VARYING_SLOT_VAR0);
533 
534       return 16 * (nr_special + general_index);
535    } else {
536       return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location)));
537    }
538 }
539 
540 /*
541  * Compute the offset in bytes of a varying with an immediate offset, adding the
542  * offset to the base computed above. Convenience method.
543  */
544 static unsigned
545 bi_varying_offset(bi_context *ctx, nir_intrinsic_instr *intr)
546 {
547    nir_src *src = nir_get_io_offset_src(intr);
548    assert(nir_src_is_const(*src) && "assumes immediate offset");
549 
550    return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16);
551 }
552 
553 static void
554 bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
555 {
556    enum bi_sample sample = BI_SAMPLE_CENTER;
557    enum bi_update update = BI_UPDATE_STORE;
558    enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
559    bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input;
560    bi_index src0 = bi_null();
561 
562    unsigned component = nir_intrinsic_component(instr);
563    enum bi_vecsize vecsize = (instr->num_components + component - 1);
564    bi_index dest =
565       (component == 0) ? bi_def_index(&instr->def) : bi_temp(b->shader);
566 
567    unsigned sz = instr->def.bit_size;
568 
569    if (smooth) {
570       nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
571       assert(parent);
572 
573       sample = bi_interp_for_intrinsic(parent->intrinsic);
574       src0 = bi_varying_src0_for_barycentric(b, parent);
575 
576       assert(sz == 16 || sz == 32);
577       regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16 : BI_REGISTER_FORMAT_F32;
578    } else {
579       assert(sz == 32);
580       regfmt = BI_REGISTER_FORMAT_U32;
581 
582       /* Valhall can't have bi_null() here, although the source is
583        * logically unused for flat varyings
584        */
585       if (b->shader->arch >= 9)
586          src0 = bi_preload(b, 61);
587 
588       /* Gather info as we go */
589       b->shader->info.bifrost->uses_flat_shading = true;
590    }
591 
592    nir_src *offset = nir_get_io_offset_src(instr);
593    unsigned imm_index = 0;
594    bool immediate = bi_is_imm_var_desc_handle(b, instr, &imm_index);
595    unsigned base = nir_intrinsic_base(instr);
596 
597    /* Only use LD_VAR_BUF[_IMM] if explicitly told by the driver
598     * through a compiler input value, falling back to LD_VAR[_IMM] +
599     * Attribute Descriptors otherwise. */
600    bool use_ld_var_buf =
601       b->shader->malloc_idvs && b->shader->inputs->valhall.use_ld_var_buf;
602 
603    if (use_ld_var_buf) {
604       enum bi_source_format source_format =
605          smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32;
606 
607       if (immediate) {
608          /* Immediate index given in bytes. */
609          bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
610                               update, vecsize,
611                               bi_varying_offset(b->shader, instr));
612       } else {
613          bi_index idx = bi_src_index(offset);
614          /* Index needs to be in bytes, but NIR gives the index
615           * in slots. For now assume 16 bytes per element.
616           */
617          bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
618          unsigned vbase = bi_varying_base_bytes(b->shader, instr);
619 
620          if (vbase != 0)
621             idx_bytes = bi_iadd_u32(b, idx, bi_imm_u32(vbase), false);
622 
623          bi_ld_var_buf_to(b, sz, dest, src0, idx_bytes, regfmt, sample,
624                           source_format, update, vecsize);
625       }
626    } else {
627       /* On Valhall, ensure the table and index are valid for usage with
628        * immediate form when IDVS isn't used */
629       if (b->shader->arch >= 9)
630          immediate &= va_is_valid_const_table(pan_res_handle_get_table(base)) &&
631                       pan_res_handle_get_index(base) < 256;
632 
633       if (immediate) {
634          bi_instr *I;
635 
636          if (smooth) {
637             I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, vecsize,
638                                  pan_res_handle_get_index(imm_index));
639          } else {
640             I =
641                bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, vecsize,
642                                      pan_res_handle_get_index(imm_index));
643          }
644 
645          /* Valhall usually uses LD_VAR_BUF. If this is disabled, use a simple
646           * Midgard-style ABI. */
647          if (b->shader->arch >= 9)
648             I->table = va_res_fold_table_idx(pan_res_handle_get_table(base));
649       } else {
650          bi_index idx = bi_src_index(offset);
651 
652          if (base != 0)
653             idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
654 
655          if (smooth)
656             bi_ld_var_to(b, dest, src0, idx, regfmt, sample, update, vecsize);
657          else
658             bi_ld_var_flat_to(b, dest, idx, BI_FUNCTION_NONE, regfmt, vecsize);
659       }
660    }
661 
662    bi_copy_component(b, instr, dest);
663 }
664 
665 static bi_index
666 bi_make_vec8_helper(bi_builder *b, bi_index *src, unsigned *channel,
667                     unsigned count)
668 {
669    assert(1 <= count && count <= 4);
670 
671    bi_index bytes[4] = {bi_imm_u8(0), bi_imm_u8(0), bi_imm_u8(0), bi_imm_u8(0)};
672 
673    for (unsigned i = 0; i < count; ++i) {
674       unsigned chan = channel ? channel[i] : 0;
675       unsigned lane = chan & 3;
676       bi_index raw_data = bi_extract(b, src[i], chan >> 2);
677 
678       /* On Bifrost, MKVEC.v4i8 cannot select b1 or b3 */
679       if (b->shader->arch < 9 && lane != 0 && lane != 2) {
680          bytes[i] = bi_byte(bi_rshift_or(b, 32, raw_data, bi_zero(),
681                                          bi_imm_u8(lane * 8), false),
682                             0);
683       } else {
684          bytes[i] = bi_byte(raw_data, lane);
685       }
686 
687       assert(b->shader->arch >= 9 || bytes[i].swizzle == BI_SWIZZLE_B0000 ||
688              bytes[i].swizzle == BI_SWIZZLE_B2222);
689    }
690 
691    if (b->shader->arch >= 9) {
692       bi_index vec = bi_zero();
693 
694       if (count >= 3)
695          vec = bi_mkvec_v2i8(b, bytes[2], bytes[3], vec);
696 
697       return bi_mkvec_v2i8(b, bytes[0], bytes[1], vec);
698    } else {
699       return bi_mkvec_v4i8(b, bytes[0], bytes[1], bytes[2], bytes[3]);
700    }
701 }
702 
703 static bi_index
704 bi_make_vec16_helper(bi_builder *b, bi_index *src, unsigned *channel,
705                      unsigned count)
706 {
707    unsigned chan0 = channel ? channel[0] : 0;
708    bi_index w0 = bi_extract(b, src[0], chan0 >> 1);
709    bi_index h0 = bi_half(w0, chan0 & 1);
710 
711    /* Zero extend */
712    if (count == 1)
713       return bi_mkvec_v2i16(b, h0, bi_imm_u16(0));
714 
715    /* Else, create a vector */
716    assert(count == 2);
717 
718    unsigned chan1 = channel ? channel[1] : 0;
719    bi_index w1 = bi_extract(b, src[1], chan1 >> 1);
720    bi_index h1 = bi_half(w1, chan1 & 1);
721 
722    if (bi_is_word_equiv(w0, w1) && (chan0 & 1) == 0 && ((chan1 & 1) == 1))
723       return bi_mov_i32(b, w0);
724    else if (bi_is_word_equiv(w0, w1))
725       return bi_swz_v2i16(b, bi_swz_16(w0, chan0 & 1, chan1 & 1));
726    else
727       return bi_mkvec_v2i16(b, h0, h1);
728 }
729 
730 static void
731 bi_make_vec_to(bi_builder *b, bi_index dst, bi_index *src, unsigned *channel,
732                unsigned count, unsigned bitsize)
733 {
734    assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
735    unsigned shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
736    unsigned chan_per_word = 1 << shift;
737 
738    assert(DIV_ROUND_UP(count * bitsize, 32) <= BI_MAX_SRCS &&
739           "unnecessarily large vector should have been lowered");
740 
741    bi_index srcs[BI_MAX_VEC];
742 
743    for (unsigned i = 0; i < count; i += chan_per_word) {
744       unsigned rem = MIN2(count - i, chan_per_word);
745       unsigned *channel_offset = channel ? (channel + i) : NULL;
746 
747       if (bitsize == 32)
748          srcs[i] = bi_extract(b, src[i], channel_offset ? *channel_offset : 0);
749       else if (bitsize == 16)
750          srcs[i >> 1] = bi_make_vec16_helper(b, src + i, channel_offset, rem);
751       else
752          srcs[i >> 2] = bi_make_vec8_helper(b, src + i, channel_offset, rem);
753    }
754 
755    bi_emit_collect_to(b, dst, srcs, DIV_ROUND_UP(count, chan_per_word));
756 }
757 
758 static inline bi_instr *
759 bi_load_ubo_to(bi_builder *b, unsigned bitsize, bi_index dest0, bi_index src0,
760                bi_index src1)
761 {
762    bi_instr *I;
763 
764    if (b->shader->arch >= 9) {
765       I = bi_ld_buffer_to(b, bitsize, dest0, src0, src1);
766       I->seg = BI_SEG_UBO;
767    } else {
768       I = bi_load_to(b, bitsize, dest0, src0, src1, BI_SEG_UBO, 0);
769    }
770 
771    bi_emit_cached_split(b, dest0, bitsize);
772    return I;
773 }
774 
775 static void
776 bi_load_sample_id_to(bi_builder *b, bi_index dst)
777 {
778    /* r61[16:23] contains the sampleID, mask it out. Upper bits
779     * seem to read garbage (despite being architecturally defined
780     * as zero), so use a 5-bit mask instead of 8-bits */
781 
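   /* Equivalent to sample_id = (r61 >> 16) & 0x1f. */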
782    bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f),
783                         bi_imm_u8(16), false);
784 }
785 
786 static bi_index
787 bi_load_sample_id(bi_builder *b)
788 {
789    bi_index sample_id = bi_temp(b->shader);
790    bi_load_sample_id_to(b, sample_id);
791    return sample_id;
792 }
793 
794 static bi_index
795 bi_pixel_indices(bi_builder *b, unsigned rt)
796 {
797    /* We want to load the current pixel. */
798    struct bifrost_pixel_indices pix = {.y = BIFROST_CURRENT_PIXEL, .rt = rt};
799 
800    uint32_t indices_u32 = 0;
801    memcpy(&indices_u32, &pix, sizeof(indices_u32));
802    bi_index indices = bi_imm_u32(indices_u32);
803 
804    /* Sample index above is left as zero. For multisampling, we need to
805     * fill in the actual sample ID in the lower byte */
806 
807    if (b->shader->inputs->blend.nr_samples > 1)
808       indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false);
809 
810    return indices;
811 }
812 
813 /* Source color is passed through r0-r3, or r4-r7 for the second source when
814  * dual-source blending. Preload the corresponding vector.
815  */
816 static void
817 bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr)
818 {
819    nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
820    unsigned base = (sem.location == VARYING_SLOT_VAR0) ? 4 : 0;
821    unsigned size = nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr));
822    assert(size == 16 || size == 32);
823 
824    bi_index srcs[] = {bi_preload(b, base + 0), bi_preload(b, base + 1),
825                       bi_preload(b, base + 2), bi_preload(b, base + 3)};
826 
827    bi_emit_collect_to(b, bi_def_index(&instr->def), srcs, size == 32 ? 4 : 2);
828 }
829 
830 static void
831 bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, bi_index rgba2,
832                  nir_alu_type T2, unsigned rt)
833 {
834    /* Reads 2 or 4 staging registers to cover the input */
835    unsigned size = nir_alu_type_get_type_size(T);
836    unsigned size_2 = nir_alu_type_get_type_size(T2);
837    unsigned sr_count = (size <= 16) ? 2 : 4;
838    unsigned sr_count_2 = (size_2 <= 16) ? 2 : 4;
839    const struct panfrost_compile_inputs *inputs = b->shader->inputs;
840    uint64_t blend_desc = inputs->blend.bifrost_blend_desc;
841    enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
842 
843    /* Workaround for NIR-to-TGSI */
844    if (b->shader->nir->info.fs.untyped_color_outputs)
845       regfmt = BI_REGISTER_FORMAT_AUTO;
846 
847    if (inputs->is_blend && inputs->blend.nr_samples > 1) {
848       /* Conversion descriptor comes from the compile inputs, pixel
849        * indices derived at run time based on sample ID */
850       bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_coverage(b),
851                  bi_imm_u32(blend_desc >> 32), regfmt, BI_VECSIZE_V4);
852    } else if (b->shader->inputs->is_blend) {
853       uint64_t blend_desc = b->shader->inputs->blend.bifrost_blend_desc;
854 
855       /* Blend descriptor comes from the compile inputs */
856       /* Put the result in r0 */
857 
858       bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b),
859                   bi_imm_u32(blend_desc), bi_imm_u32(blend_desc >> 32),
860                   bi_null(), regfmt, sr_count, 0);
861    } else {
862       /* Blend descriptor comes from the FAU RAM. By convention, the
863        * return address on Bifrost is stored in r48 and will be used
864        * by the blend shader to jump back to the fragment shader */
865 
866       bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b),
867                   bi_fau(BIR_FAU_BLEND_0 + rt, false),
868                   bi_fau(BIR_FAU_BLEND_0 + rt, true), rgba2, regfmt, sr_count,
869                   sr_count_2);
870    }
871 
872    assert(rt < 8);
873    b->shader->info.bifrost->blend[rt].type = T;
874 
875    if (T2)
876       b->shader->info.bifrost->blend_src1_type = T2;
877 }
878 
879 /* Blend shaders do not need to run ATEST since they are dependent on a
880  * fragment shader that runs it. Blit shaders may not need to run ATEST, since
881  * ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and
882  * there are no writes to the coverage mask. The latter two are satisfied for
883  * all blit shaders, so we just care about early-z, which blit shaders force
884  * iff they do not write depth or stencil */
885 
886 static bool
887 bi_skip_atest(bi_context *ctx, bool emit_zs)
888 {
889    return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend;
890 }
891 
892 static void
893 bi_emit_atest(bi_builder *b, bi_index alpha)
894 {
895    b->shader->coverage =
896       bi_atest(b, bi_coverage(b), alpha, bi_fau(BIR_FAU_ATEST_PARAM, false));
897    b->shader->emitted_atest = true;
898 }
899 
900 static bi_index
901 bi_src_color_vec4(bi_builder *b, nir_src *src, nir_alu_type T)
902 {
903    unsigned num_components = nir_src_num_components(*src);
904    bi_index base = bi_src_index(src);
905 
906    /* short-circuit the common case */
907    if (num_components == 4)
908       return base;
909 
910    unsigned size = nir_alu_type_get_type_size(T);
911    assert(size == 16 || size == 32);
912 
913    bi_index src_vals[4];
914 
915    unsigned i;
916    for (i = 0; i < num_components; i++)
917       src_vals[i] = bi_extract(b, base, i);
918 
919    for (; i < 3; i++)
920       src_vals[i] = (size == 16) ? bi_imm_f16(0.0) : bi_imm_f32(0.0);
921    src_vals[3] = (size == 16) ? bi_imm_f16(1.0) : bi_imm_f32(1.0);
922    bi_index temp = bi_temp(b->shader);
923    bi_make_vec_to(b, temp, src_vals, NULL, 4, size);
924    return temp;
925 }
926 
927 static void
928 bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)
929 {
930    bool combined = instr->intrinsic == nir_intrinsic_store_combined_output_pan;
931 
932    unsigned writeout =
933       combined ? nir_intrinsic_component(instr) : PAN_WRITEOUT_C;
934 
935    bool emit_blend = writeout & (PAN_WRITEOUT_C);
936    bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S);
937 
938    unsigned loc = nir_intrinsic_io_semantics(instr).location;
939    bi_index src0 = bi_src_index(&instr->src[0]);
940 
941    /* By ISA convention, the coverage mask is stored in R60. The store
942     * itself will be handled by a subsequent ATEST instruction */
943    if (loc == FRAG_RESULT_SAMPLE_MASK) {
944       b->shader->coverage = bi_extract(b, src0, 0);
945       return;
946    }
947 
948    /* Emit ATEST if we have to, note ATEST requires a floating-point alpha
949     * value, but render target #0 might not be floating point. However the
950     * alpha value is only used for alpha-to-coverage, a stage which is
951     * skipped for pure integer framebuffers, so the issue is moot. */
952 
953    if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) {
954       nir_alu_type T = nir_intrinsic_src_type(instr);
955 
956       bi_index rgba = bi_src_index(&instr->src[0]);
957       bi_index alpha;
958 
959       if (nir_src_num_components(instr->src[0]) < 4) {
960          /* Don't read out-of-bounds */
961          alpha = bi_imm_f32(1.0);
962       } else if (T == nir_type_float16) {
963          alpha = bi_half(bi_extract(b, rgba, 1), true);
964       } else if (T == nir_type_float32) {
965          alpha = bi_extract(b, rgba, 3);
966       } else {
967          alpha = bi_dontcare(b);
968       }
969       bi_emit_atest(b, alpha);
970    }
971 
972    if (emit_zs) {
973       bi_index z = bi_dontcare(b), s = bi_dontcare(b);
974 
975       if (writeout & PAN_WRITEOUT_Z)
976          z = bi_src_index(&instr->src[2]);
977 
978       if (writeout & PAN_WRITEOUT_S)
979          s = bi_src_index(&instr->src[3]);
980 
981       b->shader->coverage =
982          bi_zs_emit(b, z, s, bi_coverage(b), writeout & PAN_WRITEOUT_S,
983                     writeout & PAN_WRITEOUT_Z);
984    }
985 
986    if (emit_blend) {
987       unsigned rt = loc ? (loc - FRAG_RESULT_DATA0) : 0;
988       bool dual = (writeout & PAN_WRITEOUT_2);
989       nir_alu_type T = nir_intrinsic_src_type(instr);
990       nir_alu_type T2 = dual ? nir_intrinsic_dest_type(instr) : 0;
991       bi_index color = bi_src_color_vec4(b, &instr->src[0], T);
992       bi_index color2 =
993          dual ? bi_src_color_vec4(b, &instr->src[4], T2) : bi_null();
994 
995       if (instr->intrinsic == nir_intrinsic_store_output &&
996           loc >= FRAG_RESULT_DATA0 && loc <= FRAG_RESULT_DATA7) {
997          assert(nir_src_is_const(instr->src[1]) && "no indirect outputs");
998 
999          unsigned rt_offs = nir_src_as_uint(instr->src[1]);
1000 
1001          assert(rt + rt_offs < 8 && "RT not in the [0-7] range");
1002          rt += rt_offs;
1003       }
1004 
1005       /* Explicit copy since BLEND inputs are precoloured to R0-R3,
1006        * TODO: maybe schedule around this or implement in RA as a
1007        * spill */
1008       bool has_mrt =
1009          (b->shader->nir->info.outputs_written >> FRAG_RESULT_DATA1);
1010 
1011       if (has_mrt) {
1012          bi_index srcs[4] = {color, color, color, color};
1013          unsigned channels[4] = {0, 1, 2, 3};
1014          color = bi_temp(b->shader);
1015          bi_make_vec_to(
1016             b, color, srcs, channels, nir_src_num_components(instr->src[0]),
1017             nir_alu_type_get_type_size(nir_intrinsic_src_type(instr)));
1018       }
1019 
1020       bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr), color2, T2, rt);
1021    }
1022 
1023    if (b->shader->inputs->is_blend) {
1024       /* Jump back to the fragment shader, return address is stored
1025        * in r48 (see above). On Valhall, only jump if the address is
1026        * nonzero. The check is free there and it implements the "jump
1027        * to 0 terminates the blend shader" that's automatic on
1028        * Bifrost.
1029        */
1030       if (b->shader->arch >= 8)
1031          bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE);
1032       else
1033          bi_jump(b, bi_preload(b, 48));
1034    }
1035 }
1036 
1037 /**
1038  * In a vertex shader, is the specified variable a position output? These kinds
1039  * of outputs are written from position shaders when IDVS is enabled. All other
1040  * outputs are written from the varying shader.
1041  */
1042 static bool
1043 bi_should_remove_store(nir_intrinsic_instr *intr, enum bi_idvs_mode idvs)
1044 {
1045    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
1046 
1047    switch (sem.location) {
1048    case VARYING_SLOT_POS:
1049    case VARYING_SLOT_PSIZ:
1050    case VARYING_SLOT_LAYER:
1051       return idvs == BI_IDVS_VARYING;
1052    default:
1053       return idvs == BI_IDVS_POSITION;
1054    }
1055 }
1056 
1057 static bool
1058 bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data)
1059 {
1060    enum bi_idvs_mode *idvs = data;
1061 
1062    if (instr->type != nir_instr_type_intrinsic)
1063       return false;
1064 
1065    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1066 
1067    if (intr->intrinsic != nir_intrinsic_store_output &&
1068        intr->intrinsic != nir_intrinsic_store_per_view_output)
1069       return false;
1070 
1071    if (bi_should_remove_store(intr, *idvs)) {
1072       nir_instr_remove(instr);
1073       return true;
1074    }
1075 
1076    return false;
1077 }
1078 
1079 static void
1080 bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
1081 {
1082    /* In principle we can do better for 16-bit. At the moment we require
1083     * 32-bit to permit the use of .auto, in order to force .u32 for flat
1084     * varyings, to handle internal TGSI shaders that set flat in the VS
1085     * but smooth in the FS */
1086 
1087    ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr);
1088    ASSERTED unsigned T_size = nir_alu_type_get_type_size(T);
1089    assert(T_size == 32 || (b->shader->arch >= 9 && T_size == 16));
1090    enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
1091 
1092    unsigned imm_index = 0;
1093    bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
1094 
1095    /* Only look at the total components needed. In effect, we fill in all
1096     * the intermediate "holes" in the write mask, since we can't mask off
1097     * stores. Since nir_lower_io_to_temporaries ensures each varying is
1098     * written at most once, anything that's masked out is undefined, so it
1099     * doesn't matter what we write there. So we may as well do the
1100     * simplest thing possible. */
1101    unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr));
1102    assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0));
1103 
1104    bi_index data = bi_src_index(&instr->src[0]);
1105 
1106    /* To keep the vector dimensions consistent, we need to drop some
1107     * components. This should be coalesced.
1108     *
1109     * TODO: This is ugly and maybe inefficient. Would we rather
1110     * introduce a TRIM.i32 pseudoinstruction?
1111     */
1112    if (nr < nir_intrinsic_src_components(instr, 0)) {
1113       assert(T_size == 32 && "todo: 16-bit trim");
1114 
1115       bi_index chans[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
1116       unsigned src_comps = nir_intrinsic_src_components(instr, 0);
1117 
1118       bi_emit_split_i32(b, chans, data, src_comps);
1119 
1120       bi_index tmp = bi_temp(b->shader);
1121       bi_instr *collect = bi_collect_i32_to(b, tmp, nr);
1122 
1123       bi_foreach_src(collect, w)
1124          collect->src[w] = chans[w];
1125 
1126       data = tmp;
1127    }
1128 
1129    bool psiz =
1130       (nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PSIZ);
1131    bool layer =
1132       (nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_LAYER);
1133 
1134    bi_index a[4] = {bi_null()};
1135 
1136    if (b->shader->arch <= 8 && b->shader->idvs == BI_IDVS_POSITION) {
1137       /* Bifrost position shaders have a fast path */
1138       assert(T == nir_type_float16 || T == nir_type_float32);
1139       unsigned regfmt = (T == nir_type_float16) ? 0 : 1;
1140       unsigned identity = (b->shader->arch == 6) ? 0x688 : 0;
1141       unsigned snap4 = 0x5E;
1142       uint32_t format = identity | (snap4 << 12) | (regfmt << 24);
1143 
1144       bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59),
1145                 bi_imm_u32(format), regfmt, nr - 1);
1146    } else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) {
1147       bi_index index = bi_preload(b, 59);
1148       unsigned index_offset = 0;
1149       unsigned pos_attr_offset = 0;
1150       unsigned src_bit_sz = nir_src_bit_size(instr->src[0]);
1151 
1152       if (psiz || layer)
1153          index_offset += 4;
1154 
1155       if (layer) {
1156          assert(nr == 1 && src_bit_sz == 32);
1157          src_bit_sz = 8;
1158          pos_attr_offset = 2;
1159          data = bi_byte(data, 0);
1160       }
1161 
1162       if (psiz)
1163          assert(T_size == 16 && "should've been lowered");
1164 
1165       bool varying = (b->shader->idvs == BI_IDVS_VARYING);
1166 
1167       if (instr->intrinsic == nir_intrinsic_store_per_view_output) {
1168          unsigned view_index = nir_src_as_uint(instr->src[1]);
1169 
1170          if (varying) {
1171             index_offset += view_index * 4;
1172          } else {
1173             /* We don't patch these offsets in the no_psiz variant, so if
1174              * multiview is enabled we can't switch to the basic format by
1175              * using no_psiz */
1176             bool extended_position_fifo = b->shader->nir->info.outputs_written &
1177                (VARYING_BIT_LAYER | VARYING_BIT_PSIZ);
1178             unsigned position_fifo_stride = extended_position_fifo ? 8 : 4;
1179             index_offset += view_index * position_fifo_stride;
1180          }
1181       }
1182 
1183       if (index_offset != 0)
1184          index = bi_iadd_imm_i32(b, index, index_offset);
1185 
1186       /* On Valhall, with IDVS, varyings are stored in a hardware-controlled
1187        * buffer through table 61 at index 0 */
1188       bi_index address = bi_temp(b->shader);
1189       bi_instr *I = bi_lea_buf_imm_to(b, address, index);
1190       I->table = va_res_fold_table_idx(61);
1191       I->index = 0;
1192       bi_emit_split_i32(b, a, address, 2);
1193 
1194       bi_store(b, nr * src_bit_sz, data, a[0], a[1],
1195                varying ? BI_SEG_VARY : BI_SEG_POS,
1196                varying ? bi_varying_offset(b->shader, instr) : pos_attr_offset);
1197    } else if (immediate) {
1198       bi_index address = bi_lea_attr_imm(b, bi_vertex_id(b), bi_instance_id(b),
1199                                          regfmt, imm_index);
1200       bi_emit_split_i32(b, a, address, 3);
1201 
1202       bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
1203    } else {
1204       bi_index idx = bi_iadd_u32(b, bi_src_index(nir_get_io_offset_src(instr)),
1205                                  bi_imm_u32(nir_intrinsic_base(instr)), false);
1206       bi_index address =
1207          bi_lea_attr(b, bi_vertex_id(b), bi_instance_id(b), idx, regfmt);
1208       bi_emit_split_i32(b, a, address, 3);
1209 
1210       bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
1211    }
1212 }
1213 
1214 static void
1215 bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
1216 {
1217    nir_src *offset = nir_get_io_offset_src(instr);
1218 
1219    bool offset_is_const = nir_src_is_const(*offset);
1220    bi_index dyn_offset = bi_src_index(offset);
1221    uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0;
1222 
1223    bi_load_ubo_to(b, instr->num_components * instr->def.bit_size,
1224                   bi_def_index(&instr->def),
1225                   offset_is_const ? bi_imm_u32(const_offset) : dyn_offset,
1226                   bi_src_index(&instr->src[0]));
1227 }
1228 
1229 static void
1230 bi_emit_load_push_constant(bi_builder *b, nir_intrinsic_instr *instr)
1231 {
1232    assert(b->shader->inputs->no_ubo_to_push && "can't mix push constant forms");
1233 
1234    nir_src *offset = &instr->src[0];
1235    assert(!nir_intrinsic_base(instr) && "base must be zero");
1236    assert(!nir_intrinsic_range(instr) && "range must be zero");
1237    assert(nir_src_is_const(*offset) && "no indirect push constants");
1238    uint32_t base = nir_src_as_uint(*offset);
1239    assert((base & 3) == 0 && "unaligned push constants");
1240 
1241    unsigned bits = instr->def.bit_size * instr->def.num_components;
1242 
1243    unsigned n = DIV_ROUND_UP(bits, 32);
1244    assert(n <= 4);
1245    bi_index channels[4] = {bi_null()};
1246 
1247    for (unsigned i = 0; i < n; ++i) {
1248       unsigned word = (base >> 2) + i;
1249 
1250       channels[i] = bi_fau(BIR_FAU_UNIFORM | (word >> 1), word & 1);
1251    }
1252 
1253    bi_emit_collect_to(b, bi_def_index(&instr->def), channels, n);
1254 
1255    /* Update push->count to report the highest push constant word being accessed
1256     * by this shader.
1257     */
1258    b->shader->info.push->count =
1259       MAX2((base / 4) + n, b->shader->info.push->count);
1260 }
1261 
1262 static bi_index
1263 bi_addr_high(bi_builder *b, nir_src *src)
1264 {
1265    return (nir_src_bit_size(*src) == 64) ? bi_extract(b, bi_src_index(src), 1)
1266                                          : bi_zero();
1267 }
1268 
1269 static void
1270 bi_handle_segment(bi_builder *b, bi_index *addr_lo, bi_index *addr_hi,
1271                   enum bi_seg seg, int16_t *offset)
1272 {
1273    /* Not needed on Bifrost or for global accesses */
1274    if (b->shader->arch < 9 || seg == BI_SEG_NONE)
1275       return;
1276 
1277    /* There is no segment modifier on Valhall. Instead, we need to
1278     * emit the arithmetic ourselves. We do have an offset
1279     * available, which saves an instruction for constant offsets.
1280     */
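   /* For example (illustrative): a constant WLS address of 8 is folded into
    * the 16-bit instruction offset below, so only the WLS pointer FAU is
    * read and no extra IADD is emitted. */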
1281    bool wls = (seg == BI_SEG_WLS);
1282    assert(wls || (seg == BI_SEG_TL));
1283 
1284    enum bir_fau fau = wls ? BIR_FAU_WLS_PTR : BIR_FAU_TLS_PTR;
1285 
1286    bi_index base_lo = bi_fau(fau, false);
1287 
1288    if (offset && addr_lo->type == BI_INDEX_CONSTANT &&
1289        addr_lo->value == (int16_t)addr_lo->value) {
1290       *offset = addr_lo->value;
1291       *addr_lo = base_lo;
1292    } else {
1293       *addr_lo = bi_iadd_u32(b, base_lo, *addr_lo, false);
1294    }
1295 
1296    /* Do not allow overflow for WLS or TLS */
1297    *addr_hi = bi_fau(fau, true);
1298 }
1299 
1300 static void
1301 bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
1302 {
1303    int16_t offset = 0;
1304    unsigned bits = instr->num_components * instr->def.bit_size;
1305    bi_index dest = bi_def_index(&instr->def);
1306    bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[0]), 0);
1307    bi_index addr_hi = bi_addr_high(b, &instr->src[0]);
1308 
1309    bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
1310 
1311    bi_load_to(b, bits, dest, addr_lo, addr_hi, seg, offset);
1312    bi_emit_cached_split(b, dest, bits);
1313 }
1314 
1315 static void
1316 bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
1317 {
1318    /* Require contiguous masks, guaranteed by nir_lower_wrmasks */
1319    assert(nir_intrinsic_write_mask(instr) ==
1320           BITFIELD_MASK(instr->num_components));
1321 
1322    int16_t offset = 0;
1323    bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[1]), 0);
1324    bi_index addr_hi = bi_addr_high(b, &instr->src[1]);
1325 
1326    bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
1327 
1328    bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]),
1329             bi_src_index(&instr->src[0]), addr_lo, addr_hi, seg, offset);
1330 }
1331 
1332 /* Exchanges the staging register with memory */
1333 
1334 static void
1335 bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg,
1336                  enum bi_seg seg)
1337 {
1338    assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
1339 
1340    unsigned sz = nir_src_bit_size(*arg);
1341    assert(sz == 32 || sz == 64);
1342 
1343    bi_index data = bi_src_index(arg);
1344 
1345    bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
1346 
1347    if (b->shader->arch >= 9)
1348       bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
1349    else if (seg == BI_SEG_WLS)
1350       addr_hi = bi_zero();
1351 
1352    bi_axchg_to(b, sz, dst, data, bi_extract(b, addr, 0), addr_hi, seg);
1353 }
1354 
1355 /* Exchanges the second staging register with memory if comparison with first
1356  * staging register passes */
1357 
1358 static void
1359 bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1,
1360                     nir_src *arg_2, enum bi_seg seg)
1361 {
1362    assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
1363 
1364    /* hardware is swapped from NIR */
1365    bi_index src0 = bi_src_index(arg_2);
1366    bi_index src1 = bi_src_index(arg_1);
1367 
1368    unsigned sz = nir_src_bit_size(*arg_1);
1369    assert(sz == 32 || sz == 64);
1370 
1371    bi_index data_words[] = {
1372       bi_extract(b, src0, 0),
1373       sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src0, 1),
1374 
1375       /* 64-bit */
1376       bi_extract(b, src1, 0),
1377       sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src1, 1),
1378    };
1379 
1380    bi_index in = bi_temp(b->shader);
1381    bi_emit_collect_to(b, in, data_words, 2 * (sz / 32));
1382    bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
1383 
1384    if (b->shader->arch >= 9)
1385       bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
1386    else if (seg == BI_SEG_WLS)
1387       addr_hi = bi_zero();
1388 
1389    bi_index out = bi_acmpxchg(b, sz, in, bi_extract(b, addr, 0), addr_hi, seg);
1390    bi_emit_cached_split(b, out, sz);
1391 
1392    bi_index inout_words[] = {bi_extract(b, out, 0),
1393                              sz == 64 ? bi_extract(b, out, 1) : bi_null()};
1394 
1395    bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
1396 }
1397 
1398 static enum bi_atom_opc
1399 bi_atom_opc_for_nir(nir_atomic_op op)
1400 {
1401    /* clang-format off */
1402    switch (op) {
1403    case nir_atomic_op_iadd: return BI_ATOM_OPC_AADD;
1404    case nir_atomic_op_imin: return BI_ATOM_OPC_ASMIN;
1405    case nir_atomic_op_umin: return BI_ATOM_OPC_AUMIN;
1406    case nir_atomic_op_imax: return BI_ATOM_OPC_ASMAX;
1407    case nir_atomic_op_umax: return BI_ATOM_OPC_AUMAX;
1408    case nir_atomic_op_iand: return BI_ATOM_OPC_AAND;
1409    case nir_atomic_op_ior:  return BI_ATOM_OPC_AOR;
1410    case nir_atomic_op_ixor: return BI_ATOM_OPC_AXOR;
1411    default: unreachable("Unexpected computational atomic");
1412    }
1413    /* clang-format on */
1414 }
1415 
1416 /* Optimized unary atomics are available with an implied #1 argument */
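/* A sketch of the intended promotions, following the switch below:
 *
 *    atomic add of +1 -> AINC         atomic add of -1 -> ADEC
 *    atomic smax of 1 -> ASMAX1       atomic umax of 1 -> AUMAX1
 *    atomic or   of 1 -> AOR1
 *
 * Everything else keeps the two-source ATOM form.
 */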
1417 
1418 static bool
1419 bi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out)
1420 {
1421    /* Check we have a compatible constant */
1422    if (arg.type != BI_INDEX_CONSTANT)
1423       return false;
1424 
1425    if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD)))
1426       return false;
1427 
1428    /* Check for a compatible operation */
1429    switch (op) {
1430    case BI_ATOM_OPC_AADD:
1431       *out = (arg.value == 1) ? BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC;
1432       return true;
1433    case BI_ATOM_OPC_ASMAX:
1434       *out = BI_ATOM_OPC_ASMAX1;
1435       return true;
1436    case BI_ATOM_OPC_AUMAX:
1437       *out = BI_ATOM_OPC_AUMAX1;
1438       return true;
1439    case BI_ATOM_OPC_AOR:
1440       *out = BI_ATOM_OPC_AOR1;
1441       return true;
1442    default:
1443       return false;
1444    }
1445 }
1446 
1447 /*
1448  * Coordinates are 16-bit integers in Bifrost but 32-bit in NIR. We need to
1449  * translate between these forms (with MKVEC.v2i16).
1450  *
1451  * Additionally, on Valhall, cube maps in the attribute pipe are treated as 2D
1452  * arrays.  For uniform handling, we also treat 3D textures like 2D arrays.
1453  *
1454  * Our indexing needs to reflect this. Since Valhall and Bifrost are quite
1455  * different, we provide separate functions for these.
1456  */
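/* As an illustrative sketch (not an exhaustive mapping): a 2D array access
 * with a 32-bit (x, y, layer) coordinate becomes
 *
 *    xy = MKVEC.v2i16(x.h0, y.h0)
 *    zw = layer                        (Bifrost)
 *    zw = MKVEC.v2i16(0, layer.h0)     (Valhall)
 *
 * per the helpers below.
 */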
1457 static bi_index
1458 bi_emit_image_coord(bi_builder *b, bi_index coord, unsigned src_idx,
1459                     unsigned coord_comps, bool is_array, bool is_msaa)
1460 {
1461    assert(coord_comps > 0 && coord_comps <= 3);
1462 
1463    /* MSAA load store should have been lowered */
1464    assert(!is_msaa);
1465    if (src_idx == 0) {
1466       if (coord_comps == 1 || (coord_comps == 2 && is_array))
1467          return bi_extract(b, coord, 0);
1468       else
1469          return bi_mkvec_v2i16(b, bi_half(bi_extract(b, coord, 0), false),
1470                                bi_half(bi_extract(b, coord, 1), false));
1471    } else {
1472       if (coord_comps == 3)
1473          return bi_extract(b, coord, 2);
1474       else if (coord_comps == 2 && is_array)
1475          return bi_extract(b, coord, 1);
1476       else
1477          return bi_zero();
1478    }
1479 }
1480 
1481 static bi_index
1482 va_emit_image_coord(bi_builder *b, bi_index coord, bi_index sample_index,
1483                     unsigned src_idx, unsigned coord_comps, bool is_array,
1484                     bool is_msaa)
1485 {
1486    assert(coord_comps > 0 && coord_comps <= 3);
1487    if (src_idx == 0) {
1488       if (coord_comps == 1 || (coord_comps == 2 && is_array))
1489          return bi_extract(b, coord, 0);
1490       else
1491          return bi_mkvec_v2i16(b, bi_half(bi_extract(b, coord, 0), false),
1492                                bi_half(bi_extract(b, coord, 1), false));
1493    } else if (is_msaa) {
1494       bi_index array_idx = bi_extract(b, sample_index, 0);
1495       if (coord_comps == 3)
1496          return bi_mkvec_v2i16(b, bi_half(array_idx, false),
1497                                bi_half(bi_extract(b, coord, 2), false));
1498       else if (coord_comps == 2)
1499          return array_idx;
1500    } else if (coord_comps == 3 && is_array) {
1501       return bi_mkvec_v2i16(b, bi_imm_u16(0),
1502                             bi_half(bi_extract(b, coord, 2), false));
1503    } else if (coord_comps == 3 && !is_array) {
1504       return bi_mkvec_v2i16(b, bi_half(bi_extract(b, coord, 2), false),
1505                             bi_imm_u16(0));
1506    } else if (coord_comps == 2 && is_array) {
1507       return bi_mkvec_v2i16(b, bi_imm_u16(0),
1508                             bi_half(bi_extract(b, coord, 1), false));
1509    }
1510    return bi_zero();
1511 }
1512 
1513 static void
1514 bi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr)
1515 {
1516    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
1517    unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
1518    bool array =
1519       nir_intrinsic_image_array(instr) || dim == GLSL_SAMPLER_DIM_CUBE;
1520 
1521    bi_index coords = bi_src_index(&instr->src[1]);
1522    bi_index indexvar = bi_src_index(&instr->src[2]);
1523    bi_index xy, zw;
1524    bool is_ms = (dim == GLSL_SAMPLER_DIM_MS);
1525    if (b->shader->arch < 9) {
1526       xy = bi_emit_image_coord(b, coords, 0, coord_comps, array, is_ms);
1527       zw = bi_emit_image_coord(b, coords, 1, coord_comps, array, is_ms);
1528    } else {
1529       xy =
1530          va_emit_image_coord(b, coords, indexvar, 0, coord_comps, array, is_ms);
1531       zw =
1532          va_emit_image_coord(b, coords, indexvar, 1, coord_comps, array, is_ms);
1533    }
1534    bi_index dest = bi_def_index(&instr->def);
1535    enum bi_register_format regfmt =
1536       bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr));
1537    enum bi_vecsize vecsize = instr->num_components - 1;
1538 
1539    if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
1540       const unsigned raw_value = nir_src_as_uint(instr->src[0]);
1541       const unsigned table_index = pan_res_handle_get_table(raw_value);
1542       const unsigned texture_index = pan_res_handle_get_index(raw_value);
1543 
1544       if (texture_index < 16 && va_is_valid_const_table(table_index)) {
1545          bi_instr *I =
1546             bi_ld_tex_imm_to(b, dest, xy, zw, regfmt, vecsize, texture_index);
1547          I->table = va_res_fold_table_idx(table_index);
1548       } else {
1549          bi_ld_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), regfmt,
1550                       vecsize);
1551       }
1552    } else if (b->shader->arch >= 9) {
1553       bi_ld_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), regfmt,
1554                    vecsize);
1555    } else {
1556       bi_ld_attr_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), regfmt,
1557                         vecsize);
1558    }
1559 
1560    bi_split_def(b, &instr->def);
1561 }
1562 
1563 static void
1564 bi_emit_lea_image_to(bi_builder *b, bi_index dest, nir_intrinsic_instr *instr)
1565 {
1566    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
1567    bool array =
1568       nir_intrinsic_image_array(instr) || dim == GLSL_SAMPLER_DIM_CUBE;
1569    unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
1570 
1571    enum bi_register_format type =
1572       (instr->intrinsic == nir_intrinsic_image_store)
1573          ? bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr))
1574          : BI_REGISTER_FORMAT_AUTO;
1575 
1576    bi_index coords = bi_src_index(&instr->src[1]);
1577    bi_index indices = bi_src_index(&instr->src[2]);
1578    bi_index xy, zw;
1579    bool is_ms = dim == GLSL_SAMPLER_DIM_MS;
1580    if (b->shader->arch < 9) {
1581       xy = bi_emit_image_coord(b, coords, 0, coord_comps, array, is_ms);
1582       zw = bi_emit_image_coord(b, coords, 1, coord_comps, array, is_ms);
1583    } else {
1584       xy =
1585          va_emit_image_coord(b, coords, indices, 0, coord_comps, array, is_ms);
1586       zw =
1587          va_emit_image_coord(b, coords, indices, 1, coord_comps, array, is_ms);
1588    }
1589 
1590    if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
1591       const unsigned raw_value = nir_src_as_uint(instr->src[0]);
1592       unsigned table_index = pan_res_handle_get_table(raw_value);
1593       unsigned texture_index = pan_res_handle_get_index(raw_value);
1594 
1595       if (texture_index < 16 && va_is_valid_const_table(table_index)) {
1596          bi_instr *I = bi_lea_tex_imm_to(b, dest, xy, zw, false, texture_index);
1597          I->table = va_res_fold_table_idx(table_index);
1598       } else {
1599          bi_lea_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), false);
1600       }
1601    } else if (b->shader->arch >= 9) {
1602       bi_lea_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), false);
1603    } else {
1604       bi_instr *I = bi_lea_attr_tex_to(b, dest, xy, zw,
1605                                        bi_src_index(&instr->src[0]), type);
1606 
1607       /* LEA_ATTR_TEX defaults to the secondary attribute table, but
1608        * our ABI has all images in the primary attribute table
1609        */
1610       I->table = BI_TABLE_ATTRIBUTE_1;
1611    }
1612 
1613    bi_emit_cached_split(b, dest, 3 * 32);
1614 }
1615 
1616 static bi_index
1617 bi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr)
1618 {
1619    bi_index dest = bi_temp(b->shader);
1620    bi_emit_lea_image_to(b, dest, instr);
1621    return dest;
1622 }
1623 
1624 static void
1625 bi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr)
1626 {
1627    bi_index a[4] = {bi_null()};
1628    bi_emit_split_i32(b, a, bi_emit_lea_image(b, instr), 3);
1629 
1630    /* Due to SPIR-V limitations, the source type is not fully reliable: it
1631     * reports uint32 even for write_imagei. This causes an incorrect
1632     * u32->s32->u32 roundtrip which incurs an unwanted clamping. Use auto32
1633     * instead, which will match per the OpenCL spec. Of course this does
1634     * not work for 16-bit stores, but those are not available in OpenCL.
1635     */
1636    nir_alu_type T = nir_intrinsic_src_type(instr);
1637    assert(nir_alu_type_get_type_size(T) == 32);
1638 
1639    bi_st_cvt(b, bi_src_index(&instr->src[3]), a[0], a[1], a[2],
1640              BI_REGISTER_FORMAT_AUTO, instr->num_components - 1);
1641 }
1642 
1643 static void
1644 bi_emit_atomic_i32_to(bi_builder *b, bi_index dst, bi_index addr, bi_index arg,
1645                       nir_atomic_op op)
1646 {
1647    enum bi_atom_opc opc = bi_atom_opc_for_nir(op);
1648    enum bi_atom_opc post_opc = opc;
1649    bool bifrost = b->shader->arch <= 8;
1650 
1651    /* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't
1652     * take any vector but can still output in RETURN mode */
1653    bi_index tmp_dest = bifrost ? bi_temp(b->shader) : dst;
1654    unsigned sr_count = bifrost ? 2 : 1;
1655 
1656    /* Generate either ATOM or ATOM1 as required */
1657    if (bi_promote_atom_c1(opc, arg, &opc)) {
1658       bi_atom1_return_i32_to(b, tmp_dest, bi_extract(b, addr, 0),
1659                              bi_extract(b, addr, 1), opc, sr_count);
1660    } else {
1661       bi_atom_return_i32_to(b, tmp_dest, arg, bi_extract(b, addr, 0),
1662                             bi_extract(b, addr, 1), opc, sr_count);
1663    }
1664 
1665    if (bifrost) {
1666       /* Post-process it */
1667       bi_emit_cached_split_i32(b, tmp_dest, 2);
1668       bi_atom_post_i32_to(b, dst, bi_extract(b, tmp_dest, 0),
1669                           bi_extract(b, tmp_dest, 1), post_opc);
1670    }
1671 }
1672 
1673 static void
1674 bi_emit_load_frag_coord_zw_pan(bi_builder *b, nir_intrinsic_instr *instr)
1675 {
1676    bi_index dst = bi_def_index(&instr->def);
1677    unsigned channel = nir_intrinsic_component(instr);
1678    nir_intrinsic_instr *bary = nir_src_as_intrinsic(instr->src[0]);
1679 
1680    enum bi_sample sample = bi_interp_for_intrinsic(bary->intrinsic);
1681    bi_index src0 = bi_varying_src0_for_barycentric(b, bary);
1682 
1683    /* .explicit is not supported with frag_z */
1684    if (channel == 2)
1685       assert(sample != BI_SAMPLE_EXPLICIT);
1686 
1687    bi_ld_var_special_to(
1688       b, dst, src0, BI_REGISTER_FORMAT_F32, sample, BI_UPDATE_CLOBBER,
1689       (channel == 2) ? BI_VARYING_NAME_FRAG_Z : BI_VARYING_NAME_FRAG_W,
1690       BI_VECSIZE_NONE);
1691 }
1692 
1693 static void
1694 bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
1695 {
1696    bi_index dest = bi_def_index(&instr->def);
1697    nir_alu_type T = nir_intrinsic_dest_type(instr);
1698    enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
1699    unsigned size = instr->def.bit_size;
1700    unsigned nr = instr->num_components;
1701 
1702    /* Get the render target */
1703    nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
1704    unsigned loc = sem.location;
1705    assert(loc >= FRAG_RESULT_DATA0);
1706    unsigned rt = (loc - FRAG_RESULT_DATA0);
1707 
1708    bi_ld_tile_to(b, dest, bi_pixel_indices(b, rt), bi_coverage(b),
1709                  bi_src_index(&instr->src[0]), regfmt, nr - 1);
1710    bi_emit_cached_split(b, dest, size * nr);
1711 }
1712 
1713 /*
1714  * Older Bifrost hardware has a limited CLPER instruction. Add a safe helper
1715  * that uses the hardware functionality if available and lowers otherwise.
1716  */
1717 static bi_index
1718 bi_clper(bi_builder *b, bi_index s0, bi_index s1, enum bi_lane_op lop)
1719 {
1720    if (b->shader->quirks & BIFROST_LIMITED_CLPER) {
1721       if (lop == BI_LANE_OP_XOR) {
1722          bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false);
1723          s1 = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0));
1724       } else {
1725          assert(lop == BI_LANE_OP_NONE);
1726       }
1727 
1728       return bi_clper_old_i32(b, s0, s1);
1729    } else {
1730       return bi_clper_i32(b, s0, s1, BI_INACTIVE_RESULT_ZERO, lop,
1731                           BI_SUBGROUP_SUBGROUP4);
1732    }
1733 }
1734 
1735 static void
1736 bi_emit_derivative(bi_builder *b, bi_index dst, nir_intrinsic_instr *instr,
1737                    unsigned axis, bool coarse)
1738 {
1739    bi_index left, right;
1740    bi_index s0 = bi_src_index(&instr->src[0]);
1741    unsigned sz = instr->def.bit_size;
1742 
1743    /* If all uses are fabs, the sign of the derivative doesn't matter. This is
1744     * inherently based on fine derivatives so we can't do it for coarse.
1745     */
1746    if (nir_def_all_uses_ignore_sign_bit(&instr->def) && !coarse) {
1747       left = s0;
1748       right = bi_clper(b, s0, bi_imm_u8(axis), BI_LANE_OP_XOR);
1749    } else {
1750       bi_index lane1, lane2;
1751       if (coarse) {
1752          lane1 = bi_imm_u32(0);
1753          lane2 = bi_imm_u32(axis);
1754       } else {
1755          lane1 = bi_lshift_and_i32(b, bi_fau(BIR_FAU_LANE_ID, false),
1756                                    bi_imm_u32(0x3 & ~axis), bi_imm_u8(0));
1757 
1758          lane2 = bi_iadd_u32(b, lane1, bi_imm_u32(axis), false);
1759       }
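      /* e.g. for fine ddx (axis = 1), assuming the usual lane = 2y + x quad
       * layout: lane1 = lane_id & 2 selects the left pixel of the quad row,
       * lane2 = lane1 + 1 its right neighbour, and the subtraction at the end
       * gives f(lane2) - f(lane1) (a sketch of the lane selection). */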
1760 
1761       left = bi_clper(b, s0, bi_byte(lane1, 0), BI_LANE_OP_NONE);
1762       right = bi_clper(b, s0, bi_byte(lane2, 0), BI_LANE_OP_NONE);
1763    }
1764 
1765    bi_fadd_to(b, sz, dst, right, bi_neg(left));
1766 }
1767 
1768 static enum bi_subgroup
1769 bi_subgroup_from_cluster_size(unsigned cluster_size)
1770 {
1771    switch (cluster_size) {
1772    case 2: return BI_SUBGROUP_SUBGROUP2;
1773    case 4: return BI_SUBGROUP_SUBGROUP4;
1774    case 8: return BI_SUBGROUP_SUBGROUP8;
1775    case 16: return BI_SUBGROUP_SUBGROUP16;
1776    default: unreachable("Unsupported cluster size");
1777    }
1778 }
1779 
1780 static void
1781 bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
1782 {
1783    bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest
1784                      ? bi_def_index(&instr->def)
1785                      : bi_null();
1786    gl_shader_stage stage = b->shader->stage;
1787 
1788    switch (instr->intrinsic) {
1789    case nir_intrinsic_load_barycentric_pixel:
1790    case nir_intrinsic_load_barycentric_centroid:
1791    case nir_intrinsic_load_barycentric_sample:
1792    case nir_intrinsic_load_barycentric_at_sample:
1793    case nir_intrinsic_load_barycentric_at_offset:
1794       /* handled later via load_vary */
1795       break;
1796    case nir_intrinsic_load_attribute_pan:
1797       assert(stage == MESA_SHADER_VERTEX);
1798       bi_emit_load_attr(b, instr);
1799       break;
1800 
1801    case nir_intrinsic_load_interpolated_input:
1802    case nir_intrinsic_load_input:
1803       if (b->shader->inputs->is_blend)
1804          bi_emit_load_blend_input(b, instr);
1805       else if (stage == MESA_SHADER_FRAGMENT)
1806          bi_emit_load_vary(b, instr);
1807       else if (stage == MESA_SHADER_VERTEX)
1808          bi_emit_load_attr(b, instr);
1809       else
1810          unreachable("Unsupported shader stage");
1811       break;
1812 
1813    case nir_intrinsic_store_output:
1814    case nir_intrinsic_store_per_view_output:
1815       if (stage == MESA_SHADER_FRAGMENT)
1816          bi_emit_fragment_out(b, instr);
1817       else if (stage == MESA_SHADER_VERTEX)
1818          bi_emit_store_vary(b, instr);
1819       else
1820          unreachable("Unsupported shader stage");
1821       break;
1822 
1823    case nir_intrinsic_store_combined_output_pan:
1824       assert(stage == MESA_SHADER_FRAGMENT);
1825       bi_emit_fragment_out(b, instr);
1826       break;
1827 
1828    case nir_intrinsic_load_ubo:
1829       bi_emit_load_ubo(b, instr);
1830       break;
1831 
1832    case nir_intrinsic_load_push_constant:
1833       bi_emit_load_push_constant(b, instr);
1834       break;
1835 
1836    case nir_intrinsic_load_global:
1837    case nir_intrinsic_load_global_constant:
1838       bi_emit_load(b, instr, BI_SEG_NONE);
1839       break;
1840 
1841    case nir_intrinsic_store_global:
1842       bi_emit_store(b, instr, BI_SEG_NONE);
1843       break;
1844 
1845    case nir_intrinsic_load_scratch:
1846       bi_emit_load(b, instr, BI_SEG_TL);
1847       break;
1848 
1849    case nir_intrinsic_store_scratch:
1850       bi_emit_store(b, instr, BI_SEG_TL);
1851       break;
1852 
1853    case nir_intrinsic_load_shared:
1854       bi_emit_load(b, instr, BI_SEG_WLS);
1855       break;
1856 
1857    case nir_intrinsic_store_shared:
1858       bi_emit_store(b, instr, BI_SEG_WLS);
1859       break;
1860 
1861    case nir_intrinsic_barrier:
1862       switch (nir_intrinsic_execution_scope(instr)) {
1863       case SCOPE_NONE:
1864          /*
1865           * No execution barrier, and we don't have to do anything for memory
1866           * barriers (see SCOPE_WORKGROUP case.)
1867           */
1868          break;
1869 
1870       case SCOPE_SUBGROUP:
1871          /*
1872           * To implement a subgroup barrier, we only need to prevent the
1873           * scheduler from reordering memory operations around the barrier.
1874           * Avail and vis are trivially established.
1875           */
1876          bi_memory_barrier(b);
1877          break;
1878 
1879       case SCOPE_WORKGROUP:
1880          assert(b->shader->stage == MESA_SHADER_COMPUTE);
1881          bi_barrier(b);
1882          /*
1883           * Blob doesn't seem to do anything for memory barriers, so no need to
1884           * check nir_intrinsic_memory_scope().
1885           */
1886          break;
1887 
1888       default:
1889          unreachable("Unsupported barrier scope");
1890       }
1891 
1892       break;
1893 
1894    case nir_intrinsic_shared_atomic: {
1895       nir_atomic_op op = nir_intrinsic_atomic_op(instr);
1896 
1897       if (op == nir_atomic_op_xchg) {
1898          bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1899                           BI_SEG_WLS);
1900       } else {
1901          assert(nir_src_bit_size(instr->src[1]) == 32);
1902 
1903          bi_index addr = bi_src_index(&instr->src[0]);
1904          bi_index addr_hi;
1905 
1906          if (b->shader->arch >= 9) {
1907             bi_handle_segment(b, &addr, &addr_hi, BI_SEG_WLS, NULL);
1908             addr = bi_collect_v2i32(b, addr, addr_hi);
1909          } else {
1910             addr = bi_seg_add_i64(b, addr, bi_zero(), false, BI_SEG_WLS);
1911             bi_emit_cached_split(b, addr, 64);
1912          }
1913 
1914          bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]), op);
1915       }
1916 
1917       bi_split_def(b, &instr->def);
1918       break;
1919    }
1920 
1921    case nir_intrinsic_global_atomic: {
1922       nir_atomic_op op = nir_intrinsic_atomic_op(instr);
1923 
1924       if (op == nir_atomic_op_xchg) {
1925          bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1926                           BI_SEG_NONE);
1927       } else {
1928          assert(nir_src_bit_size(instr->src[1]) == 32);
1929 
1930          bi_emit_atomic_i32_to(b, dst, bi_src_index(&instr->src[0]),
1931                                bi_src_index(&instr->src[1]), op);
1932       }
1933 
1934       bi_split_def(b, &instr->def);
1935       break;
1936    }
1937 
1938    case nir_intrinsic_image_texel_address:
1939       bi_emit_lea_image_to(b, dst, instr);
1940       break;
1941 
1942    case nir_intrinsic_image_load:
1943       bi_emit_image_load(b, instr);
1944       break;
1945 
1946    case nir_intrinsic_image_store:
1947       bi_emit_image_store(b, instr);
1948       break;
1949 
1950    case nir_intrinsic_global_atomic_swap:
1951       bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1952                           &instr->src[2], BI_SEG_NONE);
1953       bi_split_def(b, &instr->def);
1954       break;
1955 
1956    case nir_intrinsic_shared_atomic_swap:
1957       bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1958                           &instr->src[2], BI_SEG_WLS);
1959       bi_split_def(b, &instr->def);
1960       break;
1961 
1962    case nir_intrinsic_load_pixel_coord:
1963       /* Vectorized load of the preloaded i16vec2 */
1964       bi_mov_i32_to(b, dst, bi_preload(b, 59));
1965       break;
1966 
1967    case nir_intrinsic_load_frag_coord_zw_pan:
1968       bi_emit_load_frag_coord_zw_pan(b, instr);
1969       break;
1970 
1971    case nir_intrinsic_load_converted_output_pan:
1972       bi_emit_ld_tile(b, instr);
1973       break;
1974 
1975    case nir_intrinsic_terminate_if:
1976       bi_discard_b32(b, bi_src_index(&instr->src[0]));
1977       break;
1978 
1979    case nir_intrinsic_terminate:
1980       bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ);
1981       break;
1982 
1983    case nir_intrinsic_load_sample_positions_pan:
1984       bi_collect_v2i32_to(b, dst, bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false),
1985                           bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true));
1986       break;
1987 
1988    case nir_intrinsic_load_sample_mask_in:
1989       /* r61[0:15] contains the coverage bitmap */
1990       bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false));
1991       break;
1992 
1993    case nir_intrinsic_load_sample_mask:
1994       bi_mov_i32_to(b, dst, bi_coverage(b));
1995       break;
1996 
1997    case nir_intrinsic_load_sample_id:
1998       bi_load_sample_id_to(b, dst);
1999       break;
2000 
2001    case nir_intrinsic_load_front_face:
2002       /* r58 == 0 means primitive is front facing */
2003       bi_icmp_i32_to(b, dst, bi_preload(b, 58), bi_zero(), BI_CMPF_EQ,
2004                      BI_RESULT_TYPE_M1);
2005       break;
2006 
2007    case nir_intrinsic_load_point_coord:
2008       bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32,
2009                            BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER,
2010                            BI_VARYING_NAME_POINT, BI_VECSIZE_V2);
2011       bi_emit_cached_split_i32(b, dst, 2);
2012       break;
2013 
2014    /* It appears vertex_id is zero-based with Bifrost geometry flows, but
2015     * not with Valhall's memory-allocation IDVS geometry flow. We only support
2016     * the new flow on Valhall so this is lowered in NIR.
2017     */
2018    case nir_intrinsic_load_vertex_id:
2019       assert(b->shader->malloc_idvs);
2020       bi_mov_i32_to(b, dst, bi_vertex_id(b));
2021       break;
2022 
2023    case nir_intrinsic_load_raw_vertex_id_pan:
2024       assert(!b->shader->malloc_idvs);
2025       bi_mov_i32_to(b, dst, bi_vertex_id(b));
2026       break;
2027 
2028    case nir_intrinsic_load_instance_id:
2029       bi_mov_i32_to(b, dst, bi_instance_id(b));
2030       break;
2031 
2032    case nir_intrinsic_load_draw_id:
2033       bi_mov_i32_to(b, dst, bi_draw_id(b));
2034       break;
2035 
2036    case nir_intrinsic_load_subgroup_invocation:
2037       bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false));
2038       break;
2039 
2040    case nir_intrinsic_ballot:
2041    case nir_intrinsic_ballot_relaxed: {
2042       enum bi_subgroup subgroup =
2043          bi_subgroup_from_cluster_size(pan_subgroup_size(b->shader->arch));
2044       bi_wmask_to(b, dst, bi_src_index(&instr->src[0]), subgroup, 0);
2045       break;
2046    }
2047 
2048    case nir_intrinsic_read_invocation: {
2049       enum bi_inactive_result inactive_result = BI_INACTIVE_RESULT_ZERO;
2050       enum bi_lane_op lane_op = BI_LANE_OP_NONE;
2051       enum bi_subgroup subgroup =
2052          bi_subgroup_from_cluster_size(pan_subgroup_size(b->shader->arch));
2053       bi_clper_i32_to(b, dst,
2054                       bi_src_index(&instr->src[0]),
2055                       bi_byte(bi_src_index(&instr->src[1]), 0),
2056                       inactive_result, lane_op, subgroup);
2057       break;
2058    }
2059 
2060    case nir_intrinsic_load_local_invocation_id:
2061       bi_collect_v3i32_to(b, dst,
2062                           bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 0)),
2063                           bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 1)),
2064                           bi_u16_to_u32(b, bi_half(bi_preload(b, 56), 0)));
2065       break;
2066 
2067    case nir_intrinsic_load_workgroup_id:
2068       bi_collect_v3i32_to(b, dst, bi_preload(b, 57), bi_preload(b, 58),
2069                           bi_preload(b, 59));
2070       break;
2071 
2072    case nir_intrinsic_load_global_invocation_id:
2073       bi_collect_v3i32_to(b, dst, bi_preload(b, 60), bi_preload(b, 61),
2074                           bi_preload(b, 62));
2075       break;
2076 
2077    case nir_intrinsic_shader_clock:
2078       bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER);
2079       bi_split_def(b, &instr->def);
2080       break;
2081 
2082    case nir_intrinsic_ddx:
2083    case nir_intrinsic_ddx_fine:
2084       bi_emit_derivative(b, dst, instr, 1, false);
2085       break;
2086    case nir_intrinsic_ddx_coarse:
2087       bi_emit_derivative(b, dst, instr, 1, true);
2088       break;
2089    case nir_intrinsic_ddy:
2090    case nir_intrinsic_ddy_fine:
2091       bi_emit_derivative(b, dst, instr, 2, false);
2092       break;
2093    case nir_intrinsic_ddy_coarse:
2094       bi_emit_derivative(b, dst, instr, 2, true);
2095       break;
2096 
2097    case nir_intrinsic_load_view_index:
2098    case nir_intrinsic_load_layer_id:
2099       assert(b->shader->arch >= 9);
2100       bi_mov_i32_to(b, dst, bi_u8_to_u32(b, bi_byte(bi_preload(b, 62), 0)));
2101       break;
2102 
2103    case nir_intrinsic_load_ssbo_address:
2104       assert(b->shader->arch >= 9);
2105       bi_lea_buffer_to(b, dst, bi_src_index(&instr->src[1]),
2106                        bi_src_index(&instr->src[0]));
2107       bi_emit_cached_split(b, dst, 64);
2108       break;
2109 
2110    case nir_intrinsic_load_ssbo: {
2111       assert(b->shader->arch >= 9);
2112       unsigned dst_bits = instr->num_components * instr->def.bit_size;
2113       bi_ld_buffer_to(b, dst_bits, dst, bi_src_index(&instr->src[1]),
2114                       bi_src_index(&instr->src[0]));
2115       bi_emit_cached_split(b, dst, dst_bits);
2116       break;
2117    }
2118 
2119    case nir_intrinsic_as_uniform:
2120       /*
2121        * We don't have uniform registers (registers shared by all threads in the
2122        * warp) like some other hardware does, so this is just a simple mov for
2123        * us.
2124        */
2125       bi_mov_i32_to(b, dst, bi_src_index(&instr->src[0]));
2126       break;
2127 
2128    default:
2129       fprintf(stderr, "Unhandled intrinsic %s\n",
2130               nir_intrinsic_infos[instr->intrinsic].name);
2131       assert(0);
2132    }
2133 }
2134 
2135 static void
2136 bi_emit_load_const(bi_builder *b, nir_load_const_instr *instr)
2137 {
2138    /* Make sure we've been lowered */
2139    assert(instr->def.num_components <= (32 / instr->def.bit_size));
2140 
2141    /* Accumulate all the channels of the constant, as if we did an
2142     * implicit SEL over them */
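   /* e.g. a v2i16 constant (0x1234, 0xABCD) accumulates to 0xABCD1234 and a
    * v4i8 constant (1, 2, 3, 4) to 0x04030201. */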
2143    uint32_t acc = 0;
2144 
2145    for (unsigned i = 0; i < instr->def.num_components; ++i) {
2146       unsigned v =
2147          nir_const_value_as_uint(instr->value[i], instr->def.bit_size);
2148       acc |= (v << (i * instr->def.bit_size));
2149    }
2150 
2151    bi_mov_i32_to(b, bi_get_index(instr->def.index), bi_imm_u32(acc));
2152 }
2153 
2154 static bi_index
2155 bi_alu_src_index(bi_builder *b, nir_alu_src src, unsigned comps)
2156 {
2157    unsigned bitsize = nir_src_bit_size(src.src);
2158 
2159    /* the bi_index carries the 32-bit (word) offset separate from the
2160     * subword swizzle, first handle the offset */
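   /* e.g. for a 16-bit source, swizzle component 2 (.z) selects 32-bit word 1
    * and its low half, while component 3 (.w) selects word 1 and its high
    * half (a sketch of the split performed below). */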
2161 
2162    unsigned offset = 0;
2163 
2164    assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
2165    unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
2166 
2167    for (unsigned i = 0; i < comps; ++i) {
2168       unsigned new_offset = (src.swizzle[i] >> subword_shift);
2169 
2170       if (i > 0)
2171          assert(offset == new_offset && "wrong vectorization");
2172 
2173       offset = new_offset;
2174    }
2175 
2176    bi_index idx = bi_extract(b, bi_src_index(&src.src), offset);
2177 
2178    /* Compose the subword swizzle with existing (identity) swizzle */
2179    assert(idx.swizzle == BI_SWIZZLE_H01);
2180 
2181    /* Bigger vectors should have been lowered */
2182    assert(comps <= (1 << subword_shift));
2183 
2184    if (bitsize == 16) {
2185       unsigned c0 = src.swizzle[0] & 1;
2186       unsigned c1 = (comps > 1) ? src.swizzle[1] & 1 : c0;
2187       idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1);
2188    } else if (bitsize == 8 && comps == 1) {
2189       idx.swizzle = BI_SWIZZLE_B0000 + (src.swizzle[0] & 3);
2190    } else if (bitsize == 8) {
2191       /* XXX: Use optimized swizzle when possible */
2192       bi_index unoffset_srcs[NIR_MAX_VEC_COMPONENTS] = {bi_null()};
2193       unsigned channels[NIR_MAX_VEC_COMPONENTS] = {0};
2194 
2195       for (unsigned i = 0; i < comps; ++i) {
2196          unoffset_srcs[i] = bi_src_index(&src.src);
2197          channels[i] = src.swizzle[i];
2198       }
2199 
2200       bi_index temp = bi_temp(b->shader);
2201       bi_make_vec_to(b, temp, unoffset_srcs, channels, comps, bitsize);
2202 
2203       static const enum bi_swizzle swizzle_lut[] = {
2204          BI_SWIZZLE_B0000, BI_SWIZZLE_B0011, BI_SWIZZLE_H01, BI_SWIZZLE_H01};
2205       assert(comps - 1 < ARRAY_SIZE(swizzle_lut));
2206 
2207       /* Assign a coherent swizzle for the vector */
2208       temp.swizzle = swizzle_lut[comps - 1];
2209 
2210       return temp;
2211    }
2212 
2213    return idx;
2214 }
2215 
2216 static enum bi_round
2217 bi_nir_round(nir_op op)
2218 {
2219    switch (op) {
2220    case nir_op_fround_even:
2221       return BI_ROUND_NONE;
2222    case nir_op_ftrunc:
2223       return BI_ROUND_RTZ;
2224    case nir_op_fceil:
2225       return BI_ROUND_RTP;
2226    case nir_op_ffloor:
2227       return BI_ROUND_RTN;
2228    default:
2229       unreachable("invalid nir round op");
2230    }
2231 }
2232 
2233 /* Convenience for lowered transcendentals */
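/* (Presumably the -0.0 addend below is chosen because x + (-0.0) == x for
 * every x, so the FMA reproduces a plain multiply exactly, including the sign
 * of zero results.) */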
2234 
2235 static bi_index
2236 bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1)
2237 {
2238    return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f));
2239 }
2240 
2241 /* Approximate with FRCP_APPROX.f32 and apply a single iteration of
2242  * Newton-Raphson to improve precision */
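/* One refinement step for x ~= 1/a is x' = x + x(1 - ax); below it is applied
 * to the frexp mantissa/exponent split of the operand, and the rsqrt lowering
 * underneath uses the analogous x' = x + x(1 - ax^2)/2 (a sketch of how the
 * FMA_RSCALE sequences are structured). */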
2243 
2244 static void
2245 bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0)
2246 {
2247    bi_index x1 = bi_frcp_approx_f32(b, s0);
2248    bi_index m = bi_frexpm_f32(b, s0, false, false);
2249    bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, false);
2250    bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0), bi_zero(),
2251                                    BI_SPECIAL_N);
2252    bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e, BI_SPECIAL_NONE);
2253 }
2254 
2255 static void
2256 bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0)
2257 {
2258    bi_index x1 = bi_frsq_approx_f32(b, s0);
2259    bi_index m = bi_frexpm_f32(b, s0, false, true);
2260    bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, true);
2261    bi_index t1 = bi_fmul_f32(b, x1, x1);
2262    bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0),
2263                                    bi_imm_u32(-1), BI_SPECIAL_N);
2264    bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e, BI_SPECIAL_N);
2265 }
2266 
2267 /* More complex transcendentals, see
2268  * https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc
2269  * for documentation */
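/* A reading of the magic constants below: 0x49400000 is 1.5 * 2^19, whose ulp
 * is 1/16, so adding it rounds the argument to a multiple of 1/16 and leaves
 * that value in 4-bit fixed point in the low mantissa bits; the integer part
 * is then recovered with the arithmetic shift by 4, and 0x3f317218 is ln(2)
 * for the polynomial evaluating 2^x on the residual. */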
2270 
2271 static void
2272 bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0)
2273 {
2274    bi_index t1 = bi_temp(b->shader);
2275    bi_instr *t1_instr = bi_fadd_f32_to(b, t1, s0, bi_imm_u32(0x49400000));
2276    t1_instr->clamp = BI_CLAMP_CLAMP_0_INF;
2277 
2278    bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000));
2279 
2280    bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader), s0, bi_neg(t2));
2281    a2->clamp = BI_CLAMP_CLAMP_M1_1;
2282 
2283    bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE);
2284    bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false);
2285    bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4));
2286    bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635),
2287                             bi_imm_u32(0x3e75fffa));
2288    bi_index p2 = bi_fma_f32(b, p1, a2->dest[0], bi_imm_u32(0x3f317218));
2289    bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2);
2290    bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader), p3, a1t, a1t, a1i,
2291                                       BI_SPECIAL_NONE);
2292    x->clamp = BI_CLAMP_CLAMP_0_INF;
2293 
2294    bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0);
2295    max->sem = BI_SEM_NAN_PROPAGATE;
2296 }
2297 
2298 static void
2299 bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base)
2300 {
2301    /* Scale by the base, multiply by 2^24, and convert to integer to get an 8:24
2302     * fixed-point input */
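   /* e.g. exp2(2.5) with log2_base = 1.0: scale = 2.5 * 2^24, so fixed_pt =
    * 0x02800000, i.e. integer part 2 in the top 8 bits and fraction 0.5 in
    * the low 24 bits (a worked sketch of the 8:24 encoding). */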
2303    bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(),
2304                                       bi_imm_u32(24), BI_SPECIAL_NONE);
2305    bi_instr *fixed_pt = bi_f32_to_s32_to(b, bi_temp(b->shader), scale);
2306    fixed_pt->round = BI_ROUND_NONE; // XXX
2307 
2308    /* Compute the result for the fixed-point input, but pass along
2309     * the floating-point scale for correct NaN propagation */
2310    bi_fexp_f32_to(b, dst, fixed_pt->dest[0], scale);
2311 }
2312 
2313 static void
2314 bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
2315 {
2316    /* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */
2317    bi_index a1 = bi_frexpm_f32(b, s0, true, false);
2318    bi_index ei = bi_frexpe_f32(b, s0, true, false);
2319    bi_index ef = bi_s32_to_f32(b, ei);
2320 
2321    /* xt estimates -log(r1), a coarse approximation of log(a1) */
2322    bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE);
2323    bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE);
2324 
2325    /* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) -
2326     * log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1),
2327     * and then log(s0) = x1 + x2 */
2328    bi_index x1 = bi_fadd_f32(b, ef, xt);
2329 
2330    /* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by
2331     * polynomial approximation around 1. The series is expressed around
2332     * 1, so set y = (a1 * r1) - 1.0 */
2333    bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0));
2334 
2335    /* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate
2336     * log_e(1 + y) by the Taylor series (lower precision than the blob):
2337     * y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */
2338    bi_index loge =
2339       bi_fmul_f32(b, y, bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0)));
2340 
2341    bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0)));
2342 
2343    /* log(s0) = x1 + x2 */
2344    bi_fadd_f32_to(b, dst, x1, x2);
2345 }
2346 
2347 static void
2348 bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
2349 {
2350    bi_index frexp = bi_frexpe_f32(b, s0, true, false);
2351    bi_index frexpi = bi_s32_to_f32(b, frexp);
2352    bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0);
2353    bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi);
2354 }
2355 
2356 static void
2357 bi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
2358 {
2359    bi_index log2_base = bi_null();
2360 
2361    if (base.type == BI_INDEX_CONSTANT) {
2362       log2_base = bi_imm_f32(log2f(uif(base.value)));
2363    } else {
2364       log2_base = bi_temp(b->shader);
2365       bi_lower_flog2_32(b, log2_base, base);
2366    }
2367 
2368    return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base));
2369 }
2370 
2371 static void
2372 bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
2373 {
2374    bi_index log2_base = bi_null();
2375 
2376    if (base.type == BI_INDEX_CONSTANT) {
2377       log2_base = bi_imm_f32(log2f(uif(base.value)));
2378    } else {
2379       log2_base = bi_temp(b->shader);
2380       bi_flog2_32(b, log2_base, base);
2381    }
2382 
2383    return bi_fexp_32(b, dst, exp, log2_base);
2384 }
2385 
2386 /* Bifrost has extremely coarse tables for approximating sin/cos, accessible as
2387  * FSIN/COS_TABLE.u6, which multiplies the bottom 6-bits by pi/32 and
2388  * calculates the results. We use them to calculate sin/cos via a Taylor
2389  * approximation:
2390  *
2391  * f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x)
2392  * sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x)
2393  * cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x)
2394  */
2395 
2396 #define TWO_OVER_PI  bi_imm_f32(2.0f / 3.14159f)
2397 #define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0)
2398 #define SINCOS_BIAS  bi_imm_u32(0x49400000)
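/* A sketch of the reduction: SINCOS_BIAS is 1.5 * 2^19, whose ulp is 1/16, so
 * the FMA below leaves round(s0 * (2/pi) * 16) in the low mantissa bits of
 * x_u6, and one lsb of the .u6 table input corresponds to pi/32 radians; e
 * then recovers the rounding error in radians via the -pi/2 multiply. */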
2399 
2400 static void
2401 bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
2402 {
2403    /* the bottom 6 bits of the result, times pi/32, approximate s0 mod 2pi */
2404    bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS);
2405 
2406    /* Approximate domain error (small) */
2407    bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS)),
2408                            MPI_OVER_TWO, s0);
2409 
2410    /* Lookup sin(x), cos(x) */
2411    bi_index sinx = bi_fsin_table_u6(b, x_u6, false);
2412    bi_index cosx = bi_fcos_table_u6(b, x_u6, false);
2413 
2414    /* e^2 / 2 */
2415    bi_index e2_over_2 =
2416       bi_fma_rscale_f32(b, e, e, bi_negzero(), bi_imm_u32(-1), BI_SPECIAL_NONE);
2417 
2418    /* (-e^2)/2 f''(x) */
2419    bi_index quadratic =
2420       bi_fma_f32(b, bi_neg(e2_over_2), cos ? cosx : sinx, bi_negzero());
2421 
2422    /* e f'(x) - (e^2/2) f''(x) */
2423    bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e,
2424                                cos ? bi_neg(sinx) : cosx, quadratic);
2425    I->clamp = BI_CLAMP_CLAMP_M1_1;
2426 
2427    /* f(x) + e f'(x) - (e^2/2) f''(x) */
2428    bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx);
2429 }
2430 
2431 static enum bi_cmpf
2432 bi_translate_cmpf(nir_op op)
2433 {
2434    switch (op) {
2435    case nir_op_ieq8:
2436    case nir_op_ieq16:
2437    case nir_op_ieq32:
2438    case nir_op_feq16:
2439    case nir_op_feq32:
2440       return BI_CMPF_EQ;
2441 
2442    case nir_op_ine8:
2443    case nir_op_ine16:
2444    case nir_op_ine32:
2445    case nir_op_fneu16:
2446    case nir_op_fneu32:
2447       return BI_CMPF_NE;
2448 
2449    case nir_op_ilt8:
2450    case nir_op_ilt16:
2451    case nir_op_ilt32:
2452    case nir_op_flt16:
2453    case nir_op_flt32:
2454    case nir_op_ult8:
2455    case nir_op_ult16:
2456    case nir_op_ult32:
2457       return BI_CMPF_LT;
2458 
2459    case nir_op_ige8:
2460    case nir_op_ige16:
2461    case nir_op_ige32:
2462    case nir_op_fge16:
2463    case nir_op_fge32:
2464    case nir_op_uge8:
2465    case nir_op_uge16:
2466    case nir_op_uge32:
2467       return BI_CMPF_GE;
2468 
2469    default:
2470       unreachable("invalid comparison");
2471    }
2472 }
2473 
2474 static bool
2475 bi_nir_is_replicated(nir_alu_src *src)
2476 {
2477    for (unsigned i = 1; i < nir_src_num_components(src->src); ++i) {
2478       if (src->swizzle[0] != src->swizzle[i])
2479          return false;
2480    }
2481 
2482    return true;
2483 }
2484 
2485 static void
2486 bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
2487 {
2488    bi_index dst = bi_def_index(&instr->def);
2489    unsigned srcs = nir_op_infos[instr->op].num_inputs;
2490    unsigned sz = instr->def.bit_size;
2491    unsigned comps = instr->def.num_components;
2492    unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0;
2493 
2494    /* Indicate scalarness */
2495    if (sz == 16 && comps == 1)
2496       dst.swizzle = BI_SWIZZLE_H00;
2497 
2498    /* First, match against the various moves in NIR. These are
2499     * special-cased because they can operate on vectors even after
2500     * lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the
2501     * instruction is no "bigger" than SIMD-within-a-register. These moves
2502     * are the exceptions that need to handle swizzles specially. */
2503 
2504    switch (instr->op) {
2505    case nir_op_vec2:
2506    case nir_op_vec3:
2507    case nir_op_vec4:
2508    case nir_op_vec8:
2509    case nir_op_vec16: {
2510       bi_index unoffset_srcs[16] = {bi_null()};
2511       unsigned channels[16] = {0};
2512 
2513       for (unsigned i = 0; i < srcs; ++i) {
2514          unoffset_srcs[i] = bi_src_index(&instr->src[i].src);
2515          channels[i] = instr->src[i].swizzle[0];
2516       }
2517 
2518       bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz);
2519       return;
2520    }
2521 
2522    case nir_op_unpack_32_2x16: {
2523       /* Should have been scalarized */
2524       assert(comps == 2 && sz == 16);
2525 
2526       bi_index vec = bi_src_index(&instr->src[0].src);
2527       unsigned chan = instr->src[0].swizzle[0];
2528 
2529       bi_mov_i32_to(b, dst, bi_extract(b, vec, chan));
2530       return;
2531    }
2532 
2533    case nir_op_unpack_64_2x32_split_x: {
2534       unsigned chan = (instr->src[0].swizzle[0] * 2) + 0;
2535       bi_mov_i32_to(b, dst,
2536                     bi_extract(b, bi_src_index(&instr->src[0].src), chan));
2537       return;
2538    }
2539 
2540    case nir_op_unpack_64_2x32_split_y: {
2541       unsigned chan = (instr->src[0].swizzle[0] * 2) + 1;
2542       bi_mov_i32_to(b, dst,
2543                     bi_extract(b, bi_src_index(&instr->src[0].src), chan));
2544       return;
2545    }
2546 
2547    case nir_op_pack_64_2x32_split:
2548       bi_collect_v2i32_to(b, dst,
2549                           bi_extract(b, bi_src_index(&instr->src[0].src),
2550                                      instr->src[0].swizzle[0]),
2551                           bi_extract(b, bi_src_index(&instr->src[1].src),
2552                                      instr->src[1].swizzle[0]));
2553       return;
2554 
2555    case nir_op_pack_64_2x32:
2556       bi_collect_v2i32_to(b, dst,
2557                           bi_extract(b, bi_src_index(&instr->src[0].src),
2558                                      instr->src[0].swizzle[0]),
2559                           bi_extract(b, bi_src_index(&instr->src[0].src),
2560                                      instr->src[0].swizzle[1]));
2561       return;
2562 
2563    case nir_op_pack_uvec2_to_uint: {
2564       bi_index src = bi_src_index(&instr->src[0].src);
2565 
2566       assert(sz == 32 && src_sz == 32);
2567       bi_mkvec_v2i16_to(
2568          b, dst, bi_half(bi_extract(b, src, instr->src[0].swizzle[0]), false),
2569          bi_half(bi_extract(b, src, instr->src[0].swizzle[1]), false));
2570       return;
2571    }
2572 
2573    case nir_op_pack_uvec4_to_uint: {
2574       bi_index src = bi_src_index(&instr->src[0].src);
2575 
2576       assert(sz == 32 && src_sz == 32);
2577 
2578       bi_index srcs[4] = {
2579          bi_extract(b, src, instr->src[0].swizzle[0]),
2580          bi_extract(b, src, instr->src[0].swizzle[1]),
2581          bi_extract(b, src, instr->src[0].swizzle[2]),
2582          bi_extract(b, src, instr->src[0].swizzle[3]),
2583       };
2584       unsigned channels[4] = {0};
2585       bi_make_vec_to(b, dst, srcs, channels, 4, 8);
2586       return;
2587    }
2588 
2589    case nir_op_mov: {
2590       bi_index idx = bi_src_index(&instr->src[0].src);
2591       bi_index unoffset_srcs[4] = {idx, idx, idx, idx};
2592 
2593       unsigned channels[4] = {
2594          comps > 0 ? instr->src[0].swizzle[0] : 0,
2595          comps > 1 ? instr->src[0].swizzle[1] : 0,
2596          comps > 2 ? instr->src[0].swizzle[2] : 0,
2597          comps > 3 ? instr->src[0].swizzle[3] : 0,
2598       };
2599 
2600       bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, src_sz);
2601       return;
2602    }
2603 
2604    case nir_op_pack_32_2x16: {
2605       assert(comps == 1);
2606 
2607       bi_index idx = bi_src_index(&instr->src[0].src);
2608       bi_index unoffset_srcs[4] = {idx, idx, idx, idx};
2609 
2610       unsigned channels[2] = {instr->src[0].swizzle[0],
2611                               instr->src[0].swizzle[1]};
2612 
2613       bi_make_vec_to(b, dst, unoffset_srcs, channels, 2, 16);
2614       return;
2615    }
2616 
2617    case nir_op_f2f16:
2618    case nir_op_f2f16_rtz:
2619    case nir_op_f2f16_rtne: {
2620       assert(src_sz == 32);
2621       bi_index idx = bi_src_index(&instr->src[0].src);
2622       bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2623       bi_index s1 =
2624          comps > 1 ? bi_extract(b, idx, instr->src[0].swizzle[1]) : s0;
2625 
2626       bi_instr *I = bi_v2f32_to_v2f16_to(b, dst, s0, s1);
2627 
2628       /* Override rounding if explicitly requested. Otherwise, the
2629        * default rounding mode is selected by the builder. Depending
2630        * on the float controls required by the shader, the default
2631        * mode may not be nearest-even.
2632        */
2633       if (instr->op == nir_op_f2f16_rtz)
2634          I->round = BI_ROUND_RTZ;
2635       else if (instr->op == nir_op_f2f16_rtne)
2636          I->round = BI_ROUND_NONE; /* Nearest even */
2637 
2638       return;
2639    }
2640 
2641    /* Vectorized downcasts */
2642    case nir_op_u2u16:
2643    case nir_op_i2i16: {
2644       if (!(src_sz == 32 && comps == 2))
2645          break;
2646 
2647       bi_index idx = bi_src_index(&instr->src[0].src);
2648       bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2649       bi_index s1 = bi_extract(b, idx, instr->src[0].swizzle[1]);
2650 
2651       bi_mkvec_v2i16_to(b, dst, bi_half(s0, false), bi_half(s1, false));
2652       return;
2653    }
2654 
2655    /* Pre-v11, we can get vector i2f32 by lowering 32-bit vector i2f16 to
2656     * i2f32 + f2f16 in bifrost_nir_lower_algebraic_late, which runs after
2657     * nir_opt_vectorize. We don't scalarize i2f32 earlier because we have
2658     * vector V2F16_TO_V2F32. */
2659    case nir_op_i2f32:
2660    case nir_op_u2f32: {
2661       if (!(src_sz == 32 && comps == 2))
2662          break;
2663 
2664       nir_alu_src *src = &instr->src[0];
2665       bi_index idx = bi_src_index(&src->src);
2666       bi_index s0 = bi_extract(b, idx, src->swizzle[0]);
2667       bi_index s1 = bi_extract(b, idx, src->swizzle[1]);
2668 
2669       bi_index d0, d1;
2670       if (instr->op == nir_op_i2f32) {
2671          d0 = bi_s32_to_f32(b, s0);
2672          d1 = bi_s32_to_f32(b, s1);
2673       } else {
2674          d0 = bi_u32_to_f32(b, s0);
2675          d1 = bi_u32_to_f32(b, s1);
2676       }
2677 
2678       bi_collect_v2i32_to(b, dst, d0, d1);
2679 
2680       return;
2681    }
2682 
2683    case nir_op_i2i8:
2684    case nir_op_u2u8: {
2685       /* Acts like an 8-bit swizzle */
2686       bi_index idx = bi_src_index(&instr->src[0].src);
2687       unsigned factor = src_sz / 8;
2688       unsigned chan[4] = {0};
2689 
2690       for (unsigned i = 0; i < comps; ++i)
2691          chan[i] = instr->src[0].swizzle[i] * factor;
2692 
2693       bi_make_vec_to(b, dst, &idx, chan, comps, 8);
2694       return;
2695    }
2696 
2697    case nir_op_b32csel: {
2698       if (sz != 16)
2699          break;
2700 
2701       /* We allow vectorizing b32csel(cond, A, B) which can be
2702        * translated as MUX.v2i16, even though cond is a 32-bit vector.
2703        *
2704        * If the source condition vector is replicated, we can use
2705        * MUX.v2i16 directly, letting each component use the
2706        * corresponding half of the 32-bit source. NIR uses 0/~0
2707        * booleans so that's guaranteed to work (that is, 32-bit NIR
2708        * booleans are 16-bit replicated).
2709        *
2710        * If we're not replicated, we use the same trick but must
2711        * insert a MKVEC.v2i16 first to convert down to 16-bit.
2712        */
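      /* e.g. a 16-bit b32csel whose condition swizzle is replicated (.xx) maps
       * directly to MUX.v2i16 on the extracted condition word; with distinct
       * .xy components the two condition halves are first packed with
       * MKVEC.v2i16 (a sketch of the two paths below). */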
2713       bi_index idx = bi_src_index(&instr->src[0].src);
2714       bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2715       bi_index s1 = bi_alu_src_index(b, instr->src[1], comps);
2716       bi_index s2 = bi_alu_src_index(b, instr->src[2], comps);
2717 
2718       if (!bi_nir_is_replicated(&instr->src[0])) {
2719          s0 = bi_mkvec_v2i16(
2720             b, bi_half(s0, false),
2721             bi_half(bi_extract(b, idx, instr->src[0].swizzle[1]), false));
2722       }
2723 
2724       bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2725       return;
2726    }
2727 
2728    default:
2729       break;
2730    }
2731 
2732    bi_index s0 =
2733       srcs > 0 ? bi_alu_src_index(b, instr->src[0], comps) : bi_null();
2734    bi_index s1 =
2735       srcs > 1 ? bi_alu_src_index(b, instr->src[1], comps) : bi_null();
2736    bi_index s2 =
2737       srcs > 2 ? bi_alu_src_index(b, instr->src[2], comps) : bi_null();
2738 
2739    switch (instr->op) {
2740    case nir_op_ffma:
2741       bi_fma_to(b, sz, dst, s0, s1, s2);
2742       break;
2743 
2744    case nir_op_fmul:
2745       bi_fma_to(b, sz, dst, s0, s1, bi_negzero());
2746       break;
2747 
2748    case nir_op_fadd:
2749       bi_fadd_to(b, sz, dst, s0, s1);
2750       break;
2751 
2752    case nir_op_fsat: {
2753       bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2754       I->clamp = BI_CLAMP_CLAMP_0_1;
2755       break;
2756    }
2757 
2758    case nir_op_fsat_signed: {
2759       bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2760       I->clamp = BI_CLAMP_CLAMP_M1_1;
2761       break;
2762    }
2763 
2764    case nir_op_fclamp_pos: {
2765       bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2766       I->clamp = BI_CLAMP_CLAMP_0_INF;
2767       break;
2768    }
2769 
2770    case nir_op_fneg:
2771       bi_fabsneg_to(b, sz, dst, bi_neg(s0));
2772       break;
2773 
2774    case nir_op_fabs:
2775       bi_fabsneg_to(b, sz, dst, bi_abs(s0));
2776       break;
2777 
2778    case nir_op_fsin:
2779       bi_lower_fsincos_32(b, dst, s0, false);
2780       break;
2781 
2782    case nir_op_fcos:
2783       bi_lower_fsincos_32(b, dst, s0, true);
2784       break;
2785 
2786    case nir_op_fexp2:
2787       assert(sz == 32); /* should've been lowered */
2788 
2789       if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2790          bi_lower_fexp2_32(b, dst, s0);
2791       else
2792          bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f));
2793 
2794       break;
2795 
2796    case nir_op_flog2:
2797       assert(sz == 32); /* should've been lowered */
2798 
2799       if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2800          bi_lower_flog2_32(b, dst, s0);
2801       else
2802          bi_flog2_32(b, dst, s0);
2803 
2804       break;
2805 
2806    case nir_op_fpow:
2807       assert(sz == 32); /* should've been lowered */
2808 
2809       if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2810          bi_lower_fpow_32(b, dst, s0, s1);
2811       else
2812          bi_fpow_32(b, dst, s0, s1);
2813 
2814       break;
2815 
2816    case nir_op_frexp_exp:
2817       bi_frexpe_to(b, sz, dst, s0, false, false);
2818       break;
2819 
2820    case nir_op_frexp_sig:
2821       bi_frexpm_to(b, sz, dst, s0, false, false);
2822       break;
2823 
2824    case nir_op_ldexp:
2825       bi_ldexp_to(b, sz, dst, s0, s1);
2826       break;
2827 
2828    case nir_op_b8csel:
2829       bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2830       break;
2831 
2832    case nir_op_b16csel:
2833       bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2834       break;
2835 
2836    case nir_op_b32csel:
2837       bi_mux_i32_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2838       break;
2839 
2840    case nir_op_extract_u8:
2841    case nir_op_extract_i8: {
2842       assert(comps == 1 && "should be scalarized");
2843       assert((src_sz == 16 || src_sz == 32) && "should be lowered");
2844       unsigned byte = nir_alu_src_as_uint(instr->src[1]);
2845 
2846       if (s0.swizzle == BI_SWIZZLE_H11) {
2847          assert(byte < 2);
2848          byte += 2;
2849       } else if (s0.swizzle != BI_SWIZZLE_H01) {
2850          assert(s0.swizzle == BI_SWIZZLE_H00);
2851       }
2852 
2853       assert(byte < 4);
2854 
2855       s0.swizzle = BI_SWIZZLE_H01;
2856 
2857       if (instr->op == nir_op_extract_i8)
2858          bi_s8_to_s32_to(b, dst, bi_byte(s0, byte));
2859       else
2860          bi_u8_to_u32_to(b, dst, bi_byte(s0, byte));
2861       break;
2862    }
2863 
2864    case nir_op_extract_u16:
2865    case nir_op_extract_i16: {
2866       assert(comps == 1 && "should be scalarized");
2867       assert(src_sz == 32 && "should be lowered");
2868       unsigned half = nir_alu_src_as_uint(instr->src[1]);
2869       assert(half == 0 || half == 1);
2870 
2871       if (instr->op == nir_op_extract_i16)
2872          bi_s16_to_s32_to(b, dst, bi_half(s0, half));
2873       else
2874          bi_u16_to_u32_to(b, dst, bi_half(s0, half));
2875       break;
2876    }
2877 
2878    case nir_op_insert_u16: {
2879       assert(comps == 1 && "should be scalarized");
2880       unsigned half = nir_alu_src_as_uint(instr->src[1]);
2881       assert(half == 0 || half == 1);
2882 
2883       if (half == 0)
2884          bi_u16_to_u32_to(b, dst, bi_half(s0, 0));
2885       else
2886          bi_mkvec_v2i16_to(b, dst, bi_imm_u16(0), bi_half(s0, 0));
2887       break;
2888    }
2889 
2890    case nir_op_ishl:
2891       bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
2892       break;
2893    case nir_op_ushr:
2894       bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), false);
2895       break;
2896 
2897    case nir_op_ishr:
2898       if (b->shader->arch >= 9)
2899          bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), true);
2900       else
2901          bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0));
2902       break;
2903 
2904    case nir_op_imin:
2905    case nir_op_umin:
2906       bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, s0, s1, s0,
2907                  s1, BI_CMPF_LT);
2908       break;
2909 
2910    case nir_op_imax:
2911    case nir_op_umax:
2912       bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, s0, s1, s0,
2913                  s1, BI_CMPF_GT);
2914       break;
2915 
2916    case nir_op_f2f32:
2917       bi_f16_to_f32_to(b, dst, s0);
2918       break;
2919 
2920    case nir_op_fquantize2f16: {
2921       bi_instr *f16 = bi_v2f32_to_v2f16_to(b, bi_temp(b->shader), s0, s0);
2922 
2923       if (b->shader->arch < 9) {
2924          /* Bifrost has pseudo-ftz on conversions, which is lowered to an ftz
2925           * flag in the clause header */
2926          f16->ftz = true;
2927       } else {
2928          /* Valhall doesn't have clauses, and uses a separate flush
2929           * instruction */
2930          f16 = bi_flush_to(b, 16, bi_temp(b->shader), f16->dest[0]);
2931          f16->ftz = true;
2932       }
2933 
2934       bi_instr *f32 = bi_f16_to_f32_to(b, dst, bi_half(f16->dest[0], false));
2935 
2936       if (b->shader->arch < 9)
2937          f32->ftz = true;
2938 
2939       break;
2940    }
2941 
2942    case nir_op_f2i32:
2943       if (src_sz == 32)
2944          bi_f32_to_s32_to(b, dst, s0);
2945       else
2946          bi_f16_to_s32_to(b, dst, s0);
2947       break;
2948 
2949    /* Note 32-bit sources => no vectorization, so 32-bit works */
2950    case nir_op_f2u16:
2951       if (src_sz == 32)
2952          bi_f32_to_u32_to(b, dst, s0);
2953       else
2954          bi_v2f16_to_v2u16_to(b, dst, s0);
2955       break;
2956 
2957    case nir_op_f2i16:
2958       if (src_sz == 32)
2959          bi_f32_to_s32_to(b, dst, s0);
2960       else
2961          bi_v2f16_to_v2s16_to(b, dst, s0);
2962       break;
2963 
2964    case nir_op_f2u32:
2965       if (src_sz == 32)
2966          bi_f32_to_u32_to(b, dst, s0);
2967       else
2968          bi_f16_to_u32_to(b, dst, s0);
2969       break;
2970 
2971    case nir_op_u2f16:
2972       /* V2I32_TO_V2F16 does not exist */
2973       assert((src_sz == 16 || src_sz == 8) && "should be lowered");
2974 
2975       if (src_sz == 16)
2976          bi_v2u16_to_v2f16_to(b, dst, s0);
2977       else if (src_sz == 8)
2978          bi_v2u8_to_v2f16_to(b, dst, s0);
2979       break;
2980 
2981    case nir_op_u2f32:
2982       if (src_sz == 32)
2983          bi_u32_to_f32_to(b, dst, s0);
2984       else if (src_sz == 16)
2985          bi_u16_to_f32_to(b, dst, s0);
2986       else
2987          bi_u8_to_f32_to(b, dst, s0);
2988       break;
2989 
2990    case nir_op_i2f16:
2991       /* V2I32_TO_V2F16 does not exist */
2992       assert((src_sz == 16 || src_sz == 8) && "should be lowered");
2993 
2994       if (src_sz == 16)
2995          bi_v2s16_to_v2f16_to(b, dst, s0);
2996       else if (src_sz == 8)
2997          bi_v2s8_to_v2f16_to(b, dst, s0);
2998       break;
2999 
3000    case nir_op_i2f32:
3001       assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
3002 
3003       if (src_sz == 32)
3004          bi_s32_to_f32_to(b, dst, s0);
3005       else if (src_sz == 16)
3006          bi_s16_to_f32_to(b, dst, s0);
3007       else if (src_sz == 8)
3008          bi_s8_to_f32_to(b, dst, s0);
3009       break;
3010 
3011    case nir_op_i2i32:
3012       assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
3013 
3014       if (src_sz == 32)
3015          bi_mov_i32_to(b, dst, s0);
3016       else if (src_sz == 16)
3017          bi_s16_to_s32_to(b, dst, s0);
3018       else if (src_sz == 8)
3019          bi_s8_to_s32_to(b, dst, s0);
3020       break;
3021 
3022    case nir_op_u2u32:
3023       assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
3024 
3025       if (src_sz == 32)
3026          bi_mov_i32_to(b, dst, s0);
3027       else if (src_sz == 16)
3028          bi_u16_to_u32_to(b, dst, s0);
3029       else if (src_sz == 8)
3030          bi_u8_to_u32_to(b, dst, s0);
3031 
3032       break;
3033 
3034    case nir_op_i2i16:
3035       assert(src_sz == 8 || src_sz == 32);
3036 
3037       if (src_sz == 8)
3038          bi_v2s8_to_v2s16_to(b, dst, s0);
3039       else
3040          bi_mov_i32_to(b, dst, s0);
3041       break;
3042 
3043    case nir_op_u2u16:
3044       assert(src_sz == 8 || src_sz == 32);
3045 
3046       if (src_sz == 8)
3047          bi_v2u8_to_v2u16_to(b, dst, s0);
3048       else
3049          bi_mov_i32_to(b, dst, s0);
3050       break;
3051 
3052    case nir_op_b2i8:
3053    case nir_op_b2i16:
3054    case nir_op_b2i32:
3055       bi_mux_to(b, sz, dst, bi_imm_u8(0), bi_imm_uintN(1, sz), s0,
3056                 BI_MUX_INT_ZERO);
3057       break;
3058 
3059    case nir_op_ieq8:
3060    case nir_op_ine8:
3061    case nir_op_ilt8:
3062    case nir_op_ige8:
3063    case nir_op_ieq16:
3064    case nir_op_ine16:
3065    case nir_op_ilt16:
3066    case nir_op_ige16:
3067    case nir_op_ieq32:
3068    case nir_op_ine32:
3069    case nir_op_ilt32:
3070    case nir_op_ige32:
3071       bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, bi_translate_cmpf(instr->op),
3072                  BI_RESULT_TYPE_M1);
3073       break;
3074 
3075    case nir_op_ult8:
3076    case nir_op_uge8:
3077    case nir_op_ult16:
3078    case nir_op_uge16:
3079    case nir_op_ult32:
3080    case nir_op_uge32:
3081       bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1,
3082                  bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1);
3083       break;
3084 
3085    case nir_op_feq32:
3086    case nir_op_feq16:
3087    case nir_op_flt32:
3088    case nir_op_flt16:
3089    case nir_op_fge32:
3090    case nir_op_fge16:
3091    case nir_op_fneu32:
3092    case nir_op_fneu16:
3093       bi_fcmp_to(b, sz, dst, s0, s1, bi_translate_cmpf(instr->op),
3094                  BI_RESULT_TYPE_M1);
3095       break;
3096 
3097    case nir_op_fround_even:
3098    case nir_op_fceil:
3099    case nir_op_ffloor:
3100    case nir_op_ftrunc:
3101       bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op));
3102       break;
3103 
3104    case nir_op_fmin:
3105       bi_fmin_to(b, sz, dst, s0, s1);
3106       break;
3107 
3108    case nir_op_fmax:
3109       bi_fmax_to(b, sz, dst, s0, s1);
3110       break;
3111 
3112    case nir_op_iadd:
3113       bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false);
3114       break;
3115 
3116    case nir_op_iadd_sat:
3117       bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true);
3118       break;
3119 
3120    case nir_op_uadd_sat:
3121       bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true);
3122       break;
3123 
3124    case nir_op_ihadd:
3125       bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN);
3126       break;
3127 
3128    case nir_op_irhadd:
3129       bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP);
3130       break;
3131 
3132    case nir_op_uhadd:
3133       bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTN);
3134       break;
3135 
3136    case nir_op_urhadd:
3137       bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTP);
3138       break;
3139 
3140    case nir_op_ineg:
3141       bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false);
3142       break;
3143 
3144    case nir_op_isub:
3145       bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false);
3146       break;
3147 
3148    case nir_op_isub_sat:
3149       bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true);
3150       break;
3151 
3152    case nir_op_usub_sat:
3153       bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true);
3154       break;
3155 
3156    case nir_op_imul:
3157       bi_imul_to(b, sz, dst, s0, s1);
3158       break;
3159 
3160    case nir_op_iabs:
3161       bi_iabs_to(b, sz, dst, s0);
3162       break;
3163 
3164    case nir_op_iand:
3165       bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));
3166       break;
3167 
3168    case nir_op_ior:
3169       bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));
3170       break;
3171 
3172    case nir_op_ixor:
3173       bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));
3174       break;
3175 
3176    case nir_op_inot:
3177       bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));
3178       break;
3179 
3180    case nir_op_frsq:
3181       if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
3182          bi_lower_frsq_32(b, dst, s0);
3183       else
3184          bi_frsq_to(b, sz, dst, s0);
3185       break;
3186 
3187    case nir_op_frcp:
3188       if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
3189          bi_lower_frcp_32(b, dst, s0);
3190       else
3191          bi_frcp_to(b, sz, dst, s0);
3192       break;
3193 
3194    case nir_op_uclz:
3195       bi_clz_to(b, sz, dst, s0, false);
3196       break;
3197 
3198    case nir_op_bit_count:
3199       assert(sz == 32 && src_sz == 32 && "should've been lowered");
3200       bi_popcount_i32_to(b, dst, s0);
3201       break;
3202 
3203    case nir_op_bitfield_reverse:
3204       assert(sz == 32 && src_sz == 32 && "should've been lowered");
3205       bi_bitrev_i32_to(b, dst, s0);
3206       break;
3207 
3208    case nir_op_ufind_msb: {
3209       bi_index clz = bi_clz(b, src_sz, s0, false);
3210 
3211       if (sz == 8)
3212          clz = bi_byte(clz, 0);
3213       else if (sz == 16)
3214          clz = bi_half(clz, false);
3215 
3216       bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false);
3217       break;
3218    }
3219 
3220    default:
3221       fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
3222       unreachable("Unknown ALU op");
3223    }
3224 }
3225 
3226 /* Returns the dimension, with 0 special-casing cubemaps. Shamelessly copied
3227  * from Midgard */
3228 static unsigned
3229 bifrost_tex_format(enum glsl_sampler_dim dim)
3230 {
3231    switch (dim) {
3232    case GLSL_SAMPLER_DIM_1D:
3233    case GLSL_SAMPLER_DIM_BUF:
3234       return 1;
3235 
3236    case GLSL_SAMPLER_DIM_2D:
3237    case GLSL_SAMPLER_DIM_MS:
3238    case GLSL_SAMPLER_DIM_EXTERNAL:
3239    case GLSL_SAMPLER_DIM_RECT:
3240    case GLSL_SAMPLER_DIM_SUBPASS:
3241    case GLSL_SAMPLER_DIM_SUBPASS_MS:
3242       return 2;
3243 
3244    case GLSL_SAMPLER_DIM_3D:
3245       return 3;
3246 
3247    case GLSL_SAMPLER_DIM_CUBE:
3248       return 0;
3249 
3250    default:
3251       DBG("Unknown sampler dim type\n");
3252       assert(0);
3253       return 0;
3254    }
3255 }
3256 
3257 static enum bi_dimension
3258 valhall_tex_dimension(enum glsl_sampler_dim dim)
3259 {
3260    switch (dim) {
3261    case GLSL_SAMPLER_DIM_1D:
3262    case GLSL_SAMPLER_DIM_BUF:
3263       return BI_DIMENSION_1D;
3264 
3265    case GLSL_SAMPLER_DIM_2D:
3266    case GLSL_SAMPLER_DIM_MS:
3267    case GLSL_SAMPLER_DIM_EXTERNAL:
3268    case GLSL_SAMPLER_DIM_RECT:
3269    case GLSL_SAMPLER_DIM_SUBPASS:
3270    case GLSL_SAMPLER_DIM_SUBPASS_MS:
3271       return BI_DIMENSION_2D;
3272 
3273    case GLSL_SAMPLER_DIM_3D:
3274       return BI_DIMENSION_3D;
3275 
3276    case GLSL_SAMPLER_DIM_CUBE:
3277       return BI_DIMENSION_CUBE;
3278 
3279    default:
3280       unreachable("Unknown sampler dim type");
3281    }
3282 }
3283 
3284 static enum bifrost_texture_format_full
3285 bi_texture_format(nir_alu_type T, enum bi_clamp clamp)
3286 {
3287    switch (T) {
3288    case nir_type_float16:
3289       return BIFROST_TEXTURE_FORMAT_F16 + clamp;
3290    case nir_type_float32:
3291       return BIFROST_TEXTURE_FORMAT_F32 + clamp;
3292    case nir_type_uint16:
3293       return BIFROST_TEXTURE_FORMAT_U16;
3294    case nir_type_int16:
3295       return BIFROST_TEXTURE_FORMAT_S16;
3296    case nir_type_uint32:
3297       return BIFROST_TEXTURE_FORMAT_U32;
3298    case nir_type_int32:
3299       return BIFROST_TEXTURE_FORMAT_S32;
3300    default:
3301       unreachable("Invalid type for texturing");
3302    }
3303 }
3304 
3305 /* Array indices are specified as 32-bit uints and need converting; they
3306  * arrive in the .z component from NIR */
3307 static bi_index
3308 bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T)
3309 {
3310    /* For (u)int we can just passthrough */
3311    nir_alu_type base = nir_alu_type_get_base_type(T);
3312    if (base == nir_type_int || base == nir_type_uint)
3313       return idx;
3314 
3315    /* Otherwise we convert */
3316    assert(T == nir_type_float32);
3317 
3318    /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and
3319     * Texel Selection") defines the layer to be taken from clamp(RNE(r),
3320     * 0, dt - 1). So we use round RTE, clamping is handled at the data
3321     * structure level */
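   /* Illustrative example (not from the spec text): a layer coordinate
    * r = 2.5 rounds to 2 under round-to-nearest-even, while r = 2.6 rounds
    * to 3; the clamp against dt - 1 is left to the data structure level as
    * noted above. */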
3322 
3323    bi_instr *I = bi_f32_to_u32_to(b, bi_temp(b->shader), idx);
3324    I->round = BI_ROUND_NONE;
3325    return I->dest[0];
3326 }
3327 
3328 /* TEXC's explicit and bias LOD modes require the LOD to be transformed to a
3329  * 16-bit 8:8 fixed-point format. We lower as:
3330  *
3331  * F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF =
3332  * MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0)
3333  */
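/* Worked example (illustrative): an LOD of 2.5 becomes
 * F32_TO_S32(clamp(2.5, -16.0, 16.0) * 256.0) & 0xFFFF = 640 = 0x0280,
 * i.e. 8:8 fixed point with integer part 2 and fractional part 0x80 (0.5). */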
3334 
3335 static bi_index
3336 bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16)
3337 {
3338    /* Precompute for constant LODs to avoid general constant folding */
3339    if (lod.type == BI_INDEX_CONSTANT) {
3340       uint32_t raw = lod.value;
3341       float x = fp16 ? _mesa_half_to_float(raw) : uif(raw);
3342       int32_t s32 = CLAMP(x, -16.0f, 16.0f) * 256.0f;
3343       return bi_imm_u32(s32 & 0xFFFF);
3344    }
3345 
3346    /* Sort of arbitrary. Must be less than 128.0, greater than or equal to
3347     * the max LOD (16 since we cap at 2^16 texture dimensions), and
3348     * preferably small to minimize precision loss */
3349    const float max_lod = 16.0;
3350 
3351    bi_instr *fsat =
3352       bi_fma_f32_to(b, bi_temp(b->shader), fp16 ? bi_half(lod, false) : lod,
3353                     bi_imm_f32(1.0f / max_lod), bi_negzero());
3354 
3355    fsat->clamp = BI_CLAMP_CLAMP_M1_1;
3356 
3357    bi_index fmul =
3358       bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f), bi_negzero());
3359 
3360    return bi_mkvec_v2i16(b, bi_half(bi_f32_to_s32(b, fmul), false),
3361                          bi_imm_u16(0));
3362 }
3363 
3364 /* FETCH takes a 32-bit staging register containing the LOD as an integer in
3365  * the bottom 16-bits and (if present) the cube face index in the top 16-bits.
3366  * TODO: Cube face.
3367  */
3368 
3369 static bi_index
3370 bi_emit_texc_lod_cube(bi_builder *b, bi_index lod)
3371 {
3372    return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8));
3373 }
3374 
3375 /* The hardware specifies texel offsets and multisample indices together as a
3376  * u8vec4 <offset, ms index>. By default all are zero, so if we have either a
3377  * nonzero texel offset or a nonzero multisample index, we build a u8vec4 with
3378  * the bits we need and return that to be passed as a staging register. Else we
3379  * return 0 to avoid allocating a data register when everything is zero. */
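/* Illustrative example of the packing: a 2D fetch with texel offset (1, -2)
 * and multisample index 3 yields the bytes <0x01, 0xFE, 0x00, 0x03>, i.e.
 * offset.x, offset.y, unused offset.z, then the sample index in the top byte. */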
3380 
3381 static bi_index
3382 bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr)
3383 {
3384    bi_index dest = bi_zero();
3385 
3386    int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
3387    if (offs_idx >= 0 && (!nir_src_is_const(instr->src[offs_idx].src) ||
3388                          nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
3389       unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
3390       bi_index idx = bi_src_index(&instr->src[offs_idx].src);
3391       dest = bi_mkvec_v4i8(
3392          b, (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
3393          (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0),
3394          (nr > 2) ? bi_byte(bi_extract(b, idx, 2), 0) : bi_imm_u8(0),
3395          bi_imm_u8(0));
3396    }
3397 
3398    int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
3399    if (ms_idx >= 0 && (!nir_src_is_const(instr->src[ms_idx].src) ||
3400                        nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
3401       dest = bi_lshift_or_i32(b, bi_src_index(&instr->src[ms_idx].src), dest,
3402                               bi_imm_u8(24));
3403    }
3404 
3405    return dest;
3406 }
3407 
3408 /*
3409  * Valhall specifies texel offsets, multisample indices, and (for
3410  * fetches) LOD together as a u8vec4 <offset.xyz, LOD>, where the third
3411  * component is either offset.z or multisample index depending on context. Build
3412  * this register.
3413  */
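/* Illustrative example: a fetch with texel offset (1, 2) at integer LOD 3
 * packs as the bytes <1, 2, 0, 3>; a multisampled fetch would instead carry
 * the sample index in the third byte in place of offset.z. */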
3414 static bi_index
3415 bi_emit_valhall_offsets(bi_builder *b, nir_tex_instr *instr)
3416 {
3417    bi_index dest = bi_zero();
3418 
3419    int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
3420    int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
3421    int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
3422 
3423    /* Components 0-2: offsets */
3424    if (offs_idx >= 0 && (!nir_src_is_const(instr->src[offs_idx].src) ||
3425                          nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
3426       unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
3427       bi_index idx = bi_src_index(&instr->src[offs_idx].src);
3428 
3429       /* No multisample index with 3D */
3430       assert((nr <= 2) || (ms_idx < 0));
3431 
3432       /* Zero extend the Z byte so we can use it with MKVEC.v2i8 */
3433       bi_index z = (nr > 2)
3434                       ? bi_mkvec_v2i8(b, bi_byte(bi_extract(b, idx, 2), 0),
3435                                       bi_imm_u8(0), bi_zero())
3436                       : bi_zero();
3437 
3438       dest = bi_mkvec_v2i8(
3439          b, (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
3440          (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0), z);
3441    }
3442 
3443    /* Component 2: multisample index */
3444    if (ms_idx >= 0 && (!nir_src_is_const(instr->src[ms_idx].src) ||
3445                        nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
3446       dest = bi_mkvec_v2i16(b, dest, bi_src_index(&instr->src[ms_idx].src));
3447    }
3448 
3449    /* Component 3: 8-bit LOD */
3450    if (lod_idx >= 0 &&
3451        (!nir_src_is_const(instr->src[lod_idx].src) ||
3452         nir_src_as_uint(instr->src[lod_idx].src) != 0) &&
3453        nir_tex_instr_src_type(instr, lod_idx) != nir_type_float) {
3454       dest = bi_lshift_or_i32(b, bi_src_index(&instr->src[lod_idx].src), dest,
3455                               bi_imm_u8(24));
3456    }
3457 
3458    return dest;
3459 }
3460 
3461 static void
3462 bi_emit_cube_coord(bi_builder *b, bi_index coord, bi_index *face, bi_index *s,
3463                    bi_index *t)
3464 {
3465    /* Compute max { |x|, |y|, |z| } */
3466    bi_index maxxyz = bi_temp(b->shader);
3467    *face = bi_temp(b->shader);
3468 
3469    bi_index cx = bi_extract(b, coord, 0), cy = bi_extract(b, coord, 1),
3470             cz = bi_extract(b, coord, 2);
3471 
3472    /* Use a pseudo op on Bifrost due to tuple restrictions */
3473    if (b->shader->arch <= 8) {
3474       bi_cubeface_to(b, maxxyz, *face, cx, cy, cz);
3475    } else {
3476       bi_cubeface1_to(b, maxxyz, cx, cy, cz);
3477       bi_cubeface2_v9_to(b, *face, cx, cy, cz);
3478    }
3479 
3480    /* Select coordinates */
3481    bi_index ssel =
3482       bi_cube_ssel(b, bi_extract(b, coord, 2), bi_extract(b, coord, 0), *face);
3483    bi_index tsel =
3484       bi_cube_tsel(b, bi_extract(b, coord, 1), bi_extract(b, coord, 2), *face);
3485 
3486    /* The OpenGL ES specification requires us to transform an input vector
3487     * (x, y, z) to the coordinate, given the selected S/T:
3488     *
3489     * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1))
3490     *
3491     * We implement (s shown, t similar) in a form friendlier to FMA
3492     * instructions, and clamp coordinates at the end for correct
3493     * NaN/infinity handling:
3494     *
3495     * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5)
3496     *
3497     * Take the reciprocal of max{x, y, z}
3498     */
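   /* (Equivalently, with m = max{x, y, z}: s * (0.5 * (1/m)) + 0.5 equals
    * (s/m + 1) * 0.5, so the FMA form below matches the spec's expression up
    * to rounding.) */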
3499    bi_index rcp = bi_frcp_f32(b, maxxyz);
3500 
3501    /* Calculate 0.5 * (1.0 / max{x, y, z}) */
3502    bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero());
3503 
3504    /* Transform the coordinates */
3505    *s = bi_temp(b->shader);
3506    *t = bi_temp(b->shader);
3507 
3508    bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f));
3509    bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f));
3510 
3511    S->clamp = BI_CLAMP_CLAMP_0_1;
3512    T->clamp = BI_CLAMP_CLAMP_0_1;
3513 }
3514 
3515 /* Emits a cube map descriptor, returning the lower 32 bits and putting the
3516  * upper 32 bits in the passed pointer t. The packing of the face with the S
3517  * coordinate exploits the redundancy of the floating-point encoding given the
3518  * range restriction of the CUBEFACE output.
3519  *
3520  *     struct cube_map_descriptor {
3521  *         float s : 29;
3522  *         unsigned face : 3;
3523  *         float t : 32;
3524  *     }
3525  *
3526  * Since the cube face index is preshifted, this is easy to pack with a bitwise
3527  * MUX.i32 and a fixed mask, selecting the lower 29 bits from s and the upper 3
3528  * bits from face.
3529  */
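/* Illustrative view of the packed lower word, assuming the face index is
 * already preshifted into bits 29..31 of the CUBEFACE result:
 *
 *    lo = (s_bits & 0x1FFFFFFF) | (face_bits & 0xE0000000)
 *
 * Dropping the low 3 mantissa bits of s only costs precision the hardware
 * does not need. */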
3530 
3531 static bi_index
3532 bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t)
3533 {
3534    bi_index face, s;
3535    bi_emit_cube_coord(b, coord, &face, &s, t);
3536    bi_index mask = bi_imm_u32(BITFIELD_MASK(29));
3537    return bi_mux_i32(b, s, face, mask, BI_MUX_BIT);
3538 }
3539 
3540 /* Map to the main texture op used. Some of these (txd in particular) will
3541  * lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in
3542  * sequence). We assume that lowering is handled elsewhere.
3543  */
3544 
3545 static enum bifrost_tex_op
3546 bi_tex_op(nir_texop op)
3547 {
3548    switch (op) {
3549    case nir_texop_tex:
3550    case nir_texop_txb:
3551    case nir_texop_txl:
3552    case nir_texop_txd:
3553       return BIFROST_TEX_OP_TEX;
3554    case nir_texop_txf:
3555    case nir_texop_txf_ms:
3556    case nir_texop_tg4:
3557       return BIFROST_TEX_OP_FETCH;
3558    case nir_texop_lod:
3559       return BIFROST_TEX_OP_GRDESC;
3560    case nir_texop_txs:
3561    case nir_texop_query_levels:
3562    case nir_texop_texture_samples:
3563    case nir_texop_samples_identical:
3564       unreachable("should've been lowered");
3565    default:
3566       unreachable("unsupported tex op");
3567    }
3568 }
3569 
3570 /* Data registers required by texturing in the order they appear. All are
3571  * optional; the texture operation descriptor determines which are present.
3572  * Note that since 3D arrays are not permitted at an API level, Z_COORD and
3573  * ARRAY/SHADOW are exclusive, so TEXC in practice reads at most 8 registers */
3574 
3575 enum bifrost_tex_dreg {
3576    BIFROST_TEX_DREG_Z_COORD = 0,
3577    BIFROST_TEX_DREG_Y_DELTAS = 1,
3578    BIFROST_TEX_DREG_LOD = 2,
3579    BIFROST_TEX_DREG_GRDESC_HI = 3,
3580    BIFROST_TEX_DREG_SHADOW = 4,
3581    BIFROST_TEX_DREG_ARRAY = 5,
3582    BIFROST_TEX_DREG_OFFSETMS = 6,
3583    BIFROST_TEX_DREG_SAMPLER = 7,
3584    BIFROST_TEX_DREG_TEXTURE = 8,
3585    BIFROST_TEX_DREG_COUNT,
3586 };
3587 
3588 static void
3589 bi_emit_texc(bi_builder *b, nir_tex_instr *instr)
3590 {
3591    struct bifrost_texture_operation desc = {
3592       .op = bi_tex_op(instr->op),
3593       .offset_or_bias_disable = false, /* TODO */
3594       .shadow_or_clamp_disable = instr->is_shadow,
3595       .array = instr->is_array && instr->op != nir_texop_lod,
3596       .dimension = bifrost_tex_format(instr->sampler_dim),
3597       .format = bi_texture_format(instr->dest_type | instr->def.bit_size,
3598                                   BI_CLAMP_NONE), /* TODO */
3599       .mask = 0xF,
3600    };
3601 
3602    switch (desc.op) {
3603    case BIFROST_TEX_OP_TEX:
3604       desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE;
3605       break;
3606    case BIFROST_TEX_OP_FETCH:
3607       desc.lod_or_fetch = (enum bifrost_lod_mode)(
3608          instr->op == nir_texop_tg4
3609             ? BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component
3610             : BIFROST_TEXTURE_FETCH_TEXEL);
3611       break;
3612    case BIFROST_TEX_OP_GRDESC:
3613       break;
3614    default:
3615       unreachable("texture op unsupported");
3616    }
3617 
3618    /* 32-bit indices to be allocated as consecutive staging registers */
3619    bi_index dregs[BIFROST_TEX_DREG_COUNT] = {};
3620    bi_index cx = bi_null(), cy = bi_null();
3621    bi_index ddx = bi_null();
3622    bi_index ddy = bi_null();
3623 
3624    for (unsigned i = 0; i < instr->num_srcs; ++i) {
3625       bi_index index = bi_src_index(&instr->src[i].src);
3626       unsigned sz = nir_src_bit_size(instr->src[i].src);
3627       unsigned components = nir_src_num_components(instr->src[i].src);
3628       ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i);
3629       nir_alu_type T = base | sz;
3630 
3631       switch (instr->src[i].src_type) {
3632       case nir_tex_src_coord:
3633          if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3634             cx = bi_emit_texc_cube_coord(b, index, &cy);
3635          } else {
3636             /* Copy XY (for 2D+) or XX (for 1D) */
3637             cx = bi_extract(b, index, 0);
3638             cy = bi_extract(b, index, MIN2(1, components - 1));
3639 
3640             assert(components >= 1 && components <= 3);
3641 
3642             if (components == 3 && !desc.array) {
3643                /* 3D */
3644                dregs[BIFROST_TEX_DREG_Z_COORD] = bi_extract(b, index, 2);
3645             }
3646          }
3647 
3648          if (desc.array) {
3649             dregs[BIFROST_TEX_DREG_ARRAY] = bi_emit_texc_array_index(
3650                b, bi_extract(b, index, components - 1), T);
3651          }
3652 
3653          break;
3654 
3655       case nir_tex_src_lod:
3656          if (desc.op == BIFROST_TEX_OP_TEX &&
3657              nir_src_is_const(instr->src[i].src) &&
3658              nir_src_as_uint(instr->src[i].src) == 0) {
3659             desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO;
3660          } else if (desc.op == BIFROST_TEX_OP_TEX) {
3661             assert(base == nir_type_float);
3662 
3663             assert(sz == 16 || sz == 32);
3664             dregs[BIFROST_TEX_DREG_LOD] =
3665                bi_emit_texc_lod_88(b, index, sz == 16);
3666             desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT;
3667          } else {
3668             assert(desc.op == BIFROST_TEX_OP_FETCH);
3669             assert(base == nir_type_uint || base == nir_type_int);
3670             assert(sz == 16 || sz == 32);
3671 
3672             dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_cube(b, index);
3673          }
3674 
3675          break;
3676 
3677       case nir_tex_src_ddx:
3678          ddx = index;
3679          break;
3680 
3681       case nir_tex_src_ddy:
3682          ddy = index;
3683          break;
3684 
3685       case nir_tex_src_bias:
3686          /* Upper 16-bits interpreted as a clamp, leave zero */
3687          assert(desc.op == BIFROST_TEX_OP_TEX);
3688          assert(base == nir_type_float);
3689          assert(sz == 16 || sz == 32);
3690          dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16);
3691          desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS;
3692          break;
3693 
3694       case nir_tex_src_ms_index:
3695       case nir_tex_src_offset:
3696          if (desc.offset_or_bias_disable)
3697             break;
3698 
3699          dregs[BIFROST_TEX_DREG_OFFSETMS] =
3700             bi_emit_texc_offset_ms_index(b, instr);
3701          if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero()))
3702             desc.offset_or_bias_disable = true;
3703          break;
3704 
3705       case nir_tex_src_comparator:
3706          dregs[BIFROST_TEX_DREG_SHADOW] = index;
3707          break;
3708 
3709       case nir_tex_src_texture_offset:
3710          dregs[BIFROST_TEX_DREG_TEXTURE] = index;
3711          break;
3712 
3713       case nir_tex_src_sampler_offset:
3714          dregs[BIFROST_TEX_DREG_SAMPLER] = index;
3715          break;
3716 
3717       default:
3718          unreachable("Unhandled src type in texc emit");
3719       }
3720    }
3721 
3722    if (desc.op == BIFROST_TEX_OP_FETCH &&
3723        bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) {
3724       dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_cube(b, bi_zero());
3725    }
3726 
3727    /* Choose an index mode */
3728 
3729    bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]);
3730    bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]);
3731    bool direct = direct_tex && direct_samp;
3732 
3733    desc.immediate_indices =
3734       direct && (instr->sampler_index < 16 && instr->texture_index < 128);
3735 
3736    if (desc.immediate_indices) {
3737       desc.sampler_index_or_mode = instr->sampler_index;
3738       desc.index = instr->texture_index;
3739    } else {
3740       unsigned mode = 0;
3741 
3742       if (direct && instr->sampler_index == instr->texture_index &&
3743           instr->sampler_index < 128) {
3744          mode = BIFROST_INDEX_IMMEDIATE_SHARED;
3745          desc.index = instr->texture_index;
3746       } else if (direct && instr->sampler_index < 128) {
3747          mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
3748          desc.index = instr->sampler_index;
3749          dregs[BIFROST_TEX_DREG_TEXTURE] =
3750             bi_mov_i32(b, bi_imm_u32(instr->texture_index));
3751       } else if (direct_tex && instr->texture_index < 128) {
3752          mode = BIFROST_INDEX_IMMEDIATE_TEXTURE;
3753          desc.index = instr->texture_index;
3754 
3755          if (direct_samp) {
3756             dregs[BIFROST_TEX_DREG_SAMPLER] =
3757                bi_mov_i32(b, bi_imm_u32(instr->sampler_index));
3758          }
3759       } else if (direct_samp && instr->sampler_index < 128) {
3760          mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
3761          desc.index = instr->sampler_index;
3762 
3763          if (direct_tex) {
3764             dregs[BIFROST_TEX_DREG_TEXTURE] =
3765                bi_mov_i32(b, bi_imm_u32(instr->texture_index));
3766          }
3767       } else {
3768          mode = BIFROST_INDEX_REGISTER;
3769 
3770          if (direct_tex) {
3771             dregs[BIFROST_TEX_DREG_TEXTURE] =
3772                bi_mov_i32(b, bi_imm_u32(instr->texture_index));
3773          }
3774 
3775          if (direct_samp) {
3776             dregs[BIFROST_TEX_DREG_SAMPLER] =
3777                bi_mov_i32(b, bi_imm_u32(instr->sampler_index));
3778          }
3779       }
3780 
3781       mode |= (BIFROST_TEXTURE_OPERATION_SINGLE << 2);
3782       desc.sampler_index_or_mode = mode;
3783    }
3784 
3785    if (!bi_is_null(ddx) || !bi_is_null(ddy)) {
3786       assert(!bi_is_null(ddx) && !bi_is_null(ddy));
3787       struct bifrost_texture_operation gropdesc = {
3788          .sampler_index_or_mode = desc.sampler_index_or_mode,
3789          .index = desc.index,
3790          .immediate_indices = desc.immediate_indices,
3791          .op = BIFROST_TEX_OP_GRDESC_DER,
3792          .offset_or_bias_disable = true,
3793          .shadow_or_clamp_disable = true,
3794          .array = false,
3795          .dimension = desc.dimension,
3796          .format = desc.format,
3797          .mask = desc.mask,
3798       };
3799 
3800       unsigned coords_comp_count =
3801          instr->coord_components -
3802          (instr->is_array || instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE);
3803       bi_index derivs[4];
3804       unsigned sr_count = 0;
3805 
3806       if (coords_comp_count > 2)
3807          derivs[sr_count++] = bi_extract(b, ddx, 2);
3808       derivs[sr_count++] = bi_extract(b, ddy, 0);
3809       if (coords_comp_count > 1)
3810          derivs[sr_count++] = bi_extract(b, ddy, 1);
3811       if (coords_comp_count > 2)
3812          derivs[sr_count++] = bi_extract(b, ddy, 2);
3813 
3814       bi_index derivs_packed = bi_temp(b->shader);
3815       bi_make_vec_to(b, derivs_packed, derivs, NULL, sr_count, 32);
3816       bi_index grdesc = bi_temp(b->shader);
3817       bi_instr *I =
3818          bi_texc_to(b, grdesc, derivs_packed, bi_extract(b, ddx, 0),
3819                     coords_comp_count > 1 ? bi_extract(b, ddx, 1) : bi_zero(),
3820                     bi_imm_u32(gropdesc.packed), true, sr_count, 0);
3821       I->register_format = BI_REGISTER_FORMAT_U32;
3822 
3823       bi_emit_cached_split_i32(b, grdesc, 4);
3824 
3825       dregs[BIFROST_TEX_DREG_LOD] = bi_extract(b, grdesc, 0);
3826       desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT;
3827    }
3828 
3829    /* Allocate staging registers contiguously by compacting the array. */
3830    unsigned sr_count = 0;
3831 
3832    for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) {
3833       if (!bi_is_null(dregs[i]))
3834          dregs[sr_count++] = dregs[i];
3835    }
3836 
3837    unsigned res_size = instr->def.bit_size == 16 ? 2 : 4;
3838 
3839    bi_index sr = sr_count ? bi_temp(b->shader) : bi_null();
3840 
3841    if (sr_count)
3842       bi_emit_collect_to(b, sr, dregs, sr_count);
3843 
3844    if (instr->op == nir_texop_lod) {
3845       assert(instr->def.num_components == 2 && instr->def.bit_size == 32);
3846 
3847       bi_index res[2];
3848       for (unsigned i = 0; i < 2; i++) {
3849          desc.shadow_or_clamp_disable = i != 0;
3850 
3851          bi_index grdesc = bi_temp(b->shader);
3852          bi_instr *I = bi_texc_to(b, grdesc, sr, cx, cy,
3853                                   bi_imm_u32(desc.packed), false, sr_count, 0);
3854          I->register_format = BI_REGISTER_FORMAT_U32;
3855 
3856          bi_emit_cached_split_i32(b, grdesc, 4);
3857 
3858          bi_index lod = bi_s16_to_f32(b, bi_half(bi_extract(b, grdesc, 0), 0));
3859 
3860          lod = bi_fmul_f32(b, lod, bi_imm_f32(1.0f / 256));
3861 
3862          if (i == 0)
3863             lod = bi_fround_f32(b, lod, BI_ROUND_NONE);
3864 
3865          res[i] = lod;
3866       }
3867 
3868       bi_make_vec_to(b, bi_def_index(&instr->def), res, NULL, 2, 32);
3869       return;
3870    }
3871 
3872    bi_index dst = bi_temp(b->shader);
3873 
3874    bi_instr *I =
3875       bi_texc_to(b, dst, sr, cx, cy, bi_imm_u32(desc.packed),
3876                  !nir_tex_instr_has_implicit_derivative(instr), sr_count, 0);
3877    I->register_format = bi_reg_fmt_for_nir(instr->dest_type);
3878 
3879    bi_index w[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
3880    bi_emit_split_i32(b, w, dst, res_size);
3881    bi_emit_collect_to(b, bi_def_index(&instr->def), w,
3882                       DIV_ROUND_UP(instr->def.num_components * res_size, 4));
3883 }
3884 
3885 /* Staging registers required by texturing in the order they appear (Valhall) */
3886 
3887 enum valhall_tex_sreg {
3888    VALHALL_TEX_SREG_X_COORD = 0,
3889    VALHALL_TEX_SREG_Y_COORD = 1,
3890    VALHALL_TEX_SREG_Z_COORD = 2,
3891    VALHALL_TEX_SREG_Y_DELTAS = 3,
3892    VALHALL_TEX_SREG_ARRAY = 4,
3893    VALHALL_TEX_SREG_SHADOW = 5,
3894    VALHALL_TEX_SREG_OFFSETMS = 6,
3895    VALHALL_TEX_SREG_LOD = 7,
3896    VALHALL_TEX_SREG_GRDESC0 = 8,
3897    VALHALL_TEX_SREG_GRDESC1 = 9,
3898    VALHALL_TEX_SREG_COUNT,
3899 };
3900 
3901 static void
3902 bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr)
3903 {
3904    bool explicit_offset = false;
3905    enum bi_va_lod_mode lod_mode = BI_VA_LOD_MODE_COMPUTED_LOD;
3906 
3907    bool has_lod_mode = (instr->op == nir_texop_tex) ||
3908                        (instr->op == nir_texop_txl) ||
3909                        (instr->op == nir_texop_txd) ||
3910                        (instr->op == nir_texop_txb);
3911 
3912    /* 32-bit indices to be allocated as consecutive staging registers */
3913    bi_index sregs[VALHALL_TEX_SREG_COUNT] = {};
3914    bi_index sampler = bi_imm_u32(instr->sampler_index);
3915    bi_index texture = bi_imm_u32(instr->texture_index);
3916    bi_index ddx = bi_null();
3917    bi_index ddy = bi_null();
3918 
3919    for (unsigned i = 0; i < instr->num_srcs; ++i) {
3920       bi_index index = bi_src_index(&instr->src[i].src);
3921       unsigned sz = nir_src_bit_size(instr->src[i].src);
3922 
3923       switch (instr->src[i].src_type) {
3924       case nir_tex_src_coord: {
3925          bool is_array = instr->is_array && instr->op != nir_texop_lod;
3926          unsigned components = nir_tex_instr_src_size(instr, i) - is_array;
3927 
3928          if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3929             sregs[VALHALL_TEX_SREG_X_COORD] = bi_emit_texc_cube_coord(
3930                b, index, &sregs[VALHALL_TEX_SREG_Y_COORD]);
3931          } else {
3932             assert(components >= 1 && components <= 3);
3933 
3934             /* Copy XY (for 2D+) or XX (for 1D) */
3935             sregs[VALHALL_TEX_SREG_X_COORD] = index;
3936 
3937             if (components >= 2)
3938                sregs[VALHALL_TEX_SREG_Y_COORD] = bi_extract(b, index, 1);
3939 
3940             if (components == 3)
3941                sregs[VALHALL_TEX_SREG_Z_COORD] = bi_extract(b, index, 2);
3942          }
3943 
3944          if (is_array)
3945             sregs[VALHALL_TEX_SREG_ARRAY] = bi_extract(b, index, components);
3946 
3947          break;
3948       }
3949 
3950       case nir_tex_src_lod:
3951          if (nir_src_is_const(instr->src[i].src) &&
3952              nir_src_as_uint(instr->src[i].src) == 0) {
3953             lod_mode = BI_VA_LOD_MODE_ZERO_LOD;
3954          } else if (has_lod_mode) {
3955             lod_mode = BI_VA_LOD_MODE_EXPLICIT;
3956 
3957             assert(sz == 16 || sz == 32);
3958             sregs[VALHALL_TEX_SREG_LOD] =
3959                bi_emit_texc_lod_88(b, index, sz == 16);
3960          }
3961          break;
3962 
3963       case nir_tex_src_ddx:
3964          ddx = index;
3965          break;
3966 
3967       case nir_tex_src_ddy:
3968          ddy = index;
3969          break;
3970 
3971       case nir_tex_src_bias:
3972          /* Upper 16-bits interpreted as a clamp, leave zero */
3973          assert(sz == 16 || sz == 32);
3974          sregs[VALHALL_TEX_SREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16);
3975 
3976          lod_mode = BI_VA_LOD_MODE_COMPUTED_BIAS;
3977          break;
3978       case nir_tex_src_ms_index:
3979       case nir_tex_src_offset:
3980          /* Handled below */
3981          break;
3982 
3983       case nir_tex_src_comparator:
3984          sregs[VALHALL_TEX_SREG_SHADOW] = index;
3985          break;
3986 
3987       case nir_tex_src_texture_offset:
3988          /* This should always be 0 as lower_index_to_offset is expected to be
3989           * set */
3990          assert(instr->texture_index == 0);
3991          texture = index;
3992          break;
3993 
3994       case nir_tex_src_sampler_offset:
3995          /* This should always be 0 as lower_index_to_offset is expected to be
3996           * set */
3997          assert(instr->sampler_index == 0);
3998          sampler = index;
3999          break;
4000 
4001       default:
4002          unreachable("Unhandled src type in tex emit");
4003       }
4004    }
4005 
4006    /* Generate packed offset + ms index + LOD register. These default to
4007     * zero so we only need to encode if these features are actually in use.
4008     */
4009    bi_index offsets = bi_emit_valhall_offsets(b, instr);
4010 
4011    if (!bi_is_equiv(offsets, bi_zero())) {
4012       sregs[VALHALL_TEX_SREG_OFFSETMS] = offsets;
4013       explicit_offset = true;
4014    }
4015 
4016    bool narrow_indices = va_is_valid_const_narrow_index(texture) &&
4017                          va_is_valid_const_narrow_index(sampler);
4018 
4019    bi_index src0;
4020    bi_index src1;
4021 
4022    if (narrow_indices) {
4023       unsigned tex_set =
4024          va_res_fold_table_idx(pan_res_handle_get_table(texture.value));
4025       unsigned sampler_set =
4026          va_res_fold_table_idx(pan_res_handle_get_table(sampler.value));
4027       unsigned texture_index = pan_res_handle_get_index(texture.value);
4028       unsigned sampler_index = pan_res_handle_get_index(sampler.value);
4029 
4030       unsigned packed_handle = (tex_set << 27) | (texture_index << 16) |
4031                                (sampler_set << 11) | sampler_index;
4032 
4033       src0 = bi_imm_u32(packed_handle);
4034 
4035       /* TODO: narrow offsetms */
4036       src1 = bi_zero();
4037    } else {
4038       src0 = sampler;
4039       src1 = texture;
4040    }
4041 
4042    enum bi_dimension dim = valhall_tex_dimension(instr->sampler_dim);
4043 
4044    if (!bi_is_null(ddx) || !bi_is_null(ddy)) {
4045       unsigned coords_comp_count =
4046          instr->coord_components -
4047          (instr->is_array || instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE);
4048       assert(!bi_is_null(ddx) && !bi_is_null(ddy));
4049 
4050       lod_mode = BI_VA_LOD_MODE_GRDESC;
4051 
4052       bi_index derivs[6] = {
4053          bi_extract(b, ddx, 0),
4054          bi_extract(b, ddy, 0),
4055          coords_comp_count > 1 ? bi_extract(b, ddx, 1) : bi_null(),
4056          coords_comp_count > 1 ? bi_extract(b, ddy, 1) : bi_null(),
4057          coords_comp_count > 2 ? bi_extract(b, ddx, 2) : bi_null(),
4058          coords_comp_count > 2 ? bi_extract(b, ddy, 2) : bi_null(),
4059       };
4060       bi_index derivs_packed = bi_temp(b->shader);
4061       bi_make_vec_to(b, derivs_packed, derivs, NULL, coords_comp_count * 2, 32);
4062       bi_index grdesc = bi_temp(b->shader);
4063       bi_instr *I = bi_tex_gradient_to(b, grdesc, derivs_packed, src0, src1, dim,
4064                                        !narrow_indices, 3, coords_comp_count * 2);
4065       I->derivative_enable = true;
4066       I->force_delta_enable = false;
4067       I->lod_clamp_disable = true;
4068       I->lod_bias_disable = true;
4069       I->register_format = BI_REGISTER_FORMAT_U32;
4070 
4071       bi_emit_cached_split_i32(b, grdesc, 2);
4072       sregs[VALHALL_TEX_SREG_GRDESC0] = bi_extract(b, grdesc, 0);
4073       sregs[VALHALL_TEX_SREG_GRDESC1] = bi_extract(b, grdesc, 1);
4074    }
4075 
4076    /* Allocate staging registers contiguously by compacting the array. */
4077    unsigned sr_count = 0;
4078    for (unsigned i = 0; i < ARRAY_SIZE(sregs); ++i) {
4079       if (!bi_is_null(sregs[i]))
4080          sregs[sr_count++] = sregs[i];
4081    }
4082 
4083    bi_index idx = sr_count ? bi_temp(b->shader) : bi_null();
4084 
4085    if (sr_count)
4086       bi_make_vec_to(b, idx, sregs, NULL, sr_count, 32);
4087 
4088    if (instr->op == nir_texop_lod) {
4089       assert(instr->def.num_components == 2 && instr->def.bit_size == 32);
4090 
4091       bi_index res[2];
4092 
4093       for (unsigned i = 0; i < 2; i++) {
4094          bi_index grdesc = bi_temp(b->shader);
4095          bi_instr *I = bi_tex_gradient_to(b, grdesc, idx, src0, src1, dim,
4096                                           !narrow_indices, 1, sr_count);
4097          I->derivative_enable = false;
4098          I->force_delta_enable = true;
4099          I->lod_clamp_disable = i != 0;
4100          I->register_format = BI_REGISTER_FORMAT_U32;
4101          bi_index lod = bi_s16_to_f32(b, bi_half(grdesc, 0));
4102 
4103          lod = bi_fmul_f32(b, lod, bi_imm_f32(1.0f / 256));
4104 
4105          if (i == 0)
4106             lod = bi_fround_f32(b, lod, BI_ROUND_NONE);
4107 
4108          res[i] = lod;
4109       }
4110 
4111       bi_make_vec_to(b, bi_def_index(&instr->def), res, NULL, 2, 32);
4112       return;
4113    }
4114 
4115    /* Only write the components that we actually read */
4116    unsigned mask = nir_def_components_read(&instr->def);
4117    unsigned comps_per_reg = instr->def.bit_size == 16 ? 2 : 1;
4118    unsigned res_size = DIV_ROUND_UP(util_bitcount(mask), comps_per_reg);
4119 
4120    enum bi_register_format regfmt = bi_reg_fmt_for_nir(instr->dest_type);
4121    bi_index dest = bi_temp(b->shader);
4122 
4123    switch (instr->op) {
4124    case nir_texop_tex:
4125    case nir_texop_txb:
4126    case nir_texop_txl:
4127    case nir_texop_txd:
4128       bi_tex_single_to(b, dest, idx, src0, src1, instr->is_array, dim, regfmt,
4129                        instr->is_shadow, explicit_offset, lod_mode,
4130                        !narrow_indices, mask, sr_count);
4131       break;
4132    case nir_texop_txf:
4133    case nir_texop_txf_ms: {
4134       /* On Valhall, TEX_FETCH doesn't have CUBE support. This is not a problem
4135        * as a cube is just a 2D array in any case. */
4136       if (dim == BI_DIMENSION_CUBE)
4137          dim = BI_DIMENSION_2D;
4138 
4139       bi_tex_fetch_to(b, dest, idx, src0, src1, instr->is_array, dim, regfmt,
4140                       explicit_offset, !narrow_indices, mask, sr_count);
4141       break;
4142    }
4143    case nir_texop_tg4:
4144       bi_tex_gather_to(b, dest, idx, src0, src1, instr->is_array, dim,
4145                        instr->component, false, regfmt, instr->is_shadow,
4146                        explicit_offset, !narrow_indices, mask, sr_count);
4147       break;
4148    default:
4149       unreachable("Unhandled Valhall texture op");
4150    }
4151 
4152    /* The hardware will write only what we read, and it will write into
4153     * contiguous registers without gaps (different from Bifrost). NIR
4154     * expects the gaps, so fill in the holes (they'll be copypropped and
4155     * DCE'd away later).
4156     */
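   /* Illustrative example: if only components .x and .z of a 32-bit result
    * are read (mask = 0b0101), the hardware writes two consecutive registers;
    * below, component 0 maps to packed register 0, component 2 to packed
    * register 1, and components 1 and 3 are filled with zero. */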
4157    bi_index unpacked[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
4158 
4159    bi_emit_cached_split_i32(b, dest, res_size);
4160 
4161    /* Index into the packed component array */
4162    unsigned j = 0;
4163    unsigned comps[4] = {0};
4164    unsigned nr_components = instr->def.num_components;
4165 
4166    for (unsigned i = 0; i < nr_components; ++i) {
4167       if (mask & BITFIELD_BIT(i)) {
4168          unpacked[i] = dest;
4169          comps[i] = j++;
4170       } else {
4171          unpacked[i] = bi_zero();
4172       }
4173    }
4174 
4175    bi_make_vec_to(b, bi_def_index(&instr->def), unpacked, comps,
4176                   instr->def.num_components, instr->def.bit_size);
4177 }
4178 
4179 /* Simple texture ops correspond to NIR tex or txl with LOD = 0 on 2D/cube
4180  * textures with sufficiently small immediate indices. Anything else
4181  * needs a complete texture op. */
4182 
4183 static void
4184 bi_emit_texs(bi_builder *b, nir_tex_instr *instr)
4185 {
4186    int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
4187    assert(coord_idx >= 0);
4188    bi_index coords = bi_src_index(&instr->src[coord_idx].src);
4189 
4190    if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
4191       bi_index face, s, t;
4192       bi_emit_cube_coord(b, coords, &face, &s, &t);
4193 
4194       bi_texs_cube_to(b, instr->def.bit_size, bi_def_index(&instr->def), s, t,
4195                       face, instr->sampler_index, instr->texture_index);
4196    } else {
4197       bi_texs_2d_to(b, instr->def.bit_size, bi_def_index(&instr->def),
4198                     bi_extract(b, coords, 0), bi_extract(b, coords, 1),
4199                     instr->op != nir_texop_tex, /* zero LOD */
4200                     instr->sampler_index, instr->texture_index);
4201    }
4202 
4203    bi_split_def(b, &instr->def);
4204 }
4205 
4206 static bool
4207 bi_is_simple_tex(nir_tex_instr *instr)
4208 {
4209    if (instr->op != nir_texop_tex && instr->op != nir_texop_txl)
4210       return false;
4211 
4212    if (instr->dest_type != nir_type_float32 &&
4213        instr->dest_type != nir_type_float16)
4214       return false;
4215 
4216    if (instr->is_shadow || instr->is_array)
4217       return false;
4218 
4219    switch (instr->sampler_dim) {
4220    case GLSL_SAMPLER_DIM_2D:
4221    case GLSL_SAMPLER_DIM_EXTERNAL:
4222    case GLSL_SAMPLER_DIM_RECT:
4223       break;
4224 
4225    case GLSL_SAMPLER_DIM_CUBE:
4226       /* LOD can't be specified with TEXS_CUBE */
4227       if (instr->op == nir_texop_txl)
4228          return false;
4229       break;
4230 
4231    default:
4232       return false;
4233    }
4234 
4235    for (unsigned i = 0; i < instr->num_srcs; ++i) {
4236       if (instr->src[i].src_type != nir_tex_src_lod &&
4237           instr->src[i].src_type != nir_tex_src_coord)
4238          return false;
4239    }
4240 
4241    /* Indices need to fit in provided bits */
4242    unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 2 : 3;
4243    if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits))
4244       return false;
4245 
4246    int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
4247    if (lod_idx < 0)
4248       return true;
4249 
4250    nir_src lod = instr->src[lod_idx].src;
4251    return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0;
4252 }
4253 
4254 static void
4255 bi_emit_tex(bi_builder *b, nir_tex_instr *instr)
4256 {
4257    /* If txf is used, we assume there is a valid sampler bound at index 0. Use
4258     * it for txf operations, since there may be no other valid samplers. This is
4259     * a workaround: txf does not require a sampler in NIR (so sampler_index is
4260     * undefined) but we need one in the hardware. This is ABI with the driver.
4261     *
4262     * On Valhall, as the descriptor table is encoded in the index, this should
4263     * be handled by the driver.
4264     */
4265    if (!nir_tex_instr_need_sampler(instr) && b->shader->arch < 9)
4266       instr->sampler_index = 0;
4267 
4268    if (b->shader->arch >= 9)
4269       bi_emit_tex_valhall(b, instr);
4270    else if (bi_is_simple_tex(instr))
4271       bi_emit_texs(b, instr);
4272    else
4273       bi_emit_texc(b, instr);
4274 }
4275 
4276 static void
4277 bi_emit_phi(bi_builder *b, nir_phi_instr *instr)
4278 {
4279    unsigned nr_srcs = exec_list_length(&instr->srcs);
4280    bi_instr *I = bi_phi_to(b, bi_def_index(&instr->def), nr_srcs);
4281 
4282    /* Deferred */
4283    I->phi = instr;
4284 }
4285 
4286 /* Look up the bi_block corresponding to a given NIR block. Used when
4287  * translating phi nodes after emitting all blocks.
4288  */
4289 static bi_block *
4290 bi_from_nir_block(bi_context *ctx, nir_block *block)
4291 {
4292    return ctx->indexed_nir_blocks[block->index];
4293 }
4294 
4295 static void
4296 bi_emit_phi_deferred(bi_context *ctx, bi_block *block, bi_instr *I)
4297 {
4298    nir_phi_instr *phi = I->phi;
4299 
4300    /* Guaranteed by lower_phis_to_scalar */
4301    assert(phi->def.num_components == 1);
4302 
4303    nir_foreach_phi_src(src, phi) {
4304       bi_block *pred = bi_from_nir_block(ctx, src->pred);
4305       unsigned i = bi_predecessor_index(block, pred);
4306       assert(i < I->nr_srcs);
4307 
4308       I->src[i] = bi_src_index(&src->src);
4309    }
4310 
4311    I->phi = NULL;
4312 }
4313 
4314 static void
4315 bi_emit_phis_deferred(bi_context *ctx)
4316 {
4317    bi_foreach_block(ctx, block) {
4318       bi_foreach_instr_in_block(block, I) {
4319          if (I->op == BI_OPCODE_PHI)
4320             bi_emit_phi_deferred(ctx, block, I);
4321       }
4322    }
4323 }
4324 
4325 static void
4326 bi_emit_instr(bi_builder *b, struct nir_instr *instr)
4327 {
4328    switch (instr->type) {
4329    case nir_instr_type_load_const:
4330       bi_emit_load_const(b, nir_instr_as_load_const(instr));
4331       break;
4332 
4333    case nir_instr_type_intrinsic:
4334       bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
4335       break;
4336 
4337    case nir_instr_type_alu:
4338       bi_emit_alu(b, nir_instr_as_alu(instr));
4339       break;
4340 
4341    case nir_instr_type_tex:
4342       bi_emit_tex(b, nir_instr_as_tex(instr));
4343       break;
4344 
4345    case nir_instr_type_jump:
4346       bi_emit_jump(b, nir_instr_as_jump(instr));
4347       break;
4348 
4349    case nir_instr_type_phi:
4350       bi_emit_phi(b, nir_instr_as_phi(instr));
4351       break;
4352 
4353    default:
4354       unreachable("should've been lowered");
4355    }
4356 }
4357 
4358 static bi_block *
4359 create_empty_block(bi_context *ctx)
4360 {
4361    bi_block *blk = rzalloc(ctx, bi_block);
4362 
4363    util_dynarray_init(&blk->predecessors, blk);
4364 
4365    return blk;
4366 }
4367 
4368 static bi_block *
4369 emit_block(bi_context *ctx, nir_block *block)
4370 {
4371    if (ctx->after_block) {
4372       ctx->current_block = ctx->after_block;
4373       ctx->after_block = NULL;
4374    } else {
4375       ctx->current_block = create_empty_block(ctx);
4376    }
4377 
4378    list_addtail(&ctx->current_block->link, &ctx->blocks);
4379    list_inithead(&ctx->current_block->instructions);
4380 
4381    bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
4382 
4383    ctx->indexed_nir_blocks[block->index] = ctx->current_block;
4384 
4385    nir_foreach_instr(instr, block) {
4386       bi_emit_instr(&_b, instr);
4387    }
4388 
4389    return ctx->current_block;
4390 }
4391 
4392 static void
4393 emit_if(bi_context *ctx, nir_if *nif)
4394 {
4395    bi_block *before_block = ctx->current_block;
4396 
4397    /* Speculatively emit the branch, but we can't fill it in until later */
4398    bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
4399    bi_instr *then_branch =
4400       bi_branchz_i16(&_b, bi_half(bi_src_index(&nif->condition), false),
4401                      bi_zero(), BI_CMPF_EQ);
4402 
4403    /* Emit the two subblocks. */
4404    bi_block *then_block = emit_cf_list(ctx, &nif->then_list);
4405    bi_block *end_then_block = ctx->current_block;
4406 
4407    /* Emit second block */
4408 
4409    bi_block *else_block = emit_cf_list(ctx, &nif->else_list);
4410    bi_block *end_else_block = ctx->current_block;
4411    ctx->after_block = create_empty_block(ctx);
4412 
4413    /* Now that we have the subblocks emitted, fix up the branches */
4414 
4415    assert(then_block);
4416    assert(else_block);
4417 
4418    then_branch->branch_target = else_block;
4419 
4420    /* Emit a jump from the end of the then block to the end of the else */
4421    _b.cursor = bi_after_block(end_then_block);
4422    bi_instr *then_exit = bi_jump(&_b, bi_zero());
4423    then_exit->branch_target = ctx->after_block;
4424 
4425    bi_block_add_successor(end_then_block, then_exit->branch_target);
4426    bi_block_add_successor(end_else_block, ctx->after_block); /* fallthrough */
4427 
4428    bi_block_add_successor(before_block,
4429                           then_branch->branch_target); /* then_branch */
4430    bi_block_add_successor(before_block, then_block);   /* fallthrough */
4431 }
4432 
4433 static void
4434 emit_loop(bi_context *ctx, nir_loop *nloop)
4435 {
4436    assert(!nir_loop_has_continue_construct(nloop));
4437 
4438    /* Remember where we are */
4439    bi_block *start_block = ctx->current_block;
4440 
4441    bi_block *saved_break = ctx->break_block;
4442    bi_block *saved_continue = ctx->continue_block;
4443 
4444    ctx->continue_block = create_empty_block(ctx);
4445    ctx->break_block = create_empty_block(ctx);
4446    ctx->after_block = ctx->continue_block;
4447    ctx->after_block->loop_header = true;
4448 
4449    /* Emit the body itself */
4450    emit_cf_list(ctx, &nloop->body);
4451 
4452    /* Branch back to the loop header */
4453    bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
4454    bi_instr *I = bi_jump(&_b, bi_zero());
4455    I->branch_target = ctx->continue_block;
4456    bi_block_add_successor(start_block, ctx->continue_block);
4457    bi_block_add_successor(ctx->current_block, ctx->continue_block);
4458 
4459    ctx->after_block = ctx->break_block;
4460 
4461    /* Pop off */
4462    ctx->break_block = saved_break;
4463    ctx->continue_block = saved_continue;
4464    ++ctx->loop_count;
4465 }
4466 
4467 static bi_block *
4468 emit_cf_list(bi_context *ctx, struct exec_list *list)
4469 {
4470    bi_block *start_block = NULL;
4471 
4472    foreach_list_typed(nir_cf_node, node, node, list) {
4473       switch (node->type) {
4474       case nir_cf_node_block: {
4475          bi_block *block = emit_block(ctx, nir_cf_node_as_block(node));
4476 
4477          if (!start_block)
4478             start_block = block;
4479 
4480          break;
4481       }
4482 
4483       case nir_cf_node_if:
4484          emit_if(ctx, nir_cf_node_as_if(node));
4485          break;
4486 
4487       case nir_cf_node_loop:
4488          emit_loop(ctx, nir_cf_node_as_loop(node));
4489          break;
4490 
4491       default:
4492          unreachable("Unknown control flow");
4493       }
4494    }
4495 
4496    return start_block;
4497 }
4498 
4499 /* shader-db stuff */
4500 
4501 struct bi_stats {
4502    unsigned nr_clauses, nr_tuples, nr_ins;
4503    unsigned nr_arith, nr_texture, nr_varying, nr_ldst;
4504 };
4505 
4506 static void
4507 bi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats)
4508 {
4509    /* Count instructions */
4510    stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 1 : 0);
4511 
4512    /* Non-message passing tuples are always arithmetic */
4513    if (tuple->add != clause->message) {
4514       stats->nr_arith++;
4515       return;
4516    }
4517 
4518    /* Message + FMA we'll count as arithmetic _and_ message */
4519    if (tuple->fma)
4520       stats->nr_arith++;
4521 
4522    switch (clause->message_type) {
4523    case BIFROST_MESSAGE_VARYING:
4524       /* Check components interpolated */
4525       stats->nr_varying +=
4526          (clause->message->vecsize + 1) *
4527          (bi_is_regfmt_16(clause->message->register_format) ? 1 : 2);
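      /* e.g. an fp32 vec4 interpolation (vecsize == 3) counts as
       * (3 + 1) * 2 = 8 16-bit varying channels */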
4528       break;
4529 
4530    case BIFROST_MESSAGE_VARTEX:
4531       /* 2 coordinates, fp32 each */
4532       stats->nr_varying += (2 * 2);
4533       FALLTHROUGH;
4534    case BIFROST_MESSAGE_TEX:
4535       stats->nr_texture++;
4536       break;
4537 
4538    case BIFROST_MESSAGE_ATTRIBUTE:
4539    case BIFROST_MESSAGE_LOAD:
4540    case BIFROST_MESSAGE_STORE:
4541    case BIFROST_MESSAGE_ATOMIC:
4542       stats->nr_ldst++;
4543       break;
4544 
4545    case BIFROST_MESSAGE_NONE:
4546    case BIFROST_MESSAGE_BARRIER:
4547    case BIFROST_MESSAGE_BLEND:
4548    case BIFROST_MESSAGE_TILE:
4549    case BIFROST_MESSAGE_Z_STENCIL:
4550    case BIFROST_MESSAGE_ATEST:
4551    case BIFROST_MESSAGE_JOB:
4552    case BIFROST_MESSAGE_64BIT:
4553       /* Nothing to do */
4554       break;
4555    }
4556 }
4557 
4558 /*
4559  * v7 allows preloading LD_VAR or VAR_TEX messages that must complete before the
4560  * shader completes. These costs are not accounted for in the general cycle
4561  * counts, so this function calculates the effective cost of these messages, as
4562  * if they were executed by shader code.
4563  */
4564 static unsigned
4565 bi_count_preload_cost(bi_context *ctx)
4566 {
4567    /* Units: 1/16 of a normalized cycle, assuming that we may interpolate
4568     * 16 fp16 varying components per cycle or fetch two texels per cycle.
4569     */
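   /* For example, a single VAR_TEX preload costs 12 units (4 fp16
    * coordinate half-words plus 8 units for the texel fetch at two texels
    * per cycle), i.e. 12/16 of a cycle, while an fp32 vec4 LD_VAR preload
    * costs 4 * 2 = 8 units. */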
4570    unsigned cost = 0;
4571 
4572    for (unsigned i = 0; i < ARRAY_SIZE(ctx->info.bifrost->messages); ++i) {
4573       struct bifrost_message_preload msg = ctx->info.bifrost->messages[i];
4574 
4575       if (msg.enabled && msg.texture) {
4576          /* 2 coordinates, 2 half-words each, plus the texture fetch */
4577          cost += 12;
4578       } else if (msg.enabled) {
4579          cost += (msg.num_components * (msg.fp16 ? 1 : 2));
4580       }
4581    }
4582 
4583    return cost;
4584 }
4585 
4586 static const char *
4587 bi_shader_stage_name(bi_context *ctx)
4588 {
4589    if (ctx->idvs == BI_IDVS_VARYING)
4590       return "MESA_SHADER_VARYING";
4591    else if (ctx->idvs == BI_IDVS_POSITION)
4592       return "MESA_SHADER_POSITION";
4593    else if (ctx->inputs->is_blend)
4594       return "MESA_SHADER_BLEND";
4595    else
4596       return gl_shader_stage_name(ctx->stage);
4597 }
4598 
4599 static char *
4600 bi_print_stats(bi_context *ctx, unsigned size)
4601 {
4602    struct bi_stats stats = {0};
4603 
4604    /* Count instructions, clauses, and tuples. Also attempt to construct
4605     * normalized execution engine cycle counts, using the following ratio:
4606     *
4607     * 24 arith tuples/cycle
4608     * 2 texture messages/cycle
4609     * 16 x 16-bit varying channels interpolated/cycle
4610     * 1 load store message/cycle
4611     *
4612     * These numbers seem to match Arm Mobile Studio's heuristic. The real
4613     * cycle counts are surely more complicated.
4614     */
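   /* For example, 48 arithmetic tuples, 2 texture messages, 16 varying
    * channels and 1 load/store message give 48/24 = 2.0 arithmetic cycles
    * versus max(2/2, 16/16, 1/1) = 1.0 message cycles, so the reported
    * bound would be 2.0 cycles. */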
4615 
4616    bi_foreach_block(ctx, block) {
4617       bi_foreach_clause_in_block(block, clause) {
4618          stats.nr_clauses++;
4619          stats.nr_tuples += clause->tuple_count;
4620 
4621          for (unsigned i = 0; i < clause->tuple_count; ++i)
4622             bi_count_tuple_stats(clause, &clause->tuples[i], &stats);
4623       }
4624    }
4625 
4626    float cycles_arith = ((float)stats.nr_arith) / 24.0;
4627    float cycles_texture = ((float)stats.nr_texture) / 2.0;
4628    float cycles_varying = ((float)stats.nr_varying) / 16.0;
4629    float cycles_ldst = ((float)stats.nr_ldst) / 1.0;
4630 
4631    float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst);
4632    float cycles_bound = MAX2(cycles_arith, cycles_message);
4633 
4634    /* Thread count and register pressure are traded off only on v7 */
4635    bool full_threads = (ctx->arch == 7 && ctx->info.work_reg_count <= 32);
4636    unsigned nr_threads = full_threads ? 2 : 1;
4637 
4638    /* Dump stats */
4639    char *str = ralloc_asprintf(
4640       NULL,
4641       "%s shader: "
4642       "%u inst, %u tuples, %u clauses, "
4643       "%f cycles, %f arith, %f texture, %f vary, %f ldst, "
4644       "%u quadwords, %u threads",
4645       bi_shader_stage_name(ctx), stats.nr_ins, stats.nr_tuples,
4646       stats.nr_clauses, cycles_bound, cycles_arith, cycles_texture,
4647       cycles_varying, cycles_ldst, size / 16, nr_threads);
4648 
4649    if (ctx->arch == 7) {
4650       ralloc_asprintf_append(&str, ", %u preloads", bi_count_preload_cost(ctx));
4651    }
4652 
4653    ralloc_asprintf_append(&str, ", %u loops, %u:%u spills:fills",
4654                           ctx->loop_count, ctx->spills, ctx->fills);
4655 
4656    return str;
4657 }
4658 
4659 static char *
4660 va_print_stats(bi_context *ctx, unsigned size)
4661 {
4662    unsigned nr_ins = 0;
4663    struct va_stats stats = {0};
4664 
4665    /* Count instructions */
4666    bi_foreach_instr_global(ctx, I) {
4667       nr_ins++;
4668       va_count_instr_stats(I, &stats);
4669    }
4670 
4671    /* Mali G78 peak performance:
4672     *
4673     * 64 FMA instructions per cycle
4674     * 64 CVT instructions per cycle
4675     * 16 SFU instructions per cycle
4676     * 8 x 32-bit varying channels interpolated per cycle
4677     * 4 texture instructions per cycle
4678     * 1 load/store operation per cycle
4679     */
4680 
4681    float cycles_fma = ((float)stats.fma) / 64.0;
4682    float cycles_cvt = ((float)stats.cvt) / 64.0;
4683    float cycles_sfu = ((float)stats.sfu) / 16.0;
4684    float cycles_v = ((float)stats.v) / 16.0;
4685    float cycles_t = ((float)stats.t) / 4.0;
4686    float cycles_ls = ((float)stats.ls) / 1.0;
4687 
4688    /* Calculate the bound */
4689    float cycles = MAX2(MAX3(cycles_fma, cycles_cvt, cycles_sfu),
4690                        MAX3(cycles_v, cycles_t, cycles_ls));
4691 
4692    /* Thread count and register pressure are traded off */
4693    unsigned nr_threads = (ctx->info.work_reg_count <= 32) ? 2 : 1;
4694 
4695    /* Dump stats */
4696    return ralloc_asprintf(NULL,
4697                           "%s shader: "
4698                           "%u inst, %f cycles, %f fma, %f cvt, %f sfu, %f v, "
4699                           "%f t, %f ls, %u quadwords, %u threads, %u loops, "
4700                           "%u:%u spills:fills",
4701                           bi_shader_stage_name(ctx), nr_ins, cycles, cycles_fma,
4702                           cycles_cvt, cycles_sfu, cycles_v, cycles_t, cycles_ls,
4703                           size / 16, nr_threads, ctx->loop_count, ctx->spills,
4704                           ctx->fills);
4705 }
4706 
4707 static int
4708 glsl_type_size(const struct glsl_type *type, bool bindless)
4709 {
4710    return glsl_count_attribute_slots(type, false);
4711 }
4712 
4713 /* Split stores to memory. We don't split stores to vertex outputs, since
4714  * nir_lower_io_to_temporaries will ensure there's only a single write.
4715  */
4716 
4717 static bool
4718 should_split_wrmask(const nir_instr *instr, UNUSED const void *data)
4719 {
4720    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
4721 
4722    switch (intr->intrinsic) {
4723    case nir_intrinsic_store_ssbo:
4724    case nir_intrinsic_store_shared:
4725    case nir_intrinsic_store_global:
4726    case nir_intrinsic_store_scratch:
4727       return true;
4728    default:
4729       return false;
4730    }
4731 }
4732 
4733 /*
4734  * Some operations are only available as 32-bit instructions. 64-bit floats are
4735  * unsupported and ints are lowered with nir_lower_int64.  Certain 8-bit and
4736  * 16-bit instructions, however, are lowered here.
4737  */
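/* For example, this callback returns 32 for a 16-bit nir_op_fsin, asking NIR
 * to widen it to a 32-bit operation, and 0 for a 32-bit one (no lowering). */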
4738 static unsigned
4739 bi_lower_bit_size(const nir_instr *instr, UNUSED void *data)
4740 {
4741    if (instr->type != nir_instr_type_alu)
4742       return 0;
4743 
4744    nir_alu_instr *alu = nir_instr_as_alu(instr);
4745 
4746    switch (alu->op) {
4747    case nir_op_fexp2:
4748    case nir_op_flog2:
4749    case nir_op_fpow:
4750    case nir_op_fsin:
4751    case nir_op_fcos:
4752    case nir_op_bit_count:
4753    case nir_op_bitfield_reverse:
4754       return (nir_src_bit_size(alu->src[0].src) == 32) ? 0 : 32;
4755    default:
4756       return 0;
4757    }
4758 }
4759 
4760 /* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4,
4761  * transcendentals are an exception. Also shifts because of lane size mismatch
4762  * (8-bit in Bifrost, 32-bit in NIR; TODO: workaround). Some conversions need
4763  * to be scalarized due to type size. */
4764 
4765 static uint8_t
4766 bi_vectorize_filter(const nir_instr *instr, const void *data)
4767 {
4768    /* Defaults work for everything else */
4769    if (instr->type != nir_instr_type_alu)
4770       return 0;
4771 
4772    const nir_alu_instr *alu = nir_instr_as_alu(instr);
4773 
4774    switch (alu->op) {
4775    case nir_op_frcp:
4776    case nir_op_frsq:
4777    case nir_op_ishl:
4778    case nir_op_ishr:
4779    case nir_op_ushr:
4780    case nir_op_f2i16:
4781    case nir_op_f2u16:
4782    case nir_op_extract_u8:
4783    case nir_op_extract_i8:
4784    case nir_op_extract_u16:
4785    case nir_op_extract_i16:
4786    case nir_op_insert_u16:
4787       return 1;
4788    default:
4789       break;
4790    }
4791 
4792    /* Vectorized instructions cannot write more than 32 bits */
4793    int dst_bit_size = alu->def.bit_size;
4794    if (dst_bit_size == 16)
4795       return 2;
4796    else
4797       return 1;
4798 }
4799 
4800 static bool
4801 bi_scalarize_filter(const nir_instr *instr, const void *data)
4802 {
4803    if (instr->type != nir_instr_type_alu)
4804       return false;
4805 
4806    const nir_alu_instr *alu = nir_instr_as_alu(instr);
4807 
4808    switch (alu->op) {
4809    case nir_op_pack_uvec2_to_uint:
4810    case nir_op_pack_uvec4_to_uint:
4811       return false;
4812    default:
4813       return true;
4814    }
4815 }
4816 
4817 /* Ensure we write exactly 4 components */
4818 static nir_def *
4819 bifrost_nir_valid_channel(nir_builder *b, nir_def *in, unsigned channel,
4820                           unsigned first, unsigned mask)
4821 {
4822    if (!(mask & BITFIELD_BIT(channel)))
4823       channel = first;
4824 
4825    return nir_channel(b, in, channel);
4826 }
4827 
4828 /* Lower fragment store_output instructions to always write 4 components,
4829  * matching the hardware semantic. This may require additional moves. Skipping
4830  * these moves is possible in theory, but invokes undefined behaviour in the
4831  * compiler. The DDK inserts these moves, so we will as well. */
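/* For instance, a vec2 store_output of value v with write mask 0x3 becomes a
 * store of vec4(v.x, v.y, v.x, v.x) with write mask 0xF, replicating the
 * first written component into the unwritten lanes. */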
4832 
4833 static bool
4834 bifrost_nir_lower_blend_components(struct nir_builder *b,
4835                                    nir_intrinsic_instr *intr, void *data)
4836 {
4837    if (intr->intrinsic != nir_intrinsic_store_output)
4838       return false;
4839 
4840    nir_def *in = intr->src[0].ssa;
4841    unsigned first = nir_intrinsic_component(intr);
4842    unsigned mask = nir_intrinsic_write_mask(intr);
4843 
4844    assert(first == 0 && "shouldn't get nonzero components");
4845 
4846    /* Nothing to do */
4847    if (mask == BITFIELD_MASK(4))
4848       return false;
4849 
4850    b->cursor = nir_before_instr(&intr->instr);
4851 
4852    /* Replicate the first valid component instead */
4853    nir_def *replicated =
4854       nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask),
4855                bifrost_nir_valid_channel(b, in, 1, first, mask),
4856                bifrost_nir_valid_channel(b, in, 2, first, mask),
4857                bifrost_nir_valid_channel(b, in, 3, first, mask));
4858 
4859    /* Rewrite to use our replicated version */
4860    nir_src_rewrite(&intr->src[0], replicated);
4861    nir_intrinsic_set_component(intr, 0);
4862    nir_intrinsic_set_write_mask(intr, 0xF);
4863    intr->num_components = 4;
4864 
4865    return true;
4866 }
4867 
4868 static nir_mem_access_size_align
4869 mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
4870                          uint8_t bit_size, uint32_t align_mul,
4871                          uint32_t align_offset, bool offset_is_const,
4872                          enum gl_access_qualifier access, const void *cb_data)
4873 {
4874    uint32_t align = nir_combined_align(align_mul, align_offset);
4875    assert(util_is_power_of_two_nonzero(align));
4876 
4877    /* No more than 16 bytes at a time. */
4878    bytes = MIN2(bytes, 16);
4879 
4880    /* If the number of bytes is a multiple of 4, use 32-bit loads. Else if it's
4881     * a multiple of 2, use 16-bit loads. Else use 8-bit loads.
4882     *
4883     * But if we're only aligned to 1 byte, use 8-bit loads. If we're only
4884     * aligned to 2 bytes, use 16-bit loads, unless we needed 8-bit loads due to
4885     * the size.
4886     */
4887    if ((bytes & 1) || (align == 1))
4888       bit_size = 8;
4889    else if ((bytes & 2) || (align == 2))
4890       bit_size = 16;
4891    else if (bit_size >= 32)
4892       bit_size = 32;
4893 
4894    unsigned num_comps = MIN2(bytes / (bit_size / 8), 4);
4895 
4896    /* Push constants require 32-bit loads. */
4897    if (intrin == nir_intrinsic_load_push_constant) {
4898       if (align_mul >= 4) {
4899          /* If align_mul is at least 4, we can use align_offset to find
4900           * the exact number of words we need to read.
4901           */
4902          num_comps = DIV_ROUND_UP((align_offset % 4) + bytes, 4);
4903       } else {
4904          /* If bytes is aligned on 32-bit, the access might still cross one
4905           * word at the beginning, and one word at the end. If bytes is not
4906           * aligned on 32-bit, the extra two words should cover for both the
4907           * size and offset mis-alignment.
4908           */
4909          num_comps = (bytes / 4) + 2;
4910       }
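      /* e.g. a 6-byte push-constant load with align_mul == 2 reads
       * (6 / 4) + 2 = 3 32-bit words, enough to cover a word straddled at
       * either end of the range. */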
4911 
4912       bit_size = MAX2(bit_size, 32);
4913       align = 4;
4914    } else {
4915       align = bit_size / 8;
4916    }
4917 
4918    return (nir_mem_access_size_align){
4919       .num_components = num_comps,
4920       .bit_size = bit_size,
4921       .align = align,
4922       .shift = nir_mem_access_shift_method_scalar,
4923    };
4924 }
4925 
4926 static bool
4927 mem_vectorize_cb(unsigned align_mul, unsigned align_offset, unsigned bit_size,
4928                  unsigned num_components, int64_t hole_size,
4929                  nir_intrinsic_instr *low, nir_intrinsic_instr *high,
4930                  void *data)
4931 {
4932    if (hole_size > 0)
4933       return false;
4934 
4935    /* Must be aligned to the size of the load */
4936    unsigned align = nir_combined_align(align_mul, align_offset);
4937    if ((bit_size / 8) > align)
4938       return false;
4939 
4940    if (num_components > 4)
4941       return false;
4942 
4943    if (bit_size > 32)
4944       return false;
4945 
4946    return true;
4947 }
4948 
4949 static void
4950 bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
4951 {
4952    NIR_PASS(_, nir, nir_opt_shrink_stores, true);
4953 
4954    bool progress;
4955 
4956    do {
4957       progress = false;
4958 
4959       NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
4960       NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL);
4961 
4962       NIR_PASS(progress, nir, nir_copy_prop);
4963       NIR_PASS(progress, nir, nir_opt_remove_phis);
4964       NIR_PASS(progress, nir, nir_opt_dce);
4965       NIR_PASS(progress, nir, nir_opt_dead_cf);
4966       NIR_PASS(progress, nir, nir_opt_cse);
4967       NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
4968       NIR_PASS(progress, nir, nir_opt_algebraic);
4969       NIR_PASS(progress, nir, nir_opt_constant_folding);
4970 
4971       NIR_PASS(progress, nir, nir_opt_undef);
4972       NIR_PASS(progress, nir, nir_lower_undef_to_zero);
4973 
4974       NIR_PASS(progress, nir, nir_opt_shrink_vectors, false);
4975       NIR_PASS(progress, nir, nir_opt_loop_unroll);
4976    } while (progress);
4977 
4978    NIR_PASS(
4979       progress, nir, nir_opt_load_store_vectorize,
4980       &(const nir_load_store_vectorize_options){
4981          .modes = nir_var_mem_global | nir_var_mem_shared | nir_var_shader_temp,
4982          .callback = mem_vectorize_cb,
4983       });
4984    NIR_PASS(progress, nir, nir_lower_pack);
4985 
4986    /* nir_lower_pack can generate split operations, execute algebraic again to
4987     * handle them */
4988    NIR_PASS(progress, nir, nir_opt_algebraic);
4989 
4990    /* TODO: Why is 64-bit getting rematerialized?
4991     * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */
4992    NIR_PASS(progress, nir, nir_lower_int64);
4993 
4994    /* We need to clean up after each iteration of late algebraic
4995     * optimizations, since otherwise NIR can produce weird edge cases
4996     * (like fneg of a constant) which we don't handle */
4997    bool late_algebraic = true;
4998    while (late_algebraic) {
4999       late_algebraic = false;
5000       NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
5001       NIR_PASS(progress, nir, nir_opt_constant_folding);
5002       NIR_PASS(progress, nir, nir_copy_prop);
5003       NIR_PASS(progress, nir, nir_opt_dce);
5004       NIR_PASS(progress, nir, nir_opt_cse);
5005    }
5006 
5007    /* This opt currently helps on Bifrost but not Valhall */
5008    if (gpu_id < 0x9000)
5009       NIR_PASS(progress, nir, bifrost_nir_opt_boolean_bitwise);
5010 
5011    NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
5012    NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL);
5013    NIR_PASS(progress, nir, nir_lower_bool_to_bitsize);
5014 
5015    /* Prepass to simplify instruction selection */
5016    late_algebraic = false;
5017    NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late);
5018 
5019    while (late_algebraic) {
5020       late_algebraic = false;
5021       NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
5022       NIR_PASS(progress, nir, nir_opt_constant_folding);
5023       NIR_PASS(progress, nir, nir_copy_prop);
5024       NIR_PASS(progress, nir, nir_opt_dce);
5025       NIR_PASS(progress, nir, nir_opt_cse);
5026    }
5027 
5028    NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
5029    NIR_PASS(progress, nir, nir_opt_dce);
5030 
5031    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
5032       NIR_PASS(_, nir, nir_shader_intrinsics_pass,
5033                bifrost_nir_lower_blend_components, nir_metadata_control_flow,
5034                NULL);
5035    }
5036 
5037    /* Backend scheduler is purely local, so do some global optimizations
5038     * to reduce register pressure. */
5039    nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
5040                                nir_move_load_input | nir_move_comparisons |
5041                                nir_move_copies | nir_move_load_ssbo;
5042 
5043    NIR_PASS(_, nir, nir_opt_sink, move_all);
5044    NIR_PASS(_, nir, nir_opt_move, move_all);
5045 
5046    /* We might lower attribute, varying, and image indirects. Use the
5047     * gathered info to skip the extra analysis in the happy path. */
5048    bool any_indirects = nir->info.inputs_read_indirectly ||
5049                         nir->info.outputs_accessed_indirectly ||
5050                         nir->info.patch_inputs_read_indirectly ||
5051                         nir->info.patch_outputs_accessed_indirectly ||
5052                         nir->info.images_used[0];
5053 
5054    if (any_indirects) {
5055       nir_divergence_analysis(nir);
5056       NIR_PASS(_, nir, bi_lower_divergent_indirects,
5057                pan_subgroup_size(pan_arch(gpu_id)));
5058    }
5059 }
5060 
5061 static void
5062 bi_opt_post_ra(bi_context *ctx)
5063 {
5064    bi_foreach_instr_global_safe(ctx, ins) {
5065       if (ins->op == BI_OPCODE_MOV_I32 &&
5066           bi_is_equiv(ins->dest[0], ins->src[0]))
5067          bi_remove_instruction(ins);
5068    }
5069 }
5070 
5071 /* Dead code elimination for branches at the end of a block - only one branch
5072  * per block is legal semantically, but unreachable jumps can be generated.
5073  * Likewise on Bifrost we can generate jumps to the terminal block which need
5074  * to be lowered away to a jump to #0x0, which induces successful termination.
5075  * That trick doesn't work on Valhall, which needs a NOP inserted in the
5076  * terminal block instead.
5077  */
5078 static void
5079 bi_lower_branch(bi_context *ctx, bi_block *block)
5080 {
5081    bool cull_terminal = (ctx->arch <= 8);
5082    bool branched = false;
5083 
5084    bi_foreach_instr_in_block_safe(block, ins) {
5085       if (!ins->branch_target)
5086          continue;
5087 
5088       if (branched) {
5089          bi_remove_instruction(ins);
5090          continue;
5091       }
5092 
5093       branched = true;
5094 
5095       if (!bi_is_terminal_block(ins->branch_target))
5096          continue;
5097 
5098       if (cull_terminal)
5099          ins->branch_target = NULL;
5100       else if (ins->branch_target)
5101          ins->branch_target->needs_nop = true;
5102    }
5103 }
5104 
5105 static void
5106 bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary, unsigned offset)
5107 {
5108    unsigned final_clause = bi_pack(ctx, binary);
5109 
5110    /* If we need to wait for ATEST or BLEND in the first clause, pass the
5111     * corresponding bits through to the renderer state descriptor */
5112    bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
5113    bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL);
5114 
5115    unsigned first_deps = first_clause ? first_clause->dependencies : 0;
5116    ctx->info.bifrost->wait_6 = (first_deps & (1 << 6));
5117    ctx->info.bifrost->wait_7 = (first_deps & (1 << 7));
5118 
5119    /* Pad the shader with enough zero bytes to trick the prefetcher,
5120     * unless we're compiling an empty shader (in which case we don't pad
5121     * so the size remains 0) */
5122    unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause;
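   /* e.g. a 64-byte final clause is followed by 128 - 64 = 64 zero bytes,
    * so anything prefetched past the end of the shader reads as zero. */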
5123 
5124    if (binary->size - offset) {
5125       memset(util_dynarray_grow(binary, uint8_t, prefetch_size), 0,
5126              prefetch_size);
5127    }
5128 }
5129 
5130 /*
5131  * Build a bit mask of varyings (by location) that are flatshaded. This
5132  * information is needed by lower_mediump_io, as we don't yet support 16-bit
5133  * flat varyings.
5134  *
5135  * Also varyings that are used as texture coordinates should be kept at fp32 so
5136  * the texture instruction may be promoted to VAR_TEX. In general this is a good
5137  * idea, as fp16 texture coordinates are not supported by the hardware and are
5138  * usually inappropriate. (There are even relevant CTS bugs here.)
5139  *
5140  * TODO: If we compacted the varyings with some fixup code in the vertex shader,
5141  * we could implement 16-bit flat varyings. Consider if this case matters.
5142  *
5143  * TODO: The texture coordinate handling could be less heavyhanded.
5144  */
5145 static bool
5146 bi_gather_texcoords(nir_builder *b, nir_instr *instr, void *data)
5147 {
5148    uint64_t *mask = data;
5149 
5150    if (instr->type != nir_instr_type_tex)
5151       return false;
5152 
5153    nir_tex_instr *tex = nir_instr_as_tex(instr);
5154 
5155    int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
5156    if (coord_idx < 0)
5157       return false;
5158 
5159    nir_src src = tex->src[coord_idx].src;
5160    nir_scalar x = nir_scalar_resolved(src.ssa, 0);
5161    nir_scalar y = nir_scalar_resolved(src.ssa, 1);
5162 
5163    if (x.def != y.def)
5164       return false;
5165 
5166    nir_instr *parent = x.def->parent_instr;
5167 
5168    if (parent->type != nir_instr_type_intrinsic)
5169       return false;
5170 
5171    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
5172 
5173    if (intr->intrinsic != nir_intrinsic_load_interpolated_input)
5174       return false;
5175 
5176    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
5177    *mask |= BITFIELD64_BIT(sem.location);
5178    return false;
5179 }
5180 
5181 static uint64_t
5182 bi_fp32_varying_mask(nir_shader *nir)
5183 {
5184    uint64_t mask = 0;
5185 
5186    assert(nir->info.stage == MESA_SHADER_FRAGMENT);
5187 
5188    nir_foreach_shader_in_variable(var, nir) {
5189       if (var->data.interpolation == INTERP_MODE_FLAT)
5190          mask |= BITFIELD64_BIT(var->data.location);
5191    }
5192 
5193    nir_shader_instructions_pass(nir, bi_gather_texcoords, nir_metadata_all,
5194                                 &mask);
5195 
5196    return mask;
5197 }
5198 
5199 static bool
5200 bi_lower_sample_mask_writes(nir_builder *b, nir_intrinsic_instr *intr,
5201                             void *data)
5202 {
5203    if (intr->intrinsic != nir_intrinsic_store_output)
5204       return false;
5205 
5206    assert(b->shader->info.stage == MESA_SHADER_FRAGMENT);
5207    if (nir_intrinsic_io_semantics(intr).location != FRAG_RESULT_SAMPLE_MASK)
5208       return false;
5209 
5210    b->cursor = nir_before_instr(&intr->instr);
5211 
5212    nir_def *orig = nir_load_sample_mask(b);
5213 
5214    nir_src_rewrite(&intr->src[0], nir_iand(b, orig, intr->src[0].ssa));
5215    return true;
5216 }
5217 
5218 static bool
5219 bi_lower_load_output(nir_builder *b, nir_intrinsic_instr *intr,
5220                      UNUSED void *data)
5221 {
5222    if (intr->intrinsic != nir_intrinsic_load_output)
5223       return false;
5224 
5225    unsigned loc = nir_intrinsic_io_semantics(intr).location;
5226    assert(loc >= FRAG_RESULT_DATA0);
5227    unsigned rt = loc - FRAG_RESULT_DATA0;
5228 
5229    b->cursor = nir_before_instr(&intr->instr);
5230 
5231    nir_def *conversion = nir_load_rt_conversion_pan(
5232       b, .base = rt, .src_type = nir_intrinsic_dest_type(intr));
5233 
5234    nir_def *lowered = nir_load_converted_output_pan(
5235       b, intr->def.num_components, intr->def.bit_size, conversion,
5236       .dest_type = nir_intrinsic_dest_type(intr),
5237       .io_semantics = nir_intrinsic_io_semantics(intr));
5238 
5239    nir_def_rewrite_uses(&intr->def, lowered);
5240    return true;
5241 }
5242 
5243 static bool
5244 bi_lower_subgroups(nir_builder *b, nir_intrinsic_instr *intr, void *data)
5245 {
5246    unsigned int gpu_id = *(unsigned int *)data;
5247    unsigned int arch = pan_arch(gpu_id);
5248 
5249    b->cursor = nir_before_instr(&intr->instr);
5250 
5251    nir_def *val = NULL;
5252    switch (intr->intrinsic) {
5253    case nir_intrinsic_vote_any:
5254       val = nir_ine_imm(b, nir_ballot(b, 1, 32, intr->src[0].ssa), 0);
5255       break;
5256 
5257    case nir_intrinsic_vote_all:
5258       val = nir_ieq_imm(b, nir_ballot(b, 1, 32, nir_inot(b, intr->src[0].ssa)), 0);
5259       break;
5260 
5261    case nir_intrinsic_load_subgroup_id: {
5262       nir_def *local_id = nir_load_local_invocation_id(b);
5263       nir_def *local_size = nir_load_workgroup_size(b);
5264       /* local_id.x + local_size.x * (local_id.y + local_size.y * local_id.z) */
5265       nir_def *flat_local_id =
5266          nir_iadd(b,
5267             nir_channel(b, local_id, 0),
5268             nir_imul(b,
5269                nir_channel(b, local_size, 0),
5270                nir_iadd(b,
5271                   nir_channel(b, local_id, 1),
5272                   nir_imul(b,
5273                      nir_channel(b, local_size, 1),
5274                      nir_channel(b, local_id, 2)))));
5275       /*
5276        * nir_udiv_imm with a power of two divisor, which pan_subgroup_size is,
5277        * will construct a right shift instead of an udiv.
5278        */
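      /* e.g. a subgroup size of 8 turns this into flat_local_id >> 3 */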
5279       val = nir_udiv_imm(b, flat_local_id, pan_subgroup_size(arch));
5280       break;
5281    }
5282 
5283    case nir_intrinsic_load_subgroup_size:
5284       val = nir_imm_int(b, pan_subgroup_size(arch));
5285       break;
5286 
5287    case nir_intrinsic_load_num_subgroups: {
5288       uint32_t subgroup_size = pan_subgroup_size(arch);
5289       assert(!b->shader->info.workgroup_size_variable);
5290       uint32_t workgroup_size =
5291          b->shader->info.workgroup_size[0] *
5292          b->shader->info.workgroup_size[1] *
5293          b->shader->info.workgroup_size[2];
5294       uint32_t num_subgroups = DIV_ROUND_UP(workgroup_size, subgroup_size);
5295       val = nir_imm_int(b, num_subgroups);
5296       break;
5297    }
5298 
5299    default:
5300       return false;
5301    }
5302 
5303    nir_def_rewrite_uses(&intr->def, val);
5304    return true;
5305 }
5306 
5307 bool
5308 bifrost_nir_lower_load_output(nir_shader *nir)
5309 {
5310    assert(nir->info.stage == MESA_SHADER_FRAGMENT);
5311 
5312    return nir_shader_intrinsics_pass(
5313       nir, bi_lower_load_output,
5314       nir_metadata_control_flow, NULL);
5315 }
5316 
5317 void
5318 bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id)
5319 {
5320    /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
5321     * (so we don't accidentally duplicate the epilogue since mesa/st has
5322     * messed with our I/O quite a bit already) */
5323 
5324    NIR_PASS(_, nir, nir_lower_vars_to_ssa);
5325 
5326    if (nir->info.stage == MESA_SHADER_VERTEX) {
5327       if (pan_arch(gpu_id) <= 7)
5328          NIR_PASS(_, nir, pan_nir_lower_vertex_id);
5329 
5330       NIR_PASS(_, nir, nir_lower_viewport_transform);
5331       NIR_PASS(_, nir, nir_lower_point_size, 1.0, 0.0);
5332 
5333       nir_variable *psiz = nir_find_variable_with_location(
5334          nir, nir_var_shader_out, VARYING_SLOT_PSIZ);
5335       if (psiz != NULL)
5336          psiz->data.precision = GLSL_PRECISION_MEDIUM;
5337    }
5338 
5339    /* Get rid of any global vars before we lower to scratch. */
5340    NIR_PASS(_, nir, nir_lower_global_vars_to_local);
5341 
5342    /* Valhall introduces packed thread local storage, which improves cache
5343     * locality of TLS access. However, access to packed TLS cannot
5344     * straddle 16-byte boundaries. As such, when packed TLS is in use
5345     * (currently unconditional for Valhall), we force vec4 alignment for
5346     * scratch access.
5347     */
5348    glsl_type_size_align_func vars_to_scratch_size_align_func =
5349       (gpu_id >= 0x9000) ? glsl_get_vec4_size_align_bytes
5350                          : glsl_get_natural_size_align_bytes;
5351    /* Lower large arrays to scratch and small arrays to bcsel */
5352    NIR_PASS(_, nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
5353             vars_to_scratch_size_align_func, vars_to_scratch_size_align_func);
5354    NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
5355 
5356    NIR_PASS(_, nir, nir_split_var_copies);
5357    NIR_PASS(_, nir, nir_lower_var_copies);
5358    NIR_PASS(_, nir, nir_lower_vars_to_ssa);
5359    NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
5360             glsl_type_size, nir_lower_io_use_interpolated_input_intrinsics);
5361 
5362    if (nir->info.stage == MESA_SHADER_VERTEX)
5363       NIR_PASS(_, nir, pan_nir_lower_noperspective_vs);
5364    if (nir->info.stage == MESA_SHADER_FRAGMENT)
5365       NIR_PASS(_, nir, pan_nir_lower_noperspective_fs);
5366 
5367    /* nir_lower[_explicit]_io is lazy and emits mul+add chains even for
5368     * offsets it could figure out are constant.  Do some constant folding
5369     * before pan_nir_lower_store_component below.
5370     */
5371    NIR_PASS(_, nir, nir_opt_constant_folding);
5372 
5373    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
5374       NIR_PASS(_, nir, nir_lower_mediump_io,
5375                nir_var_shader_in | nir_var_shader_out,
5376                ~bi_fp32_varying_mask(nir), false);
5377 
5378       NIR_PASS(_, nir, nir_shader_intrinsics_pass, bi_lower_sample_mask_writes,
5379                nir_metadata_control_flow, NULL);
5380 
5381       NIR_PASS(_, nir, bifrost_nir_lower_load_output);
5382    } else if (nir->info.stage == MESA_SHADER_VERTEX) {
5383       if (gpu_id >= 0x9000) {
5384          NIR_PASS(_, nir, nir_lower_mediump_io, nir_var_shader_out,
5385                   BITFIELD64_BIT(VARYING_SLOT_PSIZ), false);
5386       }
5387 
5388       NIR_PASS(_, nir, pan_nir_lower_store_component);
5389    }
5390 
5391    nir_lower_mem_access_bit_sizes_options mem_size_options = {
5392       .modes = nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_ssbo |
5393                nir_var_mem_constant | nir_var_mem_task_payload |
5394                nir_var_shader_temp | nir_var_function_temp |
5395                nir_var_mem_global | nir_var_mem_shared,
5396       .callback = mem_access_size_align_cb,
5397    };
5398    NIR_PASS(_, nir, nir_lower_mem_access_bit_sizes, &mem_size_options);
5399 
5400    nir_lower_ssbo_options ssbo_opts = {
5401       .native_loads = pan_arch(gpu_id) >= 9,
5402       .native_offset = pan_arch(gpu_id) >= 9,
5403    };
5404    NIR_PASS(_, nir, nir_lower_ssbo, &ssbo_opts);
5405 
5406    NIR_PASS(_, nir, pan_lower_sample_pos);
5407    NIR_PASS(_, nir, nir_lower_bit_size, bi_lower_bit_size, NULL);
5408    NIR_PASS(_, nir, nir_lower_64bit_phis);
5409    NIR_PASS(_, nir, pan_lower_helper_invocation);
5410    NIR_PASS(_, nir, nir_lower_int64);
5411 
5412    NIR_PASS(_, nir, nir_opt_idiv_const, 8);
5413    NIR_PASS(_, nir, nir_lower_idiv,
5414             &(nir_lower_idiv_options){.allow_fp16 = true});
5415 
5416    NIR_PASS(_, nir, nir_lower_tex,
5417             &(nir_lower_tex_options){
5418                .lower_txs_lod = true,
5419                .lower_txp = ~0,
5420                .lower_tg4_broadcom_swizzle = true,
5421                .lower_txd_cube_map = true,
5422                .lower_invalid_implicit_lod = true,
5423                .lower_index_to_offset = true,
5424             });
5425 
5426    NIR_PASS(_, nir, nir_lower_image_atomics_to_global);
5427 
5428    /* on bifrost, lower MSAA load/stores to 3D load/stores */
5429    if (pan_arch(gpu_id) < 9)
5430       NIR_PASS(_, nir, pan_nir_lower_image_ms);
5431 
5432    /*
5433     * TODO: we can implement certain operations (notably reductions, scans,
5434     * certain shuffles, etc) more efficiently than nir_lower_subgroups. Moreover
5435     * we can implement reductions and scans on f16vec2 values without splitting
5436     * to scalar first.
5437     */
5438    bool lower_subgroups_progress = false;
5439    NIR_PASS(lower_subgroups_progress, nir, nir_lower_subgroups,
5440       &(nir_lower_subgroups_options) {
5441          .subgroup_size = pan_subgroup_size(pan_arch(gpu_id)),
5442          .ballot_bit_size = 32,
5443          .ballot_components = 1,
5444          .lower_to_scalar = true,
5445          .lower_vote_eq = true,
5446          .lower_vote_bool_eq = true,
5447          .lower_first_invocation_to_ballot = true,
5448          .lower_read_first_invocation = true,
5449          .lower_subgroup_masks = true,
5450          .lower_relative_shuffle = true,
5451          .lower_shuffle = true,
5452          .lower_quad = true,
5453          .lower_quad_broadcast_dynamic = true,
5454          .lower_quad_vote = true,
5455          .lower_elect = true,
5456          .lower_rotate_to_shuffle = true,
5457          .lower_rotate_clustered_to_shuffle = true,
5458          .lower_inverse_ballot = true,
5459          .lower_reduce = true,
5460          .lower_boolean_reduce = true,
5461          .lower_boolean_shuffle = true,
5462       });
5463    /* nir_lower_subgroups creates new vars, clean them up. */
5464    if (lower_subgroups_progress)
5465       NIR_PASS(_, nir, nir_lower_vars_to_ssa);
5466 
5467    NIR_PASS(_, nir, nir_shader_intrinsics_pass, bi_lower_subgroups,
5468             nir_metadata_control_flow, &gpu_id);
5469 
5470    NIR_PASS(_, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
5471    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
5472    NIR_PASS(_, nir, nir_lower_phis_to_scalar, true);
5473    NIR_PASS(_, nir, nir_lower_flrp, 16 | 32 | 64, false /* always_precise */);
5474    NIR_PASS(_, nir, nir_lower_var_copies);
5475    NIR_PASS(_, nir, nir_lower_alu);
5476    NIR_PASS(_, nir, nir_lower_frag_coord_to_pixel_coord);
5477    NIR_PASS(_, nir, pan_nir_lower_frag_coord_zw);
5478 }
5479 
5480 static bi_context *
5481 bi_compile_variant_nir(nir_shader *nir,
5482                        const struct panfrost_compile_inputs *inputs,
5483                        struct util_dynarray *binary, struct bi_shader_info info,
5484                        enum bi_idvs_mode idvs)
5485 {
5486    bi_context *ctx = rzalloc(NULL, bi_context);
5487 
5488    /* There may be another program in the dynarray, start at the end */
5489    unsigned offset = binary->size;
5490 
5491    ctx->inputs = inputs;
5492    ctx->nir = nir;
5493    ctx->stage = nir->info.stage;
5494    ctx->quirks = bifrost_get_quirks(inputs->gpu_id);
5495    ctx->arch = pan_arch(inputs->gpu_id);
5496    ctx->info = info;
5497    ctx->idvs = idvs;
5498    ctx->malloc_idvs = (ctx->arch >= 9) && !inputs->no_idvs;
5499 
5500    if (idvs != BI_IDVS_NONE) {
5501       /* Specializing shaders for IDVS is destructive, so we need to
5502        * clone. However, the last (second) IDVS shader does not need
5503        * to be preserved so we can skip cloning that one.
5504        */
5505       if (offset == 0)
5506          ctx->nir = nir = nir_shader_clone(ctx, nir);
5507 
5508       NIR_PASS(_, nir, nir_shader_instructions_pass,
5509                bifrost_nir_specialize_idvs, nir_metadata_control_flow, &idvs);
5510 
5511       /* After specializing, clean up the mess */
5512       bool progress = true;
5513 
5514       while (progress) {
5515          progress = false;
5516 
5517          NIR_PASS(progress, nir, nir_opt_dce);
5518          NIR_PASS(progress, nir, nir_opt_dead_cf);
5519       }
5520    }
5521 
5522    /* If nothing is pushed, all UBOs need to be uploaded */
5523    ctx->ubo_mask = ~0;
5524 
5525    list_inithead(&ctx->blocks);
5526 
5527    bool skip_internal = nir->info.internal;
5528    skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL);
5529 
5530    if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
5531       nir_print_shader(nir, stdout);
5532    }
5533 
5534    ctx->allocated_vec = _mesa_hash_table_u64_create(ctx);
5535 
5536    nir_foreach_function_impl(impl, nir) {
5537       nir_index_blocks(impl);
5538 
5539       ctx->indexed_nir_blocks =
5540          rzalloc_array(ctx, bi_block *, impl->num_blocks);
5541 
5542       ctx->ssa_alloc += impl->ssa_alloc;
5543 
5544       emit_cf_list(ctx, &impl->body);
5545       bi_emit_phis_deferred(ctx);
5546       break; /* TODO: Multi-function shaders */
5547    }
5548 
5549    /* Index blocks now that we're done emitting */
5550    bi_foreach_block(ctx, block) {
5551       block->index = ctx->num_blocks++;
5552    }
5553 
5554    bi_validate(ctx, "NIR -> BIR");
5555 
5556    /* If the shader doesn't write any colour or depth outputs, it may
5557     * still need an ATEST at the very end! */
5558    bool need_dummy_atest = (ctx->stage == MESA_SHADER_FRAGMENT) &&
5559                            !ctx->emitted_atest && !bi_skip_atest(ctx, false);
5560 
5561    if (need_dummy_atest) {
5562       bi_block *end = list_last_entry(&ctx->blocks, bi_block, link);
5563       bi_builder b = bi_init_builder(ctx, bi_after_block(end));
5564       bi_emit_atest(&b, bi_zero());
5565    }
5566 
5567    bool optimize = !(bifrost_debug & BIFROST_DBG_NOOPT);
5568 
5569    /* Runs before constant folding */
5570    bi_lower_swizzle(ctx);
5571    bi_validate(ctx, "Early lowering");
5572 
5573    /* Runs before copy prop */
5574    if (optimize && !ctx->inputs->no_ubo_to_push) {
5575       bi_opt_push_ubo(ctx);
5576    }
5577 
5578    if (likely(optimize)) {
5579       bi_opt_copy_prop(ctx);
5580 
5581       while (bi_opt_constant_fold(ctx))
5582          bi_opt_copy_prop(ctx);
5583 
5584       bi_opt_mod_prop_forward(ctx);
5585       bi_opt_mod_prop_backward(ctx);
5586 
5587       /* Push LD_VAR_IMM/VAR_TEX instructions. Must run after
5588        * mod_prop_backward to fuse VAR_TEX */
5589       if (ctx->arch == 7 && ctx->stage == MESA_SHADER_FRAGMENT &&
5590           !(bifrost_debug & BIFROST_DBG_NOPRELOAD)) {
5591          bi_opt_dce(ctx, false);
5592          bi_opt_message_preload(ctx);
5593          bi_opt_copy_prop(ctx);
5594       }
5595 
5596       bi_opt_dce(ctx, false);
5597       bi_opt_cse(ctx);
5598       bi_opt_dce(ctx, false);
5599       if (!ctx->inputs->no_ubo_to_push)
5600          bi_opt_reorder_push(ctx);
5601       bi_validate(ctx, "Optimization passes");
5602    }
5603 
5604    bi_lower_opt_instructions(ctx);
5605 
5606    if (ctx->arch >= 9) {
5607       va_optimize(ctx);
5608       va_lower_isel(ctx);
5609 
5610       bi_foreach_instr_global_safe(ctx, I) {
5611          /* Phis become single moves so shouldn't be affected */
5612          if (I->op == BI_OPCODE_PHI)
5613             continue;
5614 
5615          va_lower_constants(ctx, I);
5616 
5617          bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
5618          va_repair_fau(&b, I);
5619       }
5620 
5621       /* We need to clean up after constant lowering */
5622       if (likely(optimize)) {
5623          bi_opt_cse(ctx);
5624          bi_opt_dce(ctx, false);
5625       }
5626 
5627       bi_validate(ctx, "Valhall passes");
5628    }
5629 
5630    bi_foreach_block(ctx, block) {
5631       bi_lower_branch(ctx, block);
5632    }
5633 
5634    if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
5635       bi_print_shader(ctx, stdout);
5636 
5637    /* Analyze before register allocation to avoid false dependencies. The
5638     * skip bit is a function of only the data flow graph and is invariant
5639     * under valid scheduling. Helpers are only defined for fragment
5640     * shaders, so this analysis is only required in fragment shaders.
5641     */
5642    if (ctx->stage == MESA_SHADER_FRAGMENT) {
5643       bi_opt_dce(ctx, false);
5644       bi_analyze_helper_requirements(ctx);
5645    }
5646 
5647    /* Fuse TEXC after analyzing helper requirements so the analysis
5648     * doesn't have to know about dual textures */
5649    if (likely(optimize)) {
5650       bi_opt_fuse_dual_texture(ctx);
5651    }
5652 
5653    /* Lower FAU after fusing dual texture, because fusing dual texture
5654     * creates new immediates that themselves may need lowering.
5655     */
5656    if (ctx->arch <= 8) {
5657       bi_lower_fau(ctx);
5658    }
5659 
5660    /* Lowering FAU can create redundant moves. Run CSE+DCE to clean up. */
5661    if (likely(optimize)) {
5662       bi_opt_cse(ctx);
5663       bi_opt_dce(ctx, false);
5664    }
5665 
5666    bi_validate(ctx, "Late lowering");
5667 
5668    if (likely(!(bifrost_debug & BIFROST_DBG_NOPSCHED))) {
5669       bi_pressure_schedule(ctx);
5670       bi_validate(ctx, "Pre-RA scheduling");
5671    }
5672 
5673    bi_register_allocate(ctx);
5674 
5675    if (likely(optimize))
5676       bi_opt_post_ra(ctx);
5677 
5678    if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
5679       bi_print_shader(ctx, stdout);
5680 
5681    if (ctx->arch >= 9) {
5682       va_assign_slots(ctx);
5683       va_insert_flow_control_nops(ctx);
5684       va_merge_flow(ctx);
5685       va_mark_last(ctx);
5686    } else {
5687       bi_schedule(ctx);
5688       bi_assign_scoreboard(ctx);
5689 
5690       /* Analyze after scheduling since we depend on instruction
5691        * order. Valhall calls this as part of va_insert_flow_control_nops,
5692        * as the handling for clauses differs from instructions.
5693        */
5694       bi_analyze_helper_terminate(ctx);
5695       bi_mark_clauses_td(ctx);
5696    }
5697 
5698    if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
5699       bi_print_shader(ctx, stdout);
5700 
5701    if (ctx->arch <= 8) {
5702       bi_pack_clauses(ctx, binary, offset);
5703    } else {
5704       bi_pack_valhall(ctx, binary);
5705    }
5706 
5707    if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
5708       if (ctx->arch <= 8) {
5709          disassemble_bifrost(stdout, binary->data + offset,
5710                              binary->size - offset,
5711                              bifrost_debug & BIFROST_DBG_VERBOSE);
5712       } else {
5713          disassemble_valhall(stdout, binary->data + offset,
5714                              binary->size - offset,
5715                              bifrost_debug & BIFROST_DBG_VERBOSE);
5716       }
5717 
5718       fflush(stdout);
5719    }
5720 
5721    if (!skip_internal &&
5722        ((bifrost_debug & BIFROST_DBG_SHADERDB) || inputs->debug)) {
5723       char *shaderdb;
5724 
5725       if (ctx->arch >= 9) {
5726          shaderdb = va_print_stats(ctx, binary->size - offset);
5727       } else {
5728          shaderdb = bi_print_stats(ctx, binary->size - offset);
5729       }
5730 
5731       if (bifrost_debug & BIFROST_DBG_SHADERDB)
5732          fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
5733 
5734       if (inputs->debug)
5735          util_debug_message(inputs->debug, SHADER_INFO, "%s", shaderdb);
5736 
5737       ralloc_free(shaderdb);
5738    }
5739 
5740    return ctx;
5741 }
5742 
5743 static void
5744 bi_compile_variant(nir_shader *nir,
5745                    const struct panfrost_compile_inputs *inputs,
5746                    struct util_dynarray *binary, struct pan_shader_info *info,
5747                    enum bi_idvs_mode idvs)
5748 {
5749    struct bi_shader_info local_info = {
5750       .push = &info->push,
5751       .bifrost = &info->bifrost,
5752       .tls_size = info->tls_size,
5753       .push_offset = info->push.count,
5754    };
5755 
5756    unsigned offset = binary->size;
5757 
5758    /* If there is no position shader (gl_Position is not written), then
5759     * there is no need to build a varying shader either. This case is hit
5760     * for transform feedback only vertex shaders which only make sense with
5761     * rasterizer discard.
5762     */
5763    if ((offset == 0) && (idvs == BI_IDVS_VARYING))
5764       return;
5765 
5766    /* Software invariant: Only a secondary shader can appear at a nonzero
5767     * offset, to keep the ABI simple. */
5768    assert((offset == 0) ^ (idvs == BI_IDVS_VARYING));
5769 
5770    bi_context *ctx =
5771       bi_compile_variant_nir(nir, inputs, binary, local_info, idvs);
5772 
5773    /* A register is preloaded <==> it is live before the first block */
5774    bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
5775    uint64_t preload = first_block->reg_live_in;
5776 
5777    /* If multisampling is used with a blend shader, the blend shader needs
5778     * to access the sample coverage mask in r60 and the sample ID in r61.
5779     * Blend shaders run in the same context as fragment shaders, so if a
5780     * blend shader could run, we need to preload these registers
5781     * conservatively. There is believed to be little cost to doing so, so
5782     * do so always to avoid variants of the preload descriptor.
5783     *
5784     * We only do this on Valhall, as Bifrost has to update the RSD for
5785     * multisampling w/ blend shader anyway, so this is handled in the
5786     * driver. We could unify the paths if the cost is acceptable.
5787     */
5788    if (nir->info.stage == MESA_SHADER_FRAGMENT && ctx->arch >= 9)
5789       preload |= BITFIELD64_BIT(60) | BITFIELD64_BIT(61);
5790 
5791    info->ubo_mask |= ctx->ubo_mask;
5792    info->tls_size = MAX2(info->tls_size, ctx->info.tls_size);
5793 
5794    if (idvs == BI_IDVS_VARYING) {
5795       info->vs.secondary_enable = (binary->size > offset);
5796       info->vs.secondary_offset = offset;
5797       info->vs.secondary_preload = preload;
5798       info->vs.secondary_work_reg_count = ctx->info.work_reg_count;
5799    } else {
5800       info->preload = preload;
5801       info->work_reg_count = ctx->info.work_reg_count;
5802    }
5803 
5804    if (idvs == BI_IDVS_POSITION && !nir->info.internal &&
5805        nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ)) {
5806       /* Find the psiz write */
5807       bi_instr *write = NULL;
5808 
5809       bi_foreach_instr_global(ctx, I) {
5810          if (I->op == BI_OPCODE_STORE_I16 && I->seg == BI_SEG_POS) {
5811             write = I;
5812             break;
5813          }
5814       }
5815 
5816       assert(write != NULL);
5817 
5818       /* NOP it out, preserving its flow control. TODO: maybe DCE */
5819       if (write->flow) {
5820          bi_builder b = bi_init_builder(ctx, bi_before_instr(write));
5821          bi_instr *nop = bi_nop(&b);
5822          nop->flow = write->flow;
5823       }
5824 
5825       bi_remove_instruction(write);
5826 
5827       info->vs.no_psiz_offset = binary->size;
5828       bi_pack_valhall(ctx, binary);
5829    }
5830 
5831    ralloc_free(ctx);
5832 }
5833 
5834 /* Decide if Index-Driven Vertex Shading should be used for a given shader */
5835 static bool
5836 bi_should_idvs(nir_shader *nir, const struct panfrost_compile_inputs *inputs)
5837 {
5838    /* Opt-out */
5839    if (inputs->no_idvs || bifrost_debug & BIFROST_DBG_NOIDVS)
5840       return false;
5841 
5842    /* IDVS splits up vertex shaders, not defined on other shader stages */
5843    if (nir->info.stage != MESA_SHADER_VERTEX)
5844       return false;
5845 
5846    /* Bifrost cannot write gl_PointSize during IDVS */
5847    if ((inputs->gpu_id < 0x9000) &&
5848        nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ))
5849       return false;
5850 
5851    /* Otherwise, IDVS is usually better */
5852    return true;
5853 }
5854 
5855 void
5856 bifrost_compile_shader_nir(nir_shader *nir,
5857                            const struct panfrost_compile_inputs *inputs,
5858                            struct util_dynarray *binary,
5859                            struct pan_shader_info *info)
5860 {
5861    bifrost_debug = debug_get_option_bifrost_debug();
5862 
5863    /* Combine stores late, to give the driver a chance to lower dual-source
5864     * blending as regular store_output intrinsics.
5865     */
5866    NIR_PASS(_, nir, pan_nir_lower_zs_store);
5867 
5868    bi_optimize_nir(nir, inputs->gpu_id, inputs->is_blend);
5869 
5870    info->tls_size = nir->scratch_size;
5871    info->vs.idvs = bi_should_idvs(nir, inputs);
5872 
5873    pan_nir_collect_varyings(nir, info);
5874 
5875    if (info->vs.idvs) {
5876       bi_compile_variant(nir, inputs, binary, info, BI_IDVS_POSITION);
5877       bi_compile_variant(nir, inputs, binary, info, BI_IDVS_VARYING);
5878    } else {
5879       bi_compile_variant(nir, inputs, binary, info, BI_IDVS_NONE);
5880    }
5881 
5882    if (gl_shader_stage_is_compute(nir->info.stage)) {
5883       /* Workgroups may be merged if the structure of the workgroup is
5884        * not software visible. This is true if neither shared memory
5885        * nor barriers are used. The hardware may be able to optimize
5886        * compute shaders that set this flag.
5887        */
5888       info->cs.allow_merging_workgroups = (nir->info.shared_size == 0) &&
5889                                           !nir->info.uses_control_barrier &&
5890                                           !nir->info.uses_memory_barrier;
5891    }
5892 
5893    info->ubo_mask &= (1 << nir->info.num_ubos) - 1;
5894 }
5895