1 /*
2  * Copyright (C) 2020 Collabora Ltd.
3  * Copyright (C) 2022 Alyssa Rosenzweig <alyssa@rosenzweig.io>
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  *
24  * Authors (Collabora):
25  *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
26  */
27 
28 #include "compiler/glsl_types.h"
29 #include "compiler/glsl/glsl_to_nir.h"
30 #include "compiler/nir/nir_builder.h"
31 #include "util/u_debug.h"
32 
33 #include "bifrost/disassemble.h"
34 #include "valhall/disassemble.h"
35 #include "valhall/va_compiler.h"
36 #include "bi_builder.h"
37 #include "bi_quirks.h"
38 #include "bifrost_compile.h"
39 #include "bifrost_nir.h"
40 #include "compiler.h"
41 
42 /* clang-format off */
43 static const struct debug_named_value bifrost_debug_options[] = {
44    {"msgs",       BIFROST_DBG_MSGS,		   "Print debug messages"},
45    {"shaders",    BIFROST_DBG_SHADERS,	   "Dump shaders in NIR and MIR"},
46    {"shaderdb",   BIFROST_DBG_SHADERDB,	"Print statistics"},
47    {"verbose",    BIFROST_DBG_VERBOSE,	   "Disassemble verbosely"},
48    {"internal",   BIFROST_DBG_INTERNAL,	"Dump even internal shaders"},
49    {"nosched",    BIFROST_DBG_NOSCHED, 	"Force trivial bundling"},
50    {"nopsched",   BIFROST_DBG_NOPSCHED,   "Disable scheduling for pressure"},
51    {"inorder",    BIFROST_DBG_INORDER, 	"Force in-order bundling"},
52    {"novalidate", BIFROST_DBG_NOVALIDATE, "Skip IR validation"},
53    {"noopt",      BIFROST_DBG_NOOPT,      "Skip optimization passes"},
54    {"noidvs",     BIFROST_DBG_NOIDVS,     "Disable IDVS"},
55    {"nosb",       BIFROST_DBG_NOSB,       "Disable scoreboarding"},
56    {"nopreload",  BIFROST_DBG_NOPRELOAD,  "Disable message preloading"},
57    {"spill",      BIFROST_DBG_SPILL,      "Test register spilling"},
58    DEBUG_NAMED_VALUE_END
59 };
60 /* clang-format on */
61 
62 DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG",
63                             bifrost_debug_options, 0)
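/* Note: the flags above are read once from the BIFROST_MESA_DEBUG environment
 * variable, given as a list of the flag names, e.g. (assuming the usual
 * u_debug comma-separated syntax) BIFROST_MESA_DEBUG=shaders,shaderdb.
 */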
64 
65 /* How many bytes are prefetched by the Bifrost shader core. Starting from the
66  * final clause of the shader, this range must contain valid instructions or zeroes. */
67 #define BIFROST_SHADER_PREFETCH 128
68 
69 int bifrost_debug = 0;
70 
71 #define DBG(fmt, ...)                                                          \
72    do {                                                                        \
73       if (bifrost_debug & BIFROST_DBG_MSGS)                                    \
74          fprintf(stderr, "%s:%d: " fmt, __func__, __LINE__, ##__VA_ARGS__);    \
75    } while (0)
76 
77 static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);
78 
79 static bi_index
80 bi_preload(bi_builder *b, unsigned reg)
81 {
82    if (bi_is_null(b->shader->preloaded[reg])) {
83       /* Insert at the beginning of the shader */
84       bi_builder b_ = *b;
85       b_.cursor = bi_before_block(bi_start_block(&b->shader->blocks));
86 
87       /* Cache the result */
88       b->shader->preloaded[reg] = bi_mov_i32(&b_, bi_register(reg));
89    }
90 
91    return b->shader->preloaded[reg];
92 }
93 
94 static bi_index
95 bi_coverage(bi_builder *b)
96 {
97    if (bi_is_null(b->shader->coverage))
98       b->shader->coverage = bi_preload(b, 60);
99 
100    return b->shader->coverage;
101 }
102 
103 /*
104  * Vertex ID and Instance ID are preloaded registers. Where they are preloaded
105  * changed from Bifrost to Valhall. Provide helpers that smooth over the
106  * architectural difference.
107  */
108 static inline bi_index
109 bi_vertex_id(bi_builder *b)
110 {
111    return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61);
112 }
113 
114 static inline bi_index
115 bi_instance_id(bi_builder *b)
116 {
117    return bi_preload(b, (b->shader->arch >= 9) ? 61 : 62);
118 }
119 
120 static void
121 bi_emit_jump(bi_builder *b, nir_jump_instr *instr)
122 {
123    bi_instr *branch = bi_jump(b, bi_zero());
124 
125    switch (instr->type) {
126    case nir_jump_break:
127       branch->branch_target = b->shader->break_block;
128       break;
129    case nir_jump_continue:
130       branch->branch_target = b->shader->continue_block;
131       break;
132    default:
133       unreachable("Unhandled jump type");
134    }
135 
136    bi_block_add_successor(b->shader->current_block, branch->branch_target);
137    b->shader->current_block->unconditional_jumps = true;
138 }
139 
140 /* Builds a 64-bit hash table key for an index */
141 static uint64_t
142 bi_index_to_key(bi_index idx)
143 {
144    static_assert(sizeof(idx) <= sizeof(uint64_t), "too much padding");
145 
146    uint64_t key = 0;
147    memcpy(&key, &idx, sizeof(idx));
148    return key;
149 }
150 
151 /*
152  * Extract a single channel out of a vector source. We split vectors with SPLIT
153  * so we can use the split components directly, without emitting an extract.
154  * This has advantages for RA, as the split can usually be optimized away.
155  */
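/* For example, a load of a 32-bit vec2 goes through bi_emit_cached_split(),
 * which emits a SPLIT into two temporaries and records them in allocated_vec;
 * a later bi_extract(b, vec, 1) then simply returns the second temporary.
 */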
156 static bi_index
157 bi_extract(bi_builder *b, bi_index vec, unsigned channel)
158 {
159    bi_index *components = _mesa_hash_table_u64_search(b->shader->allocated_vec,
160                                                       bi_index_to_key(vec));
161 
162    /* No extract needed for scalars.
163     *
164     * This is a bit imprecise, but actual bugs (missing splits for vectors)
165     * should be caught by the following assertion. It is too difficult to
166     * ensure bi_extract is only called for real vectors.
167     */
168    if (components == NULL && channel == 0)
169       return vec;
170 
171    assert(components != NULL && "missing bi_cache_collect()");
172    return components[channel];
173 }
174 
175 static void
176 bi_cache_collect(bi_builder *b, bi_index dst, bi_index *s, unsigned n)
177 {
178    /* Lifetime of a hash table entry has to be at least as long as the table */
179    bi_index *channels = ralloc_array(b->shader, bi_index, n);
180    memcpy(channels, s, sizeof(bi_index) * n);
181 
182    _mesa_hash_table_u64_insert(b->shader->allocated_vec, bi_index_to_key(dst),
183                                channels);
184 }
185 
186 /*
187  * Splits an n-component vector (vec) into n scalar destinations (dests) using a
188  * split pseudo-instruction.
189  *
190  * Pre-condition: dests is filled with bi_null().
191  */
192 static void
193 bi_emit_split_i32(bi_builder *b, bi_index dests[4], bi_index vec, unsigned n)
194 {
195    /* Setup the destinations */
196    for (unsigned i = 0; i < n; ++i) {
197       dests[i] = bi_temp(b->shader);
198    }
199 
200    /* Emit the split */
201    if (n == 1) {
202       bi_mov_i32_to(b, dests[0], vec);
203    } else {
204       bi_instr *I = bi_split_i32_to(b, n, vec);
205 
206       bi_foreach_dest(I, j)
207          I->dest[j] = dests[j];
208    }
209 }
210 
211 static void
212 bi_emit_cached_split_i32(bi_builder *b, bi_index vec, unsigned n)
213 {
214    bi_index dests[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
215    bi_emit_split_i32(b, dests, vec, n);
216    bi_cache_collect(b, vec, dests, n);
217 }
218 
219 /*
220  * Emit and cache a split for a vector of a given bitsize. The vector may not be
221  * composed of 32-bit words, but it will be split at 32-bit word boundaries.
222  */
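/* For example, a v4f16 vector (64 bits) is split into two 32-bit words, while a
 * v4f32 vector (128 bits) is split into four.
 */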
223 static void
224 bi_emit_cached_split(bi_builder *b, bi_index vec, unsigned bits)
225 {
226    bi_emit_cached_split_i32(b, vec, DIV_ROUND_UP(bits, 32));
227 }
228 
229 static void
230 bi_split_def(bi_builder *b, nir_def *def)
231 {
232    bi_emit_cached_split(b, bi_def_index(def),
233                         def->bit_size * def->num_components);
234 }
235 
236 static bi_instr *
237 bi_emit_collect_to(bi_builder *b, bi_index dst, bi_index *chan, unsigned n)
238 {
239    /* Special case: COLLECT of a single value is a scalar move */
240    if (n == 1)
241       return bi_mov_i32_to(b, dst, chan[0]);
242 
243    bi_instr *I = bi_collect_i32_to(b, dst, n);
244 
245    bi_foreach_src(I, i)
246       I->src[i] = chan[i];
247 
248    bi_cache_collect(b, dst, chan, n);
249    return I;
250 }
251 
252 static bi_instr *
253 bi_collect_v2i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1)
254 {
255    return bi_emit_collect_to(b, dst, (bi_index[]){s0, s1}, 2);
256 }
257 
258 static bi_instr *
259 bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1,
260                     bi_index s2)
261 {
262    return bi_emit_collect_to(b, dst, (bi_index[]){s0, s1, s2}, 3);
263 }
264 
265 static bi_index
266 bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1)
267 {
268    bi_index dst = bi_temp(b->shader);
269    bi_collect_v2i32_to(b, dst, s0, s1);
270    return dst;
271 }
272 
273 static bi_index
274 bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
275 {
276    switch (intr->intrinsic) {
277    case nir_intrinsic_load_barycentric_centroid:
278    case nir_intrinsic_load_barycentric_sample:
279       return bi_preload(b, 61);
280 
281    /* Need to put the sample ID in the top 16-bits */
282    case nir_intrinsic_load_barycentric_at_sample:
283       return bi_mkvec_v2i16(b, bi_half(bi_dontcare(b), false),
284                             bi_half(bi_src_index(&intr->src[0]), false));
285 
286    /* Interpret as 8:8 signed fixed point positions in pixels along X and
287     * Y axes respectively, relative to top-left of pixel. In NIR, (0, 0)
288     * is the center of the pixel so we first fixup and then convert. For
289     * fp16 input:
290     *
291     * f2i16(((x, y) + (0.5, 0.5)) * 2**8) =
292     * f2i16((256 * (x, y)) + (128, 128)) =
293     * V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128))
294     *
295     * For fp32 input, that lacks enough precision for MSAA 16x, but the
296     * idea is the same. FIXME: still doesn't pass
297     */
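   /* Worked example: an offset of (0.25, -0.25) maps to
    * FMA((0.25, -0.25), 256, 128) = (192, 64), i.e. 0.75 and 0.25 of a pixel
    * from the top-left corner in 8:8 fixed point.
    */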
298    case nir_intrinsic_load_barycentric_at_offset: {
299       bi_index offset = bi_src_index(&intr->src[0]);
300       bi_index f16 = bi_null();
301       unsigned sz = nir_src_bit_size(intr->src[0]);
302 
303       if (sz == 16) {
304          f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0), bi_imm_f16(128.0));
305       } else {
306          assert(sz == 32);
307          bi_index f[2];
308          for (unsigned i = 0; i < 2; ++i) {
309             f[i] =
310                bi_fadd_rscale_f32(b, bi_extract(b, offset, i), bi_imm_f32(0.5),
311                                   bi_imm_u32(8), BI_SPECIAL_NONE);
312          }
313 
314          f16 = bi_v2f32_to_v2f16(b, f[0], f[1]);
315       }
316 
317       return bi_v2f16_to_v2s16(b, f16);
318    }
319 
320    case nir_intrinsic_load_barycentric_pixel:
321    default:
322       return b->shader->arch >= 9 ? bi_preload(b, 61) : bi_dontcare(b);
323    }
324 }
325 
326 static enum bi_sample
327 bi_interp_for_intrinsic(nir_intrinsic_op op)
328 {
329    switch (op) {
330    case nir_intrinsic_load_barycentric_centroid:
331       return BI_SAMPLE_CENTROID;
332    case nir_intrinsic_load_barycentric_sample:
333    case nir_intrinsic_load_barycentric_at_sample:
334       return BI_SAMPLE_SAMPLE;
335    case nir_intrinsic_load_barycentric_at_offset:
336       return BI_SAMPLE_EXPLICIT;
337    case nir_intrinsic_load_barycentric_pixel:
338    default:
339       return BI_SAMPLE_CENTER;
340    }
341 }
342 
343 /* auto, 64-bit omitted */
344 static enum bi_register_format
345 bi_reg_fmt_for_nir(nir_alu_type T)
346 {
347    switch (T) {
348    case nir_type_float16:
349       return BI_REGISTER_FORMAT_F16;
350    case nir_type_float32:
351       return BI_REGISTER_FORMAT_F32;
352    case nir_type_int16:
353       return BI_REGISTER_FORMAT_S16;
354    case nir_type_uint16:
355       return BI_REGISTER_FORMAT_U16;
356    case nir_type_int32:
357       return BI_REGISTER_FORMAT_S32;
358    case nir_type_uint32:
359       return BI_REGISTER_FORMAT_U32;
360    default:
361       unreachable("Invalid type for register format");
362    }
363 }
364 
365 /* Checks if the _IMM variant of an intrinsic can be used, returning in imm the
366  * immediate to be used (which applies even if _IMM can't be used) */
367 
368 static bool
369 bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate,
370                      unsigned max)
371 {
372    nir_src *offset = nir_get_io_offset_src(instr);
373 
374    if (!nir_src_is_const(*offset))
375       return false;
376 
377    *immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
378    return (*immediate) < max;
379 }
380 
381 static void bi_make_vec_to(bi_builder *b, bi_index final_dst, bi_index *src,
382                            unsigned *channel, unsigned count, unsigned bitsize);
383 
384 /* Bifrost's load instructions lack a component offset despite operating in
385  * terms of vec4 slots. Usually I/O vectorization avoids nonzero components,
386  * but they may be unavoidable with separate shaders in use. To solve this, we
387  * lower to a larger load and an explicit copy of the desired components. */
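/* For example, a scalar load of component 2 is lowered to a 3-component load
 * into a temporary, followed by a copy of the third channel into the real
 * destination (see bi_copy_component below).
 */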
388 
389 static void
390 bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp)
391 {
392    unsigned component = nir_intrinsic_component(instr);
393    unsigned nr = instr->num_components;
394    unsigned total = nr + component;
395    unsigned bitsize = instr->def.bit_size;
396 
397    assert(total <= 4 && "should be vec4");
398    bi_emit_cached_split(b, tmp, total * bitsize);
399 
400    if (component == 0)
401       return;
402 
403    bi_index srcs[] = {tmp, tmp, tmp};
404    unsigned channels[] = {component, component + 1, component + 2};
405 
406    bi_make_vec_to(b, bi_def_index(&instr->def), srcs, channels, nr,
407                   instr->def.bit_size);
408 }
409 
410 static void
411 bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
412 {
413    /* Disregard the signedness of an integer, since loading 32 bits into a
414     * 32-bit register is bit-exact and should not incur any clamping.
415     *
416     * If we are reading as a u32, then it must be paired with an integer (u32 or
417     * s32) source, so use .auto32 to disregard.
418     */
419    nir_alu_type T = nir_intrinsic_dest_type(instr);
420    assert(T == nir_type_uint32 || T == nir_type_int32 || T == nir_type_float32);
421    enum bi_register_format regfmt =
422       T == nir_type_float32 ? BI_REGISTER_FORMAT_F32 : BI_REGISTER_FORMAT_AUTO;
423 
424    nir_src *offset = nir_get_io_offset_src(instr);
425    unsigned component = nir_intrinsic_component(instr);
426    enum bi_vecsize vecsize = (instr->num_components + component - 1);
427    unsigned imm_index = 0;
428    unsigned base = nir_intrinsic_base(instr);
429    bool constant = nir_src_is_const(*offset);
430    bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
431    bi_index dest =
432       (component == 0) ? bi_def_index(&instr->def) : bi_temp(b->shader);
433    bi_instr *I;
434 
435    if (immediate) {
436       I = bi_ld_attr_imm_to(b, dest, bi_vertex_id(b), bi_instance_id(b), regfmt,
437                             vecsize, imm_index);
438    } else {
439       bi_index idx = bi_src_index(&instr->src[0]);
440 
441       if (constant)
442          idx = bi_imm_u32(imm_index);
443       else if (base != 0)
444          idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
445 
446       I = bi_ld_attr_to(b, dest, bi_vertex_id(b), bi_instance_id(b), idx,
447                         regfmt, vecsize);
448    }
449 
450    if (b->shader->arch >= 9)
451       I->table = PAN_TABLE_ATTRIBUTE;
452 
453    bi_copy_component(b, instr, dest);
454 }
455 
456 /*
457  * ABI: Special (desktop GL) slots come first, tightly packed. General varyings
458  * come later, sparsely packed. This handles both linked and separable shaders
459  * with a common code path, with minimal keying only for desktop GL. Each slot
460  * consumes 16 bytes (TODO: fp16, partial vectors).
461  */
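/* For example, with two special slots set in fixed_varying_mask, VAR0 starts at
 * byte 32 (= 16 * 2) and VAR1 at byte 48, while the special slots themselves
 * occupy bytes 0-31.
 */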
462 static unsigned
463 bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr)
464 {
465    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
466    uint32_t mask = ctx->inputs->fixed_varying_mask;
467 
468    if (sem.location >= VARYING_SLOT_VAR0) {
469       unsigned nr_special = util_bitcount(mask);
470       unsigned general_index = (sem.location - VARYING_SLOT_VAR0);
471 
472       return 16 * (nr_special + general_index);
473    } else {
474       return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location)));
475    }
476 }
477 
478 /*
479  * Compute the offset in bytes of a varying with an immediate offset, adding the
480  * offset to the base computed above. Convenience method.
481  */
482 static unsigned
483 bi_varying_offset(bi_context *ctx, nir_intrinsic_instr *intr)
484 {
485    nir_src *src = nir_get_io_offset_src(intr);
486    assert(nir_src_is_const(*src) && "assumes immediate offset");
487 
488    return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16);
489 }
490 
491 static void
492 bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
493 {
494    enum bi_sample sample = BI_SAMPLE_CENTER;
495    enum bi_update update = BI_UPDATE_STORE;
496    enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
497    bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input;
498    bi_index src0 = bi_null();
499 
500    unsigned component = nir_intrinsic_component(instr);
501    enum bi_vecsize vecsize = (instr->num_components + component - 1);
502    bi_index dest =
503       (component == 0) ? bi_def_index(&instr->def) : bi_temp(b->shader);
504 
505    unsigned sz = instr->def.bit_size;
506 
507    if (smooth) {
508       nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
509       assert(parent);
510 
511       sample = bi_interp_for_intrinsic(parent->intrinsic);
512       src0 = bi_varying_src0_for_barycentric(b, parent);
513 
514       assert(sz == 16 || sz == 32);
515       regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16 : BI_REGISTER_FORMAT_F32;
516    } else {
517       assert(sz == 32);
518       regfmt = BI_REGISTER_FORMAT_U32;
519 
520       /* Valhall can't have bi_null() here, although the source is
521        * logically unused for flat varyings
522        */
523       if (b->shader->arch >= 9)
524          src0 = bi_preload(b, 61);
525 
526       /* Gather info as we go */
527       b->shader->info.bifrost->uses_flat_shading = true;
528    }
529 
530    enum bi_source_format source_format =
531       smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32;
532 
533    nir_src *offset = nir_get_io_offset_src(instr);
534    unsigned imm_index = 0;
535    bool immediate = bi_is_intr_immediate(instr, &imm_index, 20);
536    bi_instr *I = NULL;
537 
538    if (b->shader->malloc_idvs && immediate) {
539       /* Immediate index given in bytes. */
540       bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
541                            update, vecsize,
542                            bi_varying_offset(b->shader, instr));
543    } else if (immediate && smooth) {
544       I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, vecsize,
545                            imm_index);
546    } else if (immediate && !smooth) {
547       I = bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, vecsize,
548                                 imm_index);
549    } else {
550       bi_index idx = bi_src_index(offset);
551       unsigned base = nir_intrinsic_base(instr);
552 
553       if (b->shader->malloc_idvs) {
554          /* Index needs to be in bytes, but NIR gives the index
555           * in slots. For now assume 16 bytes per element.
556           */
557          bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
558          unsigned vbase = bi_varying_base_bytes(b->shader, instr);
559 
560          if (vbase != 0)
561             idx_bytes = bi_iadd_u32(b, idx, bi_imm_u32(vbase), false);
562 
563          bi_ld_var_buf_to(b, sz, dest, src0, idx_bytes, regfmt, sample,
564                           source_format, update, vecsize);
565       } else if (smooth) {
566          if (base != 0)
567             idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
568 
569          I = bi_ld_var_to(b, dest, src0, idx, regfmt, sample, update, vecsize);
570       } else {
571          if (base != 0)
572             idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
573 
574          I = bi_ld_var_flat_to(b, dest, idx, BI_FUNCTION_NONE, regfmt, vecsize);
575       }
576    }
577 
578    /* Valhall usually uses machine-allocated IDVS. If this is disabled, use
579     * a simple Midgard-style ABI.
580     */
581    if (b->shader->arch >= 9 && I != NULL)
582       I->table = PAN_TABLE_ATTRIBUTE;
583 
584    bi_copy_component(b, instr, dest);
585 }
586 
587 static bi_index
588 bi_make_vec8_helper(bi_builder *b, bi_index *src, unsigned *channel,
589                     unsigned count)
590 {
591    assert(1 <= count && count <= 4);
592 
593    bi_index bytes[4] = {bi_imm_u8(0), bi_imm_u8(0), bi_imm_u8(0), bi_imm_u8(0)};
594 
595    for (unsigned i = 0; i < count; ++i) {
596       unsigned chan = channel ? channel[i] : 0;
597 
598       bytes[i] = bi_byte(bi_extract(b, src[i], chan >> 2), chan & 3);
599    }
600 
601    if (b->shader->arch >= 9) {
602       bi_index vec = bi_zero();
603 
604       if (count >= 3)
605          vec = bi_mkvec_v2i8(b, bytes[2], bytes[3], vec);
606 
607       return bi_mkvec_v2i8(b, bytes[0], bytes[1], vec);
608    } else {
609       return bi_mkvec_v4i8(b, bytes[0], bytes[1], bytes[2], bytes[3]);
610    }
611 }
612 
613 static bi_index
614 bi_make_vec16_helper(bi_builder *b, bi_index *src, unsigned *channel,
615                      unsigned count)
616 {
617    unsigned chan0 = channel ? channel[0] : 0;
618    bi_index w0 = bi_extract(b, src[0], chan0 >> 1);
619    bi_index h0 = bi_half(w0, chan0 & 1);
620 
621    /* Zero extend */
622    if (count == 1)
623       return bi_mkvec_v2i16(b, h0, bi_imm_u16(0));
624 
625    /* Else, create a vector */
626    assert(count == 2);
627 
628    unsigned chan1 = channel ? channel[1] : 0;
629    bi_index w1 = bi_extract(b, src[1], chan1 >> 1);
630    bi_index h1 = bi_half(w1, chan1 & 1);
631 
632    if (bi_is_word_equiv(w0, w1) && (chan0 & 1) == 0 && ((chan1 & 1) == 1))
633       return bi_mov_i32(b, w0);
634    else if (bi_is_word_equiv(w0, w1))
635       return bi_swz_v2i16(b, bi_swz_16(w0, chan0 & 1, chan1 & 1));
636    else
637       return bi_mkvec_v2i16(b, h0, h1);
638 }
639 
640 static void
641 bi_make_vec_to(bi_builder *b, bi_index dst, bi_index *src, unsigned *channel,
642                unsigned count, unsigned bitsize)
643 {
644    assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
645    unsigned shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
646    unsigned chan_per_word = 1 << shift;
647 
648    assert(DIV_ROUND_UP(count * bitsize, 32) <= BI_MAX_SRCS &&
649           "unnecessarily large vector should have been lowered");
650 
651    bi_index srcs[BI_MAX_VEC];
652 
653    for (unsigned i = 0; i < count; i += chan_per_word) {
654       unsigned rem = MIN2(count - i, chan_per_word);
655       unsigned *channel_offset = channel ? (channel + i) : NULL;
656 
657       if (bitsize == 32)
658          srcs[i] = bi_extract(b, src[i], channel_offset ? *channel_offset : 0);
659       else if (bitsize == 16)
660          srcs[i >> 1] = bi_make_vec16_helper(b, src + i, channel_offset, rem);
661       else
662          srcs[i >> 2] = bi_make_vec8_helper(b, src + i, channel_offset, rem);
663    }
664 
665    bi_emit_collect_to(b, dst, srcs, DIV_ROUND_UP(count, chan_per_word));
666 }
667 
668 static inline bi_instr *
669 bi_load_ubo_to(bi_builder *b, unsigned bitsize, bi_index dest0, bi_index src0,
670                bi_index src1)
671 {
672    bi_instr *I;
673 
674    if (b->shader->arch >= 9) {
675       I = bi_ld_buffer_to(b, bitsize, dest0, src0, src1);
676       I->seg = BI_SEG_UBO;
677    } else {
678       I = bi_load_to(b, bitsize, dest0, src0, src1, BI_SEG_UBO, 0);
679    }
680 
681    bi_emit_cached_split(b, dest0, bitsize);
682    return I;
683 }
684 
685 static void
686 bi_load_sample_id_to(bi_builder *b, bi_index dst)
687 {
688    /* r61[16:23] contains the sampleID, mask it out. Upper bits
689     * seem to read garbage (despite being architecturally defined
690     * as zero), so use a 5-bit mask instead of 8-bits */
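   /* i.e. dst = (r61 >> 16) & 0x1f */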
691 
692    bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f),
693                         bi_imm_u8(16), false);
694 }
695 
696 static bi_index
697 bi_load_sample_id(bi_builder *b)
698 {
699    bi_index sample_id = bi_temp(b->shader);
700    bi_load_sample_id_to(b, sample_id);
701    return sample_id;
702 }
703 
704 static bi_index
705 bi_pixel_indices(bi_builder *b, unsigned rt)
706 {
707    /* We want to load the current pixel. */
708    struct bifrost_pixel_indices pix = {.y = BIFROST_CURRENT_PIXEL, .rt = rt};
709 
710    uint32_t indices_u32 = 0;
711    memcpy(&indices_u32, &pix, sizeof(indices_u32));
712    bi_index indices = bi_imm_u32(indices_u32);
713 
714    /* Sample index above is left as zero. For multisampling, we need to
715     * fill in the actual sample ID in the lower byte */
716 
717    if (b->shader->inputs->blend.nr_samples > 1)
718       indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false);
719 
720    return indices;
721 }
722 
723 /* Source color is passed through r0-r3, or r4-r7 for the second source when
724  * dual-source blending. Preload the corresponding vector.
725  */
726 static void
727 bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr)
728 {
729    nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
730    unsigned base = (sem.location == VARYING_SLOT_VAR0) ? 4 : 0;
731    unsigned size = nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr));
732    assert(size == 16 || size == 32);
733 
734    bi_index srcs[] = {bi_preload(b, base + 0), bi_preload(b, base + 1),
735                       bi_preload(b, base + 2), bi_preload(b, base + 3)};
736 
737    bi_emit_collect_to(b, bi_def_index(&instr->def), srcs, size == 32 ? 4 : 2);
738 }
739 
740 static void
741 bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, bi_index rgba2,
742                  nir_alu_type T2, unsigned rt)
743 {
744    /* Reads 2 or 4 staging registers to cover the input */
745    unsigned size = nir_alu_type_get_type_size(T);
746    unsigned size_2 = nir_alu_type_get_type_size(T2);
747    unsigned sr_count = (size <= 16) ? 2 : 4;
748    unsigned sr_count_2 = (size_2 <= 16) ? 2 : 4;
749    const struct panfrost_compile_inputs *inputs = b->shader->inputs;
750    uint64_t blend_desc = inputs->blend.bifrost_blend_desc;
751    enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
752 
753    /* Workaround for NIR-to-TGSI */
754    if (b->shader->nir->info.fs.untyped_color_outputs)
755       regfmt = BI_REGISTER_FORMAT_AUTO;
756 
757    if (inputs->is_blend && inputs->blend.nr_samples > 1) {
758       /* Conversion descriptor comes from the compile inputs, pixel
759        * indices derived at run time based on sample ID */
760       bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_coverage(b),
761                  bi_imm_u32(blend_desc >> 32), regfmt, BI_VECSIZE_V4);
762    } else if (b->shader->inputs->is_blend) {
763       uint64_t blend_desc = b->shader->inputs->blend.bifrost_blend_desc;
764 
765       /* Blend descriptor comes from the compile inputs */
766       /* Put the result in r0 */
767 
768       bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b),
769                   bi_imm_u32(blend_desc), bi_imm_u32(blend_desc >> 32),
770                   bi_null(), regfmt, sr_count, 0);
771    } else {
772       /* Blend descriptor comes from the FAU RAM. By convention, the
773        * return address on Bifrost is stored in r48 and will be used
774        * by the blend shader to jump back to the fragment shader */
775 
776       bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b),
777                   bi_fau(BIR_FAU_BLEND_0 + rt, false),
778                   bi_fau(BIR_FAU_BLEND_0 + rt, true), rgba2, regfmt, sr_count,
779                   sr_count_2);
780    }
781 
782    assert(rt < 8);
783    b->shader->info.bifrost->blend[rt].type = T;
784 
785    if (T2)
786       b->shader->info.bifrost->blend_src1_type = T2;
787 }
788 
789 /* Blend shaders do not need to run ATEST since they are dependent on a
790  * fragment shader that runs it. Blit shaders may not need to run ATEST, since
791  * ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and
792  * there are no writes to the coverage mask. The latter two are satisfied for
793  * all blit shaders, so we just care about early-z, which blit shaders force
794  * iff they do not write depth or stencil */
795 
796 static bool
797 bi_skip_atest(bi_context *ctx, bool emit_zs)
798 {
799    return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend;
800 }
801 
802 static void
803 bi_emit_atest(bi_builder *b, bi_index alpha)
804 {
805    b->shader->coverage =
806       bi_atest(b, bi_coverage(b), alpha, bi_fau(BIR_FAU_ATEST_PARAM, false));
807    b->shader->emitted_atest = true;
808 }
809 
810 static void
811 bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)
812 {
813    bool combined = instr->intrinsic == nir_intrinsic_store_combined_output_pan;
814 
815    unsigned writeout =
816       combined ? nir_intrinsic_component(instr) : PAN_WRITEOUT_C;
817 
818    bool emit_blend = writeout & (PAN_WRITEOUT_C);
819    bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S);
820 
821    unsigned loc = nir_intrinsic_io_semantics(instr).location;
822    bi_index src0 = bi_src_index(&instr->src[0]);
823 
824    /* By ISA convention, the coverage mask is stored in R60. The store
825     * itself will be handled by a subsequent ATEST instruction */
826    if (loc == FRAG_RESULT_SAMPLE_MASK) {
827       b->shader->coverage = bi_extract(b, src0, 0);
828       return;
829    }
830 
831    /* Emit ATEST if we have to, note ATEST requires a floating-point alpha
832     * value, but render target #0 might not be floating point. However the
833     * alpha value is only used for alpha-to-coverage, a stage which is
834     * skipped for pure integer framebuffers, so the issue is moot. */
835 
836    if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) {
837       nir_alu_type T = nir_intrinsic_src_type(instr);
838 
839       bi_index rgba = bi_src_index(&instr->src[0]);
840       bi_index alpha = (T == nir_type_float16)
841                           ? bi_half(bi_extract(b, rgba, 1), true)
842                        : (T == nir_type_float32) ? bi_extract(b, rgba, 3)
843                                                  : bi_dontcare(b);
844 
845       /* Don't read out-of-bounds */
846       if (nir_src_num_components(instr->src[0]) < 4)
847          alpha = bi_imm_f32(1.0);
848 
849       bi_emit_atest(b, alpha);
850    }
851 
852    if (emit_zs) {
853       bi_index z = bi_dontcare(b), s = bi_dontcare(b);
854 
855       if (writeout & PAN_WRITEOUT_Z)
856          z = bi_src_index(&instr->src[2]);
857 
858       if (writeout & PAN_WRITEOUT_S)
859          s = bi_src_index(&instr->src[3]);
860 
861       b->shader->coverage =
862          bi_zs_emit(b, z, s, bi_coverage(b), writeout & PAN_WRITEOUT_S,
863                     writeout & PAN_WRITEOUT_Z);
864    }
865 
866    if (emit_blend) {
867       unsigned rt = loc ? (loc - FRAG_RESULT_DATA0) : 0;
868       bool dual = (writeout & PAN_WRITEOUT_2);
869       bi_index color = bi_src_index(&instr->src[0]);
870       bi_index color2 = dual ? bi_src_index(&instr->src[4]) : bi_null();
871       nir_alu_type T2 = dual ? nir_intrinsic_dest_type(instr) : 0;
872 
873       /* Explicit copy since BLEND inputs are precoloured to R0-R3,
874        * TODO: maybe schedule around this or implement in RA as a
875        * spill */
876       bool has_mrt =
877          (b->shader->nir->info.outputs_written >> FRAG_RESULT_DATA1);
878 
879       if (has_mrt) {
880          bi_index srcs[4] = {color, color, color, color};
881          unsigned channels[4] = {0, 1, 2, 3};
882          color = bi_temp(b->shader);
883          bi_make_vec_to(
884             b, color, srcs, channels, nir_src_num_components(instr->src[0]),
885             nir_alu_type_get_type_size(nir_intrinsic_src_type(instr)));
886       }
887 
888       bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr), color2, T2, rt);
889    }
890 
891    if (b->shader->inputs->is_blend) {
892       /* Jump back to the fragment shader, return address is stored
893        * in r48 (see above). On Valhall, only jump if the address is
894        * nonzero. The check is free there and it implements the "jump
895        * to 0 terminates the blend shader" that's automatic on
896        * Bifrost.
897        */
898       if (b->shader->arch >= 8)
899          bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE);
900       else
901          bi_jump(b, bi_preload(b, 48));
902    }
903 }
904 
905 /**
906  * In a vertex shader, is the specified variable a position output? These kinds
907  * of outputs are written from position shaders when IDVS is enabled. All other
908  * outputs are written from the varying shader.
909  */
910 static bool
911 bi_should_remove_store(nir_intrinsic_instr *intr, enum bi_idvs_mode idvs)
912 {
913    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
914 
915    switch (sem.location) {
916    case VARYING_SLOT_POS:
917    case VARYING_SLOT_PSIZ:
918       return idvs == BI_IDVS_VARYING;
919    default:
920       return idvs == BI_IDVS_POSITION;
921    }
922 }
923 
924 static bool
925 bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data)
926 {
927    enum bi_idvs_mode *idvs = data;
928 
929    if (instr->type != nir_instr_type_intrinsic)
930       return false;
931 
932    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
933 
934    if (intr->intrinsic != nir_intrinsic_store_output)
935       return false;
936 
937    if (bi_should_remove_store(intr, *idvs)) {
938       nir_instr_remove(instr);
939       return true;
940    }
941 
942    return false;
943 }
944 
945 static void
946 bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
947 {
948    /* In principle we can do better for 16-bit. At the moment we require
949     * 32-bit to permit the use of .auto, in order to force .u32 for flat
950     * varyings, to handle internal TGSI shaders that set flat in the VS
951     * but smooth in the FS */
952 
953    ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr);
954    ASSERTED unsigned T_size = nir_alu_type_get_type_size(T);
955    assert(T_size == 32 || (b->shader->arch >= 9 && T_size == 16));
956    enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
957 
958    unsigned imm_index = 0;
959    bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
960 
961    /* Only look at the total components needed. In effect, we fill in all
962     * the intermediate "holes" in the write mask, since we can't mask off
963     * stores. Since nir_lower_io_to_temporaries ensures each varying is
964     * written at most once, anything that's masked out is undefined, so it
965     * doesn't matter what we write there. So we may as well do the
966     * simplest thing possible. */
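   /* For example, a write mask of 0b0101 gives nr = 3, so components 0-2 are
    * stored and the "hole" at component 1 carries an undefined value. */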
967    unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr));
968    assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0));
969 
970    bi_index data = bi_src_index(&instr->src[0]);
971 
972    /* To keep the vector dimensions consistent, we need to drop some
973     * components. This should be coalesced.
974     *
975     * TODO: This is ugly and maybe inefficient. Would we rather
976     * introduce a TRIM.i32 pseudoinstruction?
977     */
978    if (nr < nir_intrinsic_src_components(instr, 0)) {
979       assert(T_size == 32 && "todo: 16-bit trim");
980 
981       bi_index chans[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
982       unsigned src_comps = nir_intrinsic_src_components(instr, 0);
983 
984       bi_emit_split_i32(b, chans, data, src_comps);
985 
986       bi_index tmp = bi_temp(b->shader);
987       bi_instr *collect = bi_collect_i32_to(b, tmp, nr);
988 
989       bi_foreach_src(collect, w)
990          collect->src[w] = chans[w];
991 
992       data = tmp;
993    }
994 
995    bool psiz =
996       (nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PSIZ);
997 
998    bi_index a[4] = {bi_null()};
999 
1000    if (b->shader->arch <= 8 && b->shader->idvs == BI_IDVS_POSITION) {
1001       /* Bifrost position shaders have a fast path */
1002       assert(T == nir_type_float16 || T == nir_type_float32);
1003       unsigned regfmt = (T == nir_type_float16) ? 0 : 1;
1004       unsigned identity = (b->shader->arch == 6) ? 0x688 : 0;
1005       unsigned snap4 = 0x5E;
1006       uint32_t format = identity | (snap4 << 12) | (regfmt << 24);
1007 
1008       bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59),
1009                 bi_imm_u32(format), regfmt, nr - 1);
1010    } else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) {
1011       bi_index index = bi_preload(b, 59);
1012 
1013       if (psiz) {
1014          assert(T_size == 16 && "should've been lowered");
1015          index = bi_iadd_imm_i32(b, index, 4);
1016       }
1017 
1018       bi_index address = bi_lea_buf_imm(b, index);
1019       bi_emit_split_i32(b, a, address, 2);
1020 
1021       bool varying = (b->shader->idvs == BI_IDVS_VARYING);
1022 
1023       bi_store(b, nr * nir_src_bit_size(instr->src[0]), data, a[0], a[1],
1024                varying ? BI_SEG_VARY : BI_SEG_POS,
1025                varying ? bi_varying_offset(b->shader, instr) : 0);
1026    } else if (immediate) {
1027       bi_index address = bi_lea_attr_imm(b, bi_vertex_id(b), bi_instance_id(b),
1028                                          regfmt, imm_index);
1029       bi_emit_split_i32(b, a, address, 3);
1030 
1031       bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
1032    } else {
1033       bi_index idx = bi_iadd_u32(b, bi_src_index(nir_get_io_offset_src(instr)),
1034                                  bi_imm_u32(nir_intrinsic_base(instr)), false);
1035       bi_index address =
1036          bi_lea_attr(b, bi_vertex_id(b), bi_instance_id(b), idx, regfmt);
1037       bi_emit_split_i32(b, a, address, 3);
1038 
1039       bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
1040    }
1041 }
1042 
1043 static void
1044 bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
1045 {
1046    nir_src *offset = nir_get_io_offset_src(instr);
1047 
1048    bool offset_is_const = nir_src_is_const(*offset);
1049    bi_index dyn_offset = bi_src_index(offset);
1050    uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0;
1051 
1052    bi_load_ubo_to(b, instr->num_components * instr->def.bit_size,
1053                   bi_def_index(&instr->def),
1054                   offset_is_const ? bi_imm_u32(const_offset) : dyn_offset,
1055                   bi_src_index(&instr->src[0]));
1056 }
1057 
1058 static void
1059 bi_emit_load_push_constant(bi_builder *b, nir_intrinsic_instr *instr)
1060 {
1061    assert(b->shader->inputs->no_ubo_to_push && "can't mix push constant forms");
1062 
1063    nir_src *offset = &instr->src[0];
1064    assert(nir_src_is_const(*offset) && "no indirect push constants");
1065    uint32_t base = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
1066    assert((base & 3) == 0 && "unaligned push constants");
1067 
1068    unsigned bits = instr->def.bit_size * instr->def.num_components;
1069 
1070    unsigned n = DIV_ROUND_UP(bits, 32);
1071    assert(n <= 4);
1072    bi_index channels[4] = {bi_null()};
1073 
1074    for (unsigned i = 0; i < n; ++i) {
1075       unsigned word = (base >> 2) + i;
1076 
1077       channels[i] = bi_fau(BIR_FAU_UNIFORM | (word >> 1), word & 1);
1078    }
1079 
1080    bi_emit_collect_to(b, bi_def_index(&instr->def), channels, n);
1081 }
1082 
1083 static bi_index
1084 bi_addr_high(bi_builder *b, nir_src *src)
1085 {
1086    return (nir_src_bit_size(*src) == 64) ? bi_extract(b, bi_src_index(src), 1)
1087                                          : bi_zero();
1088 }
1089 
1090 static void
1091 bi_handle_segment(bi_builder *b, bi_index *addr_lo, bi_index *addr_hi,
1092                   enum bi_seg seg, int16_t *offset)
1093 {
1094    /* Not needed on Bifrost or for global accesses */
1095    if (b->shader->arch < 9 || seg == BI_SEG_NONE)
1096       return;
1097 
1098    /* There is no segment modifier on Valhall. Instead, we need to
1099     * emit the arithmetic ourselves. We do have an offset
1100     * available, which saves an instruction for constant offsets.
1101     */
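   /* For example, a WLS access at constant address 8 becomes an access from the
    * WLS base pointer FAU with an immediate offset of 8, while a dynamic
    * address is instead added to the base with a 32-bit IADD below.
    */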
1102    bool wls = (seg == BI_SEG_WLS);
1103    assert(wls || (seg == BI_SEG_TL));
1104 
1105    enum bir_fau fau = wls ? BIR_FAU_WLS_PTR : BIR_FAU_TLS_PTR;
1106 
1107    bi_index base_lo = bi_fau(fau, false);
1108 
1109    if (offset && addr_lo->type == BI_INDEX_CONSTANT &&
1110        addr_lo->value == (int16_t)addr_lo->value) {
1111       *offset = addr_lo->value;
1112       *addr_lo = base_lo;
1113    } else {
1114       *addr_lo = bi_iadd_u32(b, base_lo, *addr_lo, false);
1115    }
1116 
1117    /* Do not allow overflow for WLS or TLS */
1118    *addr_hi = bi_fau(fau, true);
1119 }
1120 
1121 static void
1122 bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
1123 {
1124    int16_t offset = 0;
1125    unsigned bits = instr->num_components * instr->def.bit_size;
1126    bi_index dest = bi_def_index(&instr->def);
1127    bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[0]), 0);
1128    bi_index addr_hi = bi_addr_high(b, &instr->src[0]);
1129 
1130    bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
1131 
1132    bi_load_to(b, bits, dest, addr_lo, addr_hi, seg, offset);
1133    bi_emit_cached_split(b, dest, bits);
1134 }
1135 
1136 static void
1137 bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
1138 {
1139    /* Require contiguous masks, guaranteed by nir_lower_wrmasks */
1140    assert(nir_intrinsic_write_mask(instr) ==
1141           BITFIELD_MASK(instr->num_components));
1142 
1143    int16_t offset = 0;
1144    bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[1]), 0);
1145    bi_index addr_hi = bi_addr_high(b, &instr->src[1]);
1146 
1147    bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
1148 
1149    bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]),
1150             bi_src_index(&instr->src[0]), addr_lo, addr_hi, seg, offset);
1151 }
1152 
1153 /* Exchanges the staging register with memory */
1154 
1155 static void
1156 bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg,
1157                  enum bi_seg seg)
1158 {
1159    assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
1160 
1161    unsigned sz = nir_src_bit_size(*arg);
1162    assert(sz == 32 || sz == 64);
1163 
1164    bi_index data = bi_src_index(arg);
1165 
1166    bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
1167 
1168    if (b->shader->arch >= 9)
1169       bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
1170    else if (seg == BI_SEG_WLS)
1171       addr_hi = bi_zero();
1172 
1173    bi_axchg_to(b, sz, dst, data, bi_extract(b, addr, 0), addr_hi, seg);
1174 }
1175 
1176 /* Exchanges the second staging register with memory if comparison with first
1177  * staging register passes */
1178 
1179 static void
1180 bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1,
1181                     nir_src *arg_2, enum bi_seg seg)
1182 {
1183    assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
1184 
1185    /* hardware is swapped from NIR */
1186    bi_index src0 = bi_src_index(arg_2);
1187    bi_index src1 = bi_src_index(arg_1);
1188 
1189    unsigned sz = nir_src_bit_size(*arg_1);
1190    assert(sz == 32 || sz == 64);
1191 
1192    bi_index data_words[] = {
1193       bi_extract(b, src0, 0),
1194       sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src0, 1),
1195 
1196       /* 64-bit */
1197       bi_extract(b, src1, 0),
1198       sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src1, 1),
1199    };
1200 
1201    bi_index in = bi_temp(b->shader);
1202    bi_emit_collect_to(b, in, data_words, 2 * (sz / 32));
1203    bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
1204 
1205    if (b->shader->arch >= 9)
1206       bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
1207    else if (seg == BI_SEG_WLS)
1208       addr_hi = bi_zero();
1209 
1210    bi_index out = bi_acmpxchg(b, sz, in, bi_extract(b, addr, 0), addr_hi, seg);
1211    bi_emit_cached_split(b, out, sz);
1212 
1213    bi_index inout_words[] = {bi_extract(b, out, 0),
1214                              sz == 64 ? bi_extract(b, out, 1) : bi_null()};
1215 
1216    bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
1217 }
1218 
1219 static enum bi_atom_opc
1220 bi_atom_opc_for_nir(nir_atomic_op op)
1221 {
1222    /* clang-format off */
1223    switch (op) {
1224    case nir_atomic_op_iadd: return BI_ATOM_OPC_AADD;
1225    case nir_atomic_op_imin: return BI_ATOM_OPC_ASMIN;
1226    case nir_atomic_op_umin: return BI_ATOM_OPC_AUMIN;
1227    case nir_atomic_op_imax: return BI_ATOM_OPC_ASMAX;
1228    case nir_atomic_op_umax: return BI_ATOM_OPC_AUMAX;
1229    case nir_atomic_op_iand: return BI_ATOM_OPC_AAND;
1230    case nir_atomic_op_ior:  return BI_ATOM_OPC_AOR;
1231    case nir_atomic_op_ixor: return BI_ATOM_OPC_AXOR;
1232    default: unreachable("Unexpected computational atomic");
1233    }
1234    /* clang-format on */
1235 }
1236 
1237 /* Optimized unary atomics are available with an implied #1 argument */
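/* For example, an atomic add of +1 is promoted to AINC and an add of -1 to
 * ADEC; likewise AOR with a constant 1 argument becomes AOR1. */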
1238 
1239 static bool
1240 bi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out)
1241 {
1242    /* Check we have a compatible constant */
1243    if (arg.type != BI_INDEX_CONSTANT)
1244       return false;
1245 
1246    if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD)))
1247       return false;
1248 
1249    /* Check for a compatible operation */
1250    switch (op) {
1251    case BI_ATOM_OPC_AADD:
1252       *out = (arg.value == 1) ? BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC;
1253       return true;
1254    case BI_ATOM_OPC_ASMAX:
1255       *out = BI_ATOM_OPC_ASMAX1;
1256       return true;
1257    case BI_ATOM_OPC_AUMAX:
1258       *out = BI_ATOM_OPC_AUMAX1;
1259       return true;
1260    case BI_ATOM_OPC_AOR:
1261       *out = BI_ATOM_OPC_AOR1;
1262       return true;
1263    default:
1264       return false;
1265    }
1266 }
1267 
1268 /*
1269  * Coordinates are 16-bit integers in Bifrost but 32-bit in NIR. We need to
1270  * translate between these forms (with MKVEC.v2i16).
1271  *
1272  * Additionally, on Valhall, cube maps in the attribute pipe are treated as 2D
1273  * arrays.  For uniform handling, we also treat 3D textures like 2D arrays.
1274  *
1275  * Our indexing needs to reflect this.
1276  */
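/* For example, a 2D array image packs (x, y) into a v2i16 for the first source;
 * the second source carries the layer, as (0, layer) in a v2i16 on Valhall or
 * as a full 32-bit word on Bifrost.
 */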
1277 static bi_index
1278 bi_emit_image_coord(bi_builder *b, bi_index coord, unsigned src_idx,
1279                     unsigned coord_comps, bool is_array)
1280 {
1281    assert(coord_comps > 0 && coord_comps <= 3);
1282 
1283    if (src_idx == 0) {
1284       if (coord_comps == 1 || (coord_comps == 2 && is_array))
1285          return bi_extract(b, coord, 0);
1286       else
1287          return bi_mkvec_v2i16(b, bi_half(bi_extract(b, coord, 0), false),
1288                                bi_half(bi_extract(b, coord, 1), false));
1289    } else {
1290       if (coord_comps == 3 && b->shader->arch >= 9)
1291          return bi_mkvec_v2i16(b, bi_imm_u16(0),
1292                                bi_half(bi_extract(b, coord, 2), false));
1293       else if (coord_comps == 2 && is_array && b->shader->arch >= 9)
1294          return bi_mkvec_v2i16(b, bi_imm_u16(0),
1295                                bi_half(bi_extract(b, coord, 1), false));
1296       else if (coord_comps == 3)
1297          return bi_extract(b, coord, 2);
1298       else if (coord_comps == 2 && is_array)
1299          return bi_extract(b, coord, 1);
1300       else
1301          return bi_zero();
1302    }
1303 }
1304 
1305 static bi_index
1306 bi_emit_image_index(bi_builder *b, nir_intrinsic_instr *instr)
1307 {
1308    nir_src src = instr->src[0];
1309    bi_index index = bi_src_index(&src);
1310    bi_context *ctx = b->shader;
1311 
1312    /* Images come after vertex attributes, so handle an explicit offset */
1313    unsigned offset = (ctx->stage == MESA_SHADER_VERTEX)
1314                         ? util_bitcount64(ctx->nir->info.inputs_read)
1315                         : 0;
1316 
1317    if (offset == 0)
1318       return index;
1319    else if (nir_src_is_const(src))
1320       return bi_imm_u32(nir_src_as_uint(src) + offset);
1321    else
1322       return bi_iadd_u32(b, index, bi_imm_u32(offset), false);
1323 }
1324 
1325 static void
1326 bi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr)
1327 {
1328    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
1329    unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
1330    bool array = nir_intrinsic_image_array(instr);
1331 
1332    bi_index coords = bi_src_index(&instr->src[1]);
1333    bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array);
1334    bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array);
1335    bi_index dest = bi_def_index(&instr->def);
1336    enum bi_register_format regfmt =
1337       bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr));
1338    enum bi_vecsize vecsize = instr->num_components - 1;
1339 
1340    assert(dim != GLSL_SAMPLER_DIM_MS && "MSAA'd image not lowered");
1341 
1342    if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
1343       bi_instr *I = bi_ld_tex_imm_to(b, dest, xy, zw, regfmt, vecsize,
1344                                      nir_src_as_uint(instr->src[0]));
1345 
1346       I->table = PAN_TABLE_IMAGE;
1347    } else if (b->shader->arch >= 9) {
1348       unreachable("Indirect images on Valhall not yet supported");
1349    } else {
1350       bi_ld_attr_tex_to(b, dest, xy, zw, bi_emit_image_index(b, instr), regfmt,
1351                         vecsize);
1352    }
1353 
1354    bi_split_def(b, &instr->def);
1355 }
1356 
1357 static void
1358 bi_emit_lea_image_to(bi_builder *b, bi_index dest, nir_intrinsic_instr *instr)
1359 {
1360    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
1361    bool array = nir_intrinsic_image_array(instr);
1362    unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
1363 
1364    assert(dim != GLSL_SAMPLER_DIM_MS && "MSAA'd image not lowered");
1365 
1366    enum bi_register_format type =
1367       (instr->intrinsic == nir_intrinsic_image_store)
1368          ? bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr))
1369          : BI_REGISTER_FORMAT_AUTO;
1370 
1371    bi_index coords = bi_src_index(&instr->src[1]);
1372    bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array);
1373    bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array);
1374 
1375    if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
1376       bi_instr *I = bi_lea_tex_imm_to(b, dest, xy, zw, false,
1377                                       nir_src_as_uint(instr->src[0]));
1378 
1379       I->table = PAN_TABLE_IMAGE;
1380    } else if (b->shader->arch >= 9) {
1381       unreachable("Indirect images on Valhall not yet supported");
1382    } else {
1383       bi_instr *I = bi_lea_attr_tex_to(b, dest, xy, zw,
1384                                        bi_emit_image_index(b, instr), type);
1385 
1386       /* LEA_ATTR_TEX defaults to the secondary attribute table, but
1387        * our ABI has all images in the primary attribute table
1388        */
1389       I->table = BI_TABLE_ATTRIBUTE_1;
1390    }
1391 
1392    bi_emit_cached_split(b, dest, 3 * 32);
1393 }
1394 
1395 static bi_index
1396 bi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr)
1397 {
1398    bi_index dest = bi_temp(b->shader);
1399    bi_emit_lea_image_to(b, dest, instr);
1400    return dest;
1401 }
1402 
1403 static void
1404 bi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr)
1405 {
1406    bi_index a[4] = {bi_null()};
1407    bi_emit_split_i32(b, a, bi_emit_lea_image(b, instr), 3);
1408 
1409    /* Due to SPIR-V limitations, the source type is not fully reliable: it
1410     * reports uint32 even for write_imagei. This causes an incorrect
1411     * u32->s32->u32 roundtrip which incurs an unwanted clamping. Use auto32
1412     * instead, which will match per the OpenCL spec. Of course this does
1413     * not work for 16-bit stores, but those are not available in OpenCL.
1414     */
1415    nir_alu_type T = nir_intrinsic_src_type(instr);
1416    assert(nir_alu_type_get_type_size(T) == 32);
1417 
1418    bi_st_cvt(b, bi_src_index(&instr->src[3]), a[0], a[1], a[2],
1419              BI_REGISTER_FORMAT_AUTO, instr->num_components - 1);
1420 }
1421 
1422 static void
1423 bi_emit_atomic_i32_to(bi_builder *b, bi_index dst, bi_index addr, bi_index arg,
1424                       nir_atomic_op op)
1425 {
1426    enum bi_atom_opc opc = bi_atom_opc_for_nir(op);
1427    enum bi_atom_opc post_opc = opc;
1428    bool bifrost = b->shader->arch <= 8;
1429 
1430    /* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't
1431     * take any vector but can still output in RETURN mode */
1432    bi_index tmp_dest = bifrost ? bi_temp(b->shader) : dst;
1433    unsigned sr_count = bifrost ? 2 : 1;
1434 
1435    /* Generate either ATOM or ATOM1 as required */
1436    if (bi_promote_atom_c1(opc, arg, &opc)) {
1437       bi_atom1_return_i32_to(b, tmp_dest, bi_extract(b, addr, 0),
1438                              bi_extract(b, addr, 1), opc, sr_count);
1439    } else {
1440       bi_atom_return_i32_to(b, tmp_dest, arg, bi_extract(b, addr, 0),
1441                             bi_extract(b, addr, 1), opc, sr_count);
1442    }
1443 
1444    if (bifrost) {
1445       /* Post-process it */
1446       bi_emit_cached_split_i32(b, tmp_dest, 2);
1447       bi_atom_post_i32_to(b, dst, bi_extract(b, tmp_dest, 0),
1448                           bi_extract(b, tmp_dest, 1), post_opc);
1449    }
1450 }
1451 
1452 static void
1453 bi_emit_load_frag_coord_zw(bi_builder *b, bi_index dst, unsigned channel)
1454 {
1455    bi_ld_var_special_to(
1456       b, dst, bi_zero(), BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER,
1457       BI_UPDATE_CLOBBER,
1458       (channel == 2) ? BI_VARYING_NAME_FRAG_Z : BI_VARYING_NAME_FRAG_W,
1459       BI_VECSIZE_NONE);
1460 }
1461 
1462 static void
1463 bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
1464 {
1465    bi_index dest = bi_def_index(&instr->def);
1466    nir_alu_type T = nir_intrinsic_dest_type(instr);
1467    enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
1468    unsigned size = instr->def.bit_size;
1469    unsigned nr = instr->num_components;
1470 
1471    /* Get the render target */
1472    nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
1473    unsigned loc = sem.location;
1474    assert(loc >= FRAG_RESULT_DATA0);
1475    unsigned rt = (loc - FRAG_RESULT_DATA0);
1476 
1477    bi_ld_tile_to(b, dest, bi_pixel_indices(b, rt), bi_coverage(b),
1478                  bi_src_index(&instr->src[0]), regfmt, nr - 1);
1479    bi_emit_cached_split(b, dest, size * nr);
1480 }
1481 
1482 static void
1483 bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
1484 {
1485    bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest
1486                      ? bi_def_index(&instr->def)
1487                      : bi_null();
1488    gl_shader_stage stage = b->shader->stage;
1489 
1490    switch (instr->intrinsic) {
1491    case nir_intrinsic_load_barycentric_pixel:
1492    case nir_intrinsic_load_barycentric_centroid:
1493    case nir_intrinsic_load_barycentric_sample:
1494    case nir_intrinsic_load_barycentric_at_sample:
1495    case nir_intrinsic_load_barycentric_at_offset:
1496       /* handled later via load_vary */
1497       break;
1498    case nir_intrinsic_load_interpolated_input:
1499    case nir_intrinsic_load_input:
1500       if (b->shader->inputs->is_blend)
1501          bi_emit_load_blend_input(b, instr);
1502       else if (stage == MESA_SHADER_FRAGMENT)
1503          bi_emit_load_vary(b, instr);
1504       else if (stage == MESA_SHADER_VERTEX)
1505          bi_emit_load_attr(b, instr);
1506       else
1507          unreachable("Unsupported shader stage");
1508       break;
1509 
1510    case nir_intrinsic_store_output:
1511       if (stage == MESA_SHADER_FRAGMENT)
1512          bi_emit_fragment_out(b, instr);
1513       else if (stage == MESA_SHADER_VERTEX)
1514          bi_emit_store_vary(b, instr);
1515       else
1516          unreachable("Unsupported shader stage");
1517       break;
1518 
1519    case nir_intrinsic_store_combined_output_pan:
1520       assert(stage == MESA_SHADER_FRAGMENT);
1521       bi_emit_fragment_out(b, instr);
1522       break;
1523 
1524    case nir_intrinsic_load_ubo:
1525       bi_emit_load_ubo(b, instr);
1526       break;
1527 
1528    case nir_intrinsic_load_push_constant:
1529       bi_emit_load_push_constant(b, instr);
1530       break;
1531 
1532    case nir_intrinsic_load_global:
1533    case nir_intrinsic_load_global_constant:
1534       bi_emit_load(b, instr, BI_SEG_NONE);
1535       break;
1536 
1537    case nir_intrinsic_store_global:
1538       bi_emit_store(b, instr, BI_SEG_NONE);
1539       break;
1540 
1541    case nir_intrinsic_load_scratch:
1542       bi_emit_load(b, instr, BI_SEG_TL);
1543       break;
1544 
1545    case nir_intrinsic_store_scratch:
1546       bi_emit_store(b, instr, BI_SEG_TL);
1547       break;
1548 
1549    case nir_intrinsic_load_shared:
1550       bi_emit_load(b, instr, BI_SEG_WLS);
1551       break;
1552 
1553    case nir_intrinsic_store_shared:
1554       bi_emit_store(b, instr, BI_SEG_WLS);
1555       break;
1556 
1557    case nir_intrinsic_barrier:
1558       if (nir_intrinsic_execution_scope(instr) != SCOPE_NONE) {
1559          assert(b->shader->stage != MESA_SHADER_FRAGMENT);
1560          assert(nir_intrinsic_execution_scope(instr) > SCOPE_SUBGROUP &&
1561                 "todo: subgroup barriers (different divergence rules)");
1562          bi_barrier(b);
1563       }
1564       /* Blob doesn't seem to do anything for memory barriers, so no need to
1565        * check nir_intrinsic_memory_scope().
1566        */
1567       break;
1568 
1569    case nir_intrinsic_shared_atomic: {
1570       nir_atomic_op op = nir_intrinsic_atomic_op(instr);
1571 
1572       if (op == nir_atomic_op_xchg) {
1573          bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1574                           BI_SEG_WLS);
1575       } else {
1576          assert(nir_src_bit_size(instr->src[1]) == 32);
1577 
1578          bi_index addr = bi_src_index(&instr->src[0]);
1579          bi_index addr_hi;
1580 
1581          if (b->shader->arch >= 9) {
1582             bi_handle_segment(b, &addr, &addr_hi, BI_SEG_WLS, NULL);
1583             addr = bi_collect_v2i32(b, addr, addr_hi);
1584          } else {
1585             addr = bi_seg_add_i64(b, addr, bi_zero(), false, BI_SEG_WLS);
1586             bi_emit_cached_split(b, addr, 64);
1587          }
1588 
1589          bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]), op);
1590       }
1591 
1592       bi_split_def(b, &instr->def);
1593       break;
1594    }
1595 
1596    case nir_intrinsic_global_atomic: {
1597       nir_atomic_op op = nir_intrinsic_atomic_op(instr);
1598 
1599       if (op == nir_atomic_op_xchg) {
1600          bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1601                           BI_SEG_NONE);
1602       } else {
1603          assert(nir_src_bit_size(instr->src[1]) == 32);
1604 
1605          bi_emit_atomic_i32_to(b, dst, bi_src_index(&instr->src[0]),
1606                                bi_src_index(&instr->src[1]), op);
1607       }
1608 
1609       bi_split_def(b, &instr->def);
1610       break;
1611    }
1612 
1613    case nir_intrinsic_image_texel_address:
1614       bi_emit_lea_image_to(b, dst, instr);
1615       break;
1616 
1617    case nir_intrinsic_image_load:
1618       bi_emit_image_load(b, instr);
1619       break;
1620 
1621    case nir_intrinsic_image_store:
1622       bi_emit_image_store(b, instr);
1623       break;
1624 
1625    case nir_intrinsic_global_atomic_swap:
1626       bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1627                           &instr->src[2], BI_SEG_NONE);
1628       bi_split_def(b, &instr->def);
1629       break;
1630 
1631    case nir_intrinsic_shared_atomic_swap:
1632       bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1633                           &instr->src[2], BI_SEG_WLS);
1634       bi_split_def(b, &instr->def);
1635       break;
1636 
1637    case nir_intrinsic_load_pixel_coord:
1638       /* Vectorized load of the preloaded i16vec2 */
1639       bi_mov_i32_to(b, dst, bi_preload(b, 59));
1640       break;
1641 
1642    case nir_intrinsic_load_frag_coord_zw:
1643       bi_emit_load_frag_coord_zw(b, dst, nir_intrinsic_component(instr));
1644       break;
1645 
1646    case nir_intrinsic_load_converted_output_pan:
1647       bi_emit_ld_tile(b, instr);
1648       break;
1649 
1650    case nir_intrinsic_discard_if:
1651       bi_discard_b32(b, bi_src_index(&instr->src[0]));
1652       break;
1653 
1654    case nir_intrinsic_discard:
1655       bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ);
1656       break;
1657 
1658    case nir_intrinsic_load_sample_positions_pan:
1659       bi_collect_v2i32_to(b, dst, bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false),
1660                           bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true));
1661       break;
1662 
1663    case nir_intrinsic_load_sample_mask_in:
1664       /* r61[0:15] contains the coverage bitmap */
1665       bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false));
1666       break;
1667 
1668    case nir_intrinsic_load_sample_mask:
1669       bi_mov_i32_to(b, dst, bi_coverage(b));
1670       break;
1671 
1672    case nir_intrinsic_load_sample_id:
1673       bi_load_sample_id_to(b, dst);
1674       break;
1675 
1676    case nir_intrinsic_load_front_face:
1677       /* r58 == 0 means primitive is front facing */
1678       bi_icmp_i32_to(b, dst, bi_preload(b, 58), bi_zero(), BI_CMPF_EQ,
1679                      BI_RESULT_TYPE_M1);
1680       break;
1681 
1682    case nir_intrinsic_load_point_coord:
1683       bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32,
1684                            BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER,
1685                            BI_VARYING_NAME_POINT, BI_VECSIZE_V2);
1686       bi_emit_cached_split_i32(b, dst, 2);
1687       break;
1688 
1689    /* It appears vertex_id is zero-based with Bifrost geometry flows, but
1690     * not with Valhall's memory-allocation IDVS geometry flow. We only support
1691     * the new flow on Valhall so this is lowered in NIR.
1692     */
1693    case nir_intrinsic_load_vertex_id:
1694    case nir_intrinsic_load_vertex_id_zero_base:
1695       assert(b->shader->malloc_idvs ==
1696              (instr->intrinsic == nir_intrinsic_load_vertex_id));
1697 
1698       bi_mov_i32_to(b, dst, bi_vertex_id(b));
1699       break;
1700 
1701    case nir_intrinsic_load_instance_id:
1702       bi_mov_i32_to(b, dst, bi_instance_id(b));
1703       break;
1704 
1705    case nir_intrinsic_load_subgroup_invocation:
1706       bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false));
1707       break;
1708 
1709    case nir_intrinsic_load_local_invocation_id:
1710       bi_collect_v3i32_to(b, dst,
1711                           bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 0)),
1712                           bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 1)),
1713                           bi_u16_to_u32(b, bi_half(bi_preload(b, 56), 0)));
1714       break;
1715 
1716    case nir_intrinsic_load_workgroup_id:
1717       bi_collect_v3i32_to(b, dst, bi_preload(b, 57), bi_preload(b, 58),
1718                           bi_preload(b, 59));
1719       break;
1720 
1721    case nir_intrinsic_load_global_invocation_id:
1722    case nir_intrinsic_load_global_invocation_id_zero_base:
1723       bi_collect_v3i32_to(b, dst, bi_preload(b, 60), bi_preload(b, 61),
1724                           bi_preload(b, 62));
1725       break;
1726 
1727    case nir_intrinsic_shader_clock:
1728       bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER);
1729       bi_split_def(b, &instr->def);
1730       break;
1731 
1732    default:
1733       fprintf(stderr, "Unhandled intrinsic %s\n",
1734               nir_intrinsic_infos[instr->intrinsic].name);
1735       assert(0);
1736    }
1737 }
1738 
1739 static void
1740 bi_emit_load_const(bi_builder *b, nir_load_const_instr *instr)
1741 {
1742    /* Make sure we've been lowered */
1743    assert(instr->def.num_components <= (32 / instr->def.bit_size));
1744 
1745    /* Accumulate all the channels of the constant, as if we did an
1746     * implicit SEL over them */
1747    uint32_t acc = 0;
1748 
1749    for (unsigned i = 0; i < instr->def.num_components; ++i) {
1750       unsigned v =
1751          nir_const_value_as_uint(instr->value[i], instr->def.bit_size);
1752       acc |= (v << (i * instr->def.bit_size));
1753    }
1754 
1755    bi_mov_i32_to(b, bi_get_index(instr->def.index), bi_imm_u32(acc));
1756 }
1757 
1758 static bi_index
1759 bi_alu_src_index(bi_builder *b, nir_alu_src src, unsigned comps)
1760 {
1761    unsigned bitsize = nir_src_bit_size(src.src);
1762 
1763    /* the bi_index carries the 32-bit (word) offset separate from the
1764     * subword swizzle, first handle the offset */
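   /* Worked example (added for illustration, not original commentary): with
    * bitsize == 16, the subword_shift computed below is 1, so a swizzle
    * selecting component 2 (.z) maps to 32-bit word 2 >> 1 = 1 and half
    * 2 & 1 = 0; with bitsize == 8, component 5 maps to word 5 >> 2 = 1 and
    * byte 5 & 3 = 1. */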
1765 
1766    unsigned offset = 0;
1767 
1768    assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
1769    unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
1770 
1771    for (unsigned i = 0; i < comps; ++i) {
1772       unsigned new_offset = (src.swizzle[i] >> subword_shift);
1773 
1774       if (i > 0)
1775          assert(offset == new_offset && "wrong vectorization");
1776 
1777       offset = new_offset;
1778    }
1779 
1780    bi_index idx = bi_extract(b, bi_src_index(&src.src), offset);
1781 
1782    /* Compose the subword swizzle with existing (identity) swizzle */
1783    assert(idx.swizzle == BI_SWIZZLE_H01);
1784 
1785    /* Bigger vectors should have been lowered */
1786    assert(comps <= (1 << subword_shift));
1787 
1788    if (bitsize == 16) {
1789       unsigned c0 = src.swizzle[0] & 1;
1790       unsigned c1 = (comps > 1) ? src.swizzle[1] & 1 : c0;
1791       idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1);
1792    } else if (bitsize == 8) {
1793       /* 8-bit vectors not yet supported */
1794       assert(comps == 1 && "8-bit vectors not supported");
1795       idx.swizzle = BI_SWIZZLE_B0000 + (src.swizzle[0] & 3);
1796    }
1797 
1798    return idx;
1799 }
1800 
1801 static enum bi_round
1802 bi_nir_round(nir_op op)
1803 {
1804    switch (op) {
1805    case nir_op_fround_even:
1806       return BI_ROUND_NONE;
1807    case nir_op_ftrunc:
1808       return BI_ROUND_RTZ;
1809    case nir_op_fceil:
1810       return BI_ROUND_RTP;
1811    case nir_op_ffloor:
1812       return BI_ROUND_RTN;
1813    default:
1814       unreachable("invalid nir round op");
1815    }
1816 }
1817 
1818 /* Convenience for lowered transcendentals */
1819 
1820 static bi_index
1821 bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1)
1822 {
1823    return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f));
1824 }
1825 
1826 /* Approximate with FRCP_APPROX.f32 and apply a single iteration of
1827  * Newton-Raphson to improve precision */
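/* Informal sketch of that step (added commentary): for an approximation
 * a ~= 1/x, one Newton-Raphson iteration is a' = a * (2 - x * a)
 * = a + a * (1 - x * a), which roughly squares the relative error. The code
 * below applies this to the FREXP mantissa/exponent split and folds the
 * rescaling into FMA_RSCALE.
 */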
1828 
1829 static void
1830 bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0)
1831 {
1832    bi_index x1 = bi_frcp_approx_f32(b, s0);
1833    bi_index m = bi_frexpm_f32(b, s0, false, false);
1834    bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, false);
1835    bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0), bi_zero(),
1836                                    BI_SPECIAL_N);
1837    bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e, BI_SPECIAL_NONE);
1838 }
1839 
1840 static void
1841 bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0)
1842 {
1843    bi_index x1 = bi_frsq_approx_f32(b, s0);
1844    bi_index m = bi_frexpm_f32(b, s0, false, true);
1845    bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, true);
1846    bi_index t1 = bi_fmul_f32(b, x1, x1);
1847    bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0),
1848                                    bi_imm_u32(-1), BI_SPECIAL_N);
1849    bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e, BI_SPECIAL_N);
1850 }
1851 
1852 /* More complex transcendentals, see
1853  * https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc
1854  * for documentation */
1855 
1856 static void
1857 bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0)
1858 {
1859    bi_index t1 = bi_temp(b->shader);
1860    bi_instr *t1_instr = bi_fadd_f32_to(b, t1, s0, bi_imm_u32(0x49400000));
1861    t1_instr->clamp = BI_CLAMP_CLAMP_0_INF;
1862 
1863    bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000));
1864 
1865    bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader), s0, bi_neg(t2));
1866    a2->clamp = BI_CLAMP_CLAMP_M1_1;
1867 
1868    bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE);
1869    bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false);
1870    bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4));
1871    bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635),
1872                             bi_imm_u32(0x3e75fffa));
1873    bi_index p2 = bi_fma_f32(b, p1, a2->dest[0], bi_imm_u32(0x3f317218));
1874    bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2);
1875    bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader), p3, a1t, a1t, a1i,
1876                                       BI_SPECIAL_NONE);
1877    x->clamp = BI_CLAMP_CLAMP_0_INF;
1878 
1879    bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0);
1880    max->sem = BI_SEM_NAN_PROPAGATE;
1881 }
1882 
1883 static void
1884 bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base)
1885 {
1886    /* Scale by the base, multiply by 2^24, and convert to integer to get an
1887     * 8:24 fixed-point input */
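   /* Worked example (added for illustration): for exp2, log2_base is 1.0, so
    * s0 = 1.5 scales to 1.5 * 2^24 and converts to 0x01800000: integer part 1
    * in the top 8 bits, fraction 0.5 in the low 24 bits. */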
1888    bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(),
1889                                       bi_imm_u32(24), BI_SPECIAL_NONE);
1890    bi_instr *fixed_pt = bi_f32_to_s32_to(b, bi_temp(b->shader), scale);
1891    fixed_pt->round = BI_ROUND_NONE; // XXX
1892 
1893    /* Compute the result for the fixed-point input, but pass along
1894     * the floating-point scale for correct NaN propagation */
1895    bi_fexp_f32_to(b, dst, fixed_pt->dest[0], scale);
1896 }
1897 
1898 static void
1899 bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
1900 {
1901    /* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */
1902    bi_index a1 = bi_frexpm_f32(b, s0, true, false);
1903    bi_index ei = bi_frexpe_f32(b, s0, true, false);
1904    bi_index ef = bi_s32_to_f32(b, ei);
1905 
1906    /* xt estimates -log(r1), a coarse approximation of log(a1) */
1907    bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE);
1908    bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE);
1909 
1910    /* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) -
1911     * log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1),
1912     * and then log(s0) = x1 + x2 */
1913    bi_index x1 = bi_fadd_f32(b, ef, xt);
1914 
1915    /* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by
1916     * polynomial approximation around 1. The series is expressed around
1917     * 1, so set y = (a1 * r1) - 1.0 */
1918    bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0));
1919 
1920    /* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate
1921     * log_e(1 + y) by the Taylor series (lower precision than the blob):
1922     * y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */
1923    bi_index loge =
1924       bi_fmul_f32(b, y, bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0)));
1925 
1926    bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0)));
1927 
1928    /* log(s0) = x1 + x2 */
1929    bi_fadd_f32_to(b, dst, x1, x2);
1930 }
1931 
1932 static void
1933 bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
1934 {
1935    bi_index frexp = bi_frexpe_f32(b, s0, true, false);
1936    bi_index frexpi = bi_s32_to_f32(b, frexp);
1937    bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0);
1938    bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi);
1939 }
1940 
1941 static void
1942 bi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
1943 {
1944    bi_index log2_base = bi_null();
1945 
1946    if (base.type == BI_INDEX_CONSTANT) {
1947       log2_base = bi_imm_f32(log2f(uif(base.value)));
1948    } else {
1949       log2_base = bi_temp(b->shader);
1950       bi_lower_flog2_32(b, log2_base, base);
1951    }
1952 
1953    return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base));
1954 }
1955 
1956 static void
1957 bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
1958 {
1959    bi_index log2_base = bi_null();
1960 
1961    if (base.type == BI_INDEX_CONSTANT) {
1962       log2_base = bi_imm_f32(log2f(uif(base.value)));
1963    } else {
1964       log2_base = bi_temp(b->shader);
1965       bi_flog2_32(b, log2_base, base);
1966    }
1967 
1968    return bi_fexp_32(b, dst, exp, log2_base);
1969 }
1970 
1971 /* Bifrost has extremely coarse tables for approximating sin/cos, accessible as
1972  * FSIN/COS_TABLE.u6, which multiplies the bottom 6-bits by pi/32 and
1973  * calculates the results. We use them to calculate sin/cos via a Taylor
1974  * approximation:
1975  *
1976  * f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x)
1977  * sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x)
1978  * cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x)
1979  */
1980 
1981 #define TWO_OVER_PI  bi_imm_f32(2.0f / 3.14159f)
1982 #define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0)
1983 #define SINCOS_BIAS  bi_imm_u32(0x49400000)
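/* Added note on SINCOS_BIAS: 0x49400000 is 1.5 * 2^19 = 786432.0f, where a
 * 32-bit float has a ULP of 1/16. Adding it rounds s0 * (2/pi) to a multiple
 * of 1/16 and leaves roughly (s0 * (2/pi) mod 4) * 16 in the low 6 mantissa
 * bits, which the FSIN/FCOS_TABLE.u6 lookup rescales by pi/32 back to
 * approximately s0 mod 2pi (rounding and sign details glossed over).
 */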
1984 
1985 static void
1986 bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
1987 {
1988    /* The bottom 6 bits of the result, times pi/32, approximate s0 mod 2pi */
1989    bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS);
1990 
1991    /* Approximate domain error (small) */
1992    bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS)),
1993                            MPI_OVER_TWO, s0);
1994 
1995    /* Lookup sin(x), cos(x) */
1996    bi_index sinx = bi_fsin_table_u6(b, x_u6, false);
1997    bi_index cosx = bi_fcos_table_u6(b, x_u6, false);
1998 
1999    /* e^2 / 2 */
2000    bi_index e2_over_2 =
2001       bi_fma_rscale_f32(b, e, e, bi_negzero(), bi_imm_u32(-1), BI_SPECIAL_NONE);
2002 
2003    /* (-e^2)/2 f''(x) */
2004    bi_index quadratic =
2005       bi_fma_f32(b, bi_neg(e2_over_2), cos ? cosx : sinx, bi_negzero());
2006 
2007    /* e f'(x) - (e^2/2) f''(x) */
2008    bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e,
2009                                cos ? bi_neg(sinx) : cosx, quadratic);
2010    I->clamp = BI_CLAMP_CLAMP_M1_1;
2011 
2012    /* f(x) + e f'(x) - (e^2/2) f''(x) */
2013    bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx);
2014 }
2015 
2016 /*
2017  * The XOR lane op is useful for derivative calculations, but not all Bifrost
2018  * implementations have it. Add a safe helper that uses the hardware
2019  * functionality when available and lowers where unavailable.
2020  */
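/* Added note: the fallback path below is equivalent to
 * CLPER_OLD(s0, lane_id ^ s1), i.e. it reads s0 from the lane whose ID
 * differs from ours in exactly the bits set in s1, matching what the
 * LANE_OP_XOR mode does natively.
 */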
2021 static bi_index
2022 bi_clper_xor(bi_builder *b, bi_index s0, bi_index s1)
2023 {
2024    if (!(b->shader->quirks & BIFROST_LIMITED_CLPER)) {
2025       return bi_clper_i32(b, s0, s1, BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_XOR,
2026                           BI_SUBGROUP_SUBGROUP4);
2027    }
2028 
2029    bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false);
2030    bi_index lane = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0));
2031    return bi_clper_old_i32(b, s0, lane);
2032 }
2033 
2034 static enum bi_cmpf
2035 bi_translate_cmpf(nir_op op)
2036 {
2037    switch (op) {
2038    case nir_op_ieq8:
2039    case nir_op_ieq16:
2040    case nir_op_ieq32:
2041    case nir_op_feq16:
2042    case nir_op_feq32:
2043       return BI_CMPF_EQ;
2044 
2045    case nir_op_ine8:
2046    case nir_op_ine16:
2047    case nir_op_ine32:
2048    case nir_op_fneu16:
2049    case nir_op_fneu32:
2050       return BI_CMPF_NE;
2051 
2052    case nir_op_ilt8:
2053    case nir_op_ilt16:
2054    case nir_op_ilt32:
2055    case nir_op_flt16:
2056    case nir_op_flt32:
2057    case nir_op_ult8:
2058    case nir_op_ult16:
2059    case nir_op_ult32:
2060       return BI_CMPF_LT;
2061 
2062    case nir_op_ige8:
2063    case nir_op_ige16:
2064    case nir_op_ige32:
2065    case nir_op_fge16:
2066    case nir_op_fge32:
2067    case nir_op_uge8:
2068    case nir_op_uge16:
2069    case nir_op_uge32:
2070       return BI_CMPF_GE;
2071 
2072    default:
2073       unreachable("invalid comparison");
2074    }
2075 }
2076 
2077 static bool
2078 bi_nir_is_replicated(nir_alu_src *src)
2079 {
2080    for (unsigned i = 1; i < nir_src_num_components(src->src); ++i) {
2081       if (src->swizzle[0] != src->swizzle[i])
2082          return false;
2083    }
2084 
2085    return true;
2086 }
2087 
2088 static void
2089 bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
2090 {
2091    bi_index dst = bi_def_index(&instr->def);
2092    unsigned srcs = nir_op_infos[instr->op].num_inputs;
2093    unsigned sz = instr->def.bit_size;
2094    unsigned comps = instr->def.num_components;
2095    unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0;
2096 
2097    /* Indicate scalarness */
2098    if (sz == 16 && comps == 1)
2099       dst.swizzle = BI_SWIZZLE_H00;
2100 
2101    /* First, match against the various moves in NIR. These are
2102     * special-cased because they can operate on vectors even after
2103     * lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the
2104     * instruction is no "bigger" than SIMD-within-a-register. These moves
2105     * are the exceptions that need to handle swizzles specially. */
2106 
2107    switch (instr->op) {
2108    case nir_op_vec2:
2109    case nir_op_vec3:
2110    case nir_op_vec4:
2111    case nir_op_vec8:
2112    case nir_op_vec16: {
2113       bi_index unoffset_srcs[16] = {bi_null()};
2114       unsigned channels[16] = {0};
2115 
2116       for (unsigned i = 0; i < srcs; ++i) {
2117          unoffset_srcs[i] = bi_src_index(&instr->src[i].src);
2118          channels[i] = instr->src[i].swizzle[0];
2119       }
2120 
2121       bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz);
2122       return;
2123    }
2124 
2125    case nir_op_unpack_32_2x16: {
2126       /* Should have been scalarized */
2127       assert(comps == 2 && sz == 16);
2128 
2129       bi_index vec = bi_src_index(&instr->src[0].src);
2130       unsigned chan = instr->src[0].swizzle[0];
2131 
2132       bi_mov_i32_to(b, dst, bi_extract(b, vec, chan));
2133       return;
2134    }
2135 
2136    case nir_op_unpack_64_2x32_split_x: {
2137       unsigned chan = (instr->src[0].swizzle[0] * 2) + 0;
2138       bi_mov_i32_to(b, dst,
2139                     bi_extract(b, bi_src_index(&instr->src[0].src), chan));
2140       return;
2141    }
2142 
2143    case nir_op_unpack_64_2x32_split_y: {
2144       unsigned chan = (instr->src[0].swizzle[0] * 2) + 1;
2145       bi_mov_i32_to(b, dst,
2146                     bi_extract(b, bi_src_index(&instr->src[0].src), chan));
2147       return;
2148    }
2149 
2150    case nir_op_pack_64_2x32_split:
2151       bi_collect_v2i32_to(b, dst,
2152                           bi_extract(b, bi_src_index(&instr->src[0].src),
2153                                      instr->src[0].swizzle[0]),
2154                           bi_extract(b, bi_src_index(&instr->src[1].src),
2155                                      instr->src[1].swizzle[0]));
2156       return;
2157 
2158    case nir_op_pack_64_2x32:
2159       bi_collect_v2i32_to(b, dst,
2160                           bi_extract(b, bi_src_index(&instr->src[0].src),
2161                                      instr->src[0].swizzle[0]),
2162                           bi_extract(b, bi_src_index(&instr->src[0].src),
2163                                      instr->src[0].swizzle[1]));
2164       return;
2165 
2166    case nir_op_pack_uvec2_to_uint: {
2167       bi_index src = bi_src_index(&instr->src[0].src);
2168 
2169       assert(sz == 32 && src_sz == 32);
2170       bi_mkvec_v2i16_to(
2171          b, dst, bi_half(bi_extract(b, src, instr->src[0].swizzle[0]), false),
2172          bi_half(bi_extract(b, src, instr->src[0].swizzle[1]), false));
2173       return;
2174    }
2175 
2176    case nir_op_pack_uvec4_to_uint: {
2177       bi_index src = bi_src_index(&instr->src[0].src);
2178 
2179       assert(sz == 32 && src_sz == 32);
2180       bi_mkvec_v4i8_to(
2181          b, dst, bi_byte(bi_extract(b, src, instr->src[0].swizzle[0]), 0),
2182          bi_byte(bi_extract(b, src, instr->src[0].swizzle[1]), 0),
2183          bi_byte(bi_extract(b, src, instr->src[0].swizzle[2]), 0),
2184          bi_byte(bi_extract(b, src, instr->src[0].swizzle[3]), 0));
2185       return;
2186    }
2187 
2188    case nir_op_mov: {
2189       bi_index idx = bi_src_index(&instr->src[0].src);
2190       bi_index unoffset_srcs[4] = {idx, idx, idx, idx};
2191 
2192       unsigned channels[4] = {
2193          comps > 0 ? instr->src[0].swizzle[0] : 0,
2194          comps > 1 ? instr->src[0].swizzle[1] : 0,
2195          comps > 2 ? instr->src[0].swizzle[2] : 0,
2196          comps > 3 ? instr->src[0].swizzle[3] : 0,
2197       };
2198 
2199       bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, src_sz);
2200       return;
2201    }
2202 
2203    case nir_op_pack_32_2x16: {
2204       assert(comps == 1);
2205 
2206       bi_index idx = bi_src_index(&instr->src[0].src);
2207       bi_index unoffset_srcs[4] = {idx, idx, idx, idx};
2208 
2209       unsigned channels[2] = {instr->src[0].swizzle[0],
2210                               instr->src[0].swizzle[1]};
2211 
2212       bi_make_vec_to(b, dst, unoffset_srcs, channels, 2, 16);
2213       return;
2214    }
2215 
2216    case nir_op_f2f16:
2217    case nir_op_f2f16_rtz:
2218    case nir_op_f2f16_rtne: {
2219       assert(src_sz == 32);
2220       bi_index idx = bi_src_index(&instr->src[0].src);
2221       bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2222       bi_index s1 =
2223          comps > 1 ? bi_extract(b, idx, instr->src[0].swizzle[1]) : s0;
2224 
2225       bi_instr *I = bi_v2f32_to_v2f16_to(b, dst, s0, s1);
2226 
2227       /* Override rounding if explicitly requested. Otherwise, the
2228        * default rounding mode is selected by the builder. Depending
2229        * on the float controls required by the shader, the default
2230        * mode may not be nearest-even.
2231        */
2232       if (instr->op == nir_op_f2f16_rtz)
2233          I->round = BI_ROUND_RTZ;
2234       else if (instr->op == nir_op_f2f16_rtne)
2235          I->round = BI_ROUND_NONE; /* Nearest even */
2236 
2237       return;
2238    }
2239 
2240    /* Vectorized downcasts */
2241    case nir_op_u2u16:
2242    case nir_op_i2i16: {
2243       if (!(src_sz == 32 && comps == 2))
2244          break;
2245 
2246       bi_index idx = bi_src_index(&instr->src[0].src);
2247       bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2248       bi_index s1 = bi_extract(b, idx, instr->src[0].swizzle[1]);
2249 
2250       bi_mkvec_v2i16_to(b, dst, bi_half(s0, false), bi_half(s1, false));
2251       return;
2252    }
2253 
2254    /* While we do not have a direct V2U32_TO_V2F16 instruction, lowering to
2255     * MKVEC.v2i16 + V2U16_TO_V2F16 is more efficient on Bifrost than
2256     * scalarizing due to scheduling (equal cost on Valhall). Additionally
2257     * if the source is replicated the MKVEC.v2i16 can be optimized out.
2258     */
2259    case nir_op_u2f16:
2260    case nir_op_i2f16: {
2261       if (!(src_sz == 32 && comps == 2))
2262          break;
2263 
2264       nir_alu_src *src = &instr->src[0];
2265       bi_index idx = bi_src_index(&src->src);
2266       bi_index s0 = bi_extract(b, idx, src->swizzle[0]);
2267       bi_index s1 = bi_extract(b, idx, src->swizzle[1]);
2268 
2269       bi_index t =
2270          (src->swizzle[0] == src->swizzle[1])
2271             ? bi_half(s0, false)
2272             : bi_mkvec_v2i16(b, bi_half(s0, false), bi_half(s1, false));
2273 
2274       if (instr->op == nir_op_u2f16)
2275          bi_v2u16_to_v2f16_to(b, dst, t);
2276       else
2277          bi_v2s16_to_v2f16_to(b, dst, t);
2278 
2279       return;
2280    }
2281 
2282    case nir_op_i2i8:
2283    case nir_op_u2u8: {
2284       /* Acts like an 8-bit swizzle */
2285       bi_index idx = bi_src_index(&instr->src[0].src);
2286       unsigned factor = src_sz / 8;
2287       unsigned chan[4] = {0};
2288 
2289       for (unsigned i = 0; i < comps; ++i)
2290          chan[i] = instr->src[0].swizzle[i] * factor;
2291 
2292       bi_make_vec_to(b, dst, &idx, chan, comps, 8);
2293       return;
2294    }
2295 
2296    case nir_op_b32csel: {
2297       if (sz != 16)
2298          break;
2299 
2300       /* We allow vectorizing b32csel(cond, A, B) which can be
2301        * translated as MUX.v2i16, even though cond is a 32-bit vector.
2302        *
2303        * If the source condition vector is replicated, we can use
2304        * MUX.v2i16 directly, letting each component use the
2305        * corresponding half of the 32-bit source. NIR uses 0/~0
2306        * booleans so that's guaranteed to work (that is, 32-bit NIR
2307        * booleans are 16-bit replicated).
2308        *
2309        * If we're not replicated, we use the same trick but must
2310        * insert a MKVEC.v2i16 first to convert down to 16-bit.
2311        */
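      /* Added example: a true 32-bit NIR boolean is 0xFFFFFFFF, so both of
       * its 16-bit halves are 0xFFFF and MUX.v2i16 can consume it directly
       * as a v2i16 condition without repacking. */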
2312       bi_index idx = bi_src_index(&instr->src[0].src);
2313       bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2314       bi_index s1 = bi_alu_src_index(b, instr->src[1], comps);
2315       bi_index s2 = bi_alu_src_index(b, instr->src[2], comps);
2316 
2317       if (!bi_nir_is_replicated(&instr->src[0])) {
2318          s0 = bi_mkvec_v2i16(
2319             b, bi_half(s0, false),
2320             bi_half(bi_extract(b, idx, instr->src[0].swizzle[1]), false));
2321       }
2322 
2323       bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2324       return;
2325    }
2326 
2327    default:
2328       break;
2329    }
2330 
2331    bi_index s0 =
2332       srcs > 0 ? bi_alu_src_index(b, instr->src[0], comps) : bi_null();
2333    bi_index s1 =
2334       srcs > 1 ? bi_alu_src_index(b, instr->src[1], comps) : bi_null();
2335    bi_index s2 =
2336       srcs > 2 ? bi_alu_src_index(b, instr->src[2], comps) : bi_null();
2337 
2338    switch (instr->op) {
2339    case nir_op_ffma:
2340       bi_fma_to(b, sz, dst, s0, s1, s2);
2341       break;
2342 
2343    case nir_op_fmul:
2344       bi_fma_to(b, sz, dst, s0, s1, bi_negzero());
2345       break;
2346 
2347    case nir_op_fadd:
2348       bi_fadd_to(b, sz, dst, s0, s1);
2349       break;
2350 
2351    case nir_op_fsat: {
2352       bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2353       I->clamp = BI_CLAMP_CLAMP_0_1;
2354       break;
2355    }
2356 
2357    case nir_op_fsat_signed_mali: {
2358       bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2359       I->clamp = BI_CLAMP_CLAMP_M1_1;
2360       break;
2361    }
2362 
2363    case nir_op_fclamp_pos_mali: {
2364       bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2365       I->clamp = BI_CLAMP_CLAMP_0_INF;
2366       break;
2367    }
2368 
2369    case nir_op_fneg:
2370       bi_fabsneg_to(b, sz, dst, bi_neg(s0));
2371       break;
2372 
2373    case nir_op_fabs:
2374       bi_fabsneg_to(b, sz, dst, bi_abs(s0));
2375       break;
2376 
2377    case nir_op_fsin:
2378       bi_lower_fsincos_32(b, dst, s0, false);
2379       break;
2380 
2381    case nir_op_fcos:
2382       bi_lower_fsincos_32(b, dst, s0, true);
2383       break;
2384 
2385    case nir_op_fexp2:
2386       assert(sz == 32); /* should've been lowered */
2387 
2388       if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2389          bi_lower_fexp2_32(b, dst, s0);
2390       else
2391          bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f));
2392 
2393       break;
2394 
2395    case nir_op_flog2:
2396       assert(sz == 32); /* should've been lowered */
2397 
2398       if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2399          bi_lower_flog2_32(b, dst, s0);
2400       else
2401          bi_flog2_32(b, dst, s0);
2402 
2403       break;
2404 
2405    case nir_op_fpow:
2406       assert(sz == 32); /* should've been lowered */
2407 
2408       if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2409          bi_lower_fpow_32(b, dst, s0, s1);
2410       else
2411          bi_fpow_32(b, dst, s0, s1);
2412 
2413       break;
2414 
2415    case nir_op_frexp_exp:
2416       bi_frexpe_to(b, sz, dst, s0, false, false);
2417       break;
2418 
2419    case nir_op_frexp_sig:
2420       bi_frexpm_to(b, sz, dst, s0, false, false);
2421       break;
2422 
2423    case nir_op_ldexp:
2424       bi_ldexp_to(b, sz, dst, s0, s1);
2425       break;
2426 
2427    case nir_op_b8csel:
2428       bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2429       break;
2430 
2431    case nir_op_b16csel:
2432       bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2433       break;
2434 
2435    case nir_op_b32csel:
2436       bi_mux_i32_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2437       break;
2438 
2439    case nir_op_extract_u8:
2440    case nir_op_extract_i8: {
2441       assert(comps == 1 && "should be scalarized");
2442       assert((src_sz == 16 || src_sz == 32) && "should be lowered");
2443       unsigned byte = nir_alu_src_as_uint(instr->src[1]);
2444 
2445       if (s0.swizzle == BI_SWIZZLE_H11) {
2446          assert(byte < 2);
2447          byte += 2;
2448       } else if (s0.swizzle != BI_SWIZZLE_H01) {
2449          assert(s0.swizzle == BI_SWIZZLE_H00);
2450       }
2451 
2452       assert(byte < 4);
2453 
2454       s0.swizzle = BI_SWIZZLE_H01;
2455 
2456       if (instr->op == nir_op_extract_i8)
2457          bi_s8_to_s32_to(b, dst, bi_byte(s0, byte));
2458       else
2459          bi_u8_to_u32_to(b, dst, bi_byte(s0, byte));
2460       break;
2461    }
2462 
2463    case nir_op_extract_u16:
2464    case nir_op_extract_i16: {
2465       assert(comps == 1 && "should be scalarized");
2466       assert(src_sz == 32 && "should be lowered");
2467       unsigned half = nir_alu_src_as_uint(instr->src[1]);
2468       assert(half == 0 || half == 1);
2469 
2470       if (instr->op == nir_op_extract_i16)
2471          bi_s16_to_s32_to(b, dst, bi_half(s0, half));
2472       else
2473          bi_u16_to_u32_to(b, dst, bi_half(s0, half));
2474       break;
2475    }
2476 
2477    case nir_op_insert_u16: {
2478       assert(comps == 1 && "should be scalarized");
2479       unsigned half = nir_alu_src_as_uint(instr->src[1]);
2480       assert(half == 0 || half == 1);
2481 
2482       if (half == 0)
2483          bi_u16_to_u32_to(b, dst, bi_half(s0, 0));
2484       else
2485          bi_mkvec_v2i16_to(b, dst, bi_imm_u16(0), bi_half(s0, 0));
2486       break;
2487    }
2488 
2489    case nir_op_ishl:
2490       bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
2491       break;
2492    case nir_op_ushr:
2493       bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), false);
2494       break;
2495 
2496    case nir_op_ishr:
2497       if (b->shader->arch >= 9)
2498          bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), true);
2499       else
2500          bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0));
2501       break;
2502 
2503    case nir_op_imin:
2504    case nir_op_umin:
2505       bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, s0, s1, s0,
2506                  s1, BI_CMPF_LT);
2507       break;
2508 
2509    case nir_op_imax:
2510    case nir_op_umax:
2511       bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, s0, s1, s0,
2512                  s1, BI_CMPF_GT);
2513       break;
2514 
2515    case nir_op_fddx_must_abs_mali:
2516    case nir_op_fddy_must_abs_mali: {
2517       bi_index bit = bi_imm_u32(instr->op == nir_op_fddx_must_abs_mali ? 1 : 2);
2518       bi_index adjacent = bi_clper_xor(b, s0, bit);
2519       bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0));
2520       break;
2521    }
2522 
2523    case nir_op_fddx:
2524    case nir_op_fddy:
2525    case nir_op_fddx_coarse:
2526    case nir_op_fddy_coarse:
2527    case nir_op_fddx_fine:
2528    case nir_op_fddy_fine: {
2529       unsigned axis;
2530       switch (instr->op) {
2531       case nir_op_fddx:
2532       case nir_op_fddx_coarse:
2533       case nir_op_fddx_fine:
2534          axis = 1;
2535          break;
2536       case nir_op_fddy:
2537       case nir_op_fddy_coarse:
2538       case nir_op_fddy_fine:
2539          axis = 2;
2540          break;
2541       default:
2542          unreachable("Invalid derivative op");
2543       }
2544 
2545       bi_index lane1, lane2;
2546       switch (instr->op) {
2547       case nir_op_fddx:
2548       case nir_op_fddx_fine:
2549       case nir_op_fddy:
2550       case nir_op_fddy_fine:
2551          lane1 = bi_lshift_and_i32(b, bi_fau(BIR_FAU_LANE_ID, false),
2552                                    bi_imm_u32(0x3 & ~axis), bi_imm_u8(0));
2553 
2554          lane2 = bi_iadd_u32(b, lane1, bi_imm_u32(axis), false);
2555          break;
2556       case nir_op_fddx_coarse:
2557       case nir_op_fddy_coarse:
2558          lane1 = bi_imm_u32(0);
2559          lane2 = bi_imm_u32(axis);
2560          break;
2561       default:
2562          unreachable("Invalid derivative op");
2563       }
2564 
2565       bi_index left, right;
2566 
2567       if (b->shader->quirks & BIFROST_LIMITED_CLPER) {
2568          left = bi_clper_old_i32(b, s0, lane1);
2569          right = bi_clper_old_i32(b, s0, lane2);
2570       } else {
2571          left = bi_clper_i32(b, s0, lane1, BI_INACTIVE_RESULT_ZERO,
2572                              BI_LANE_OP_NONE, BI_SUBGROUP_SUBGROUP4);
2573 
2574          right = bi_clper_i32(b, s0, lane2, BI_INACTIVE_RESULT_ZERO,
2575                               BI_LANE_OP_NONE, BI_SUBGROUP_SUBGROUP4);
2576       }
2577 
2578       bi_fadd_to(b, sz, dst, right, bi_neg(left));
2579       break;
2580    }
2581 
2582    case nir_op_f2f32:
2583       bi_f16_to_f32_to(b, dst, s0);
2584       break;
2585 
2586    case nir_op_fquantize2f16: {
2587       bi_instr *f16 = bi_v2f32_to_v2f16_to(b, bi_temp(b->shader), s0, s0);
2588       bi_instr *f32 = bi_f16_to_f32_to(b, dst, bi_half(f16->dest[0], false));
2589 
2590       f16->ftz = f32->ftz = true;
2591       break;
2592    }
2593 
2594    case nir_op_f2i32:
2595       if (src_sz == 32)
2596          bi_f32_to_s32_to(b, dst, s0);
2597       else
2598          bi_f16_to_s32_to(b, dst, s0);
2599       break;
2600 
2601    /* Note 32-bit sources => no vectorization, so 32-bit works */
2602    case nir_op_f2u16:
2603       if (src_sz == 32)
2604          bi_f32_to_u32_to(b, dst, s0);
2605       else
2606          bi_v2f16_to_v2u16_to(b, dst, s0);
2607       break;
2608 
2609    case nir_op_f2i16:
2610       if (src_sz == 32)
2611          bi_f32_to_s32_to(b, dst, s0);
2612       else
2613          bi_v2f16_to_v2s16_to(b, dst, s0);
2614       break;
2615 
2616    case nir_op_f2u32:
2617       if (src_sz == 32)
2618          bi_f32_to_u32_to(b, dst, s0);
2619       else
2620          bi_f16_to_u32_to(b, dst, s0);
2621       break;
2622 
2623    case nir_op_u2f16:
2624       if (src_sz == 32)
2625          bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false));
2626       else if (src_sz == 16)
2627          bi_v2u16_to_v2f16_to(b, dst, s0);
2628       else if (src_sz == 8)
2629          bi_v2u8_to_v2f16_to(b, dst, s0);
2630       break;
2631 
2632    case nir_op_u2f32:
2633       if (src_sz == 32)
2634          bi_u32_to_f32_to(b, dst, s0);
2635       else if (src_sz == 16)
2636          bi_u16_to_f32_to(b, dst, s0);
2637       else
2638          bi_u8_to_f32_to(b, dst, s0);
2639       break;
2640 
2641    case nir_op_i2f16:
2642       if (src_sz == 32)
2643          bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false));
2644       else if (src_sz == 16)
2645          bi_v2s16_to_v2f16_to(b, dst, s0);
2646       else if (src_sz == 8)
2647          bi_v2s8_to_v2f16_to(b, dst, s0);
2648       break;
2649 
2650    case nir_op_i2f32:
2651       assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
2652 
2653       if (src_sz == 32)
2654          bi_s32_to_f32_to(b, dst, s0);
2655       else if (src_sz == 16)
2656          bi_s16_to_f32_to(b, dst, s0);
2657       else if (src_sz == 8)
2658          bi_s8_to_f32_to(b, dst, s0);
2659       break;
2660 
2661    case nir_op_i2i32:
2662       assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
2663 
2664       if (src_sz == 32)
2665          bi_mov_i32_to(b, dst, s0);
2666       else if (src_sz == 16)
2667          bi_s16_to_s32_to(b, dst, s0);
2668       else if (src_sz == 8)
2669          bi_s8_to_s32_to(b, dst, s0);
2670       break;
2671 
2672    case nir_op_u2u32:
2673       assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
2674 
2675       if (src_sz == 32)
2676          bi_mov_i32_to(b, dst, s0);
2677       else if (src_sz == 16)
2678          bi_u16_to_u32_to(b, dst, s0);
2679       else if (src_sz == 8)
2680          bi_u8_to_u32_to(b, dst, s0);
2681 
2682       break;
2683 
2684    case nir_op_i2i16:
2685       assert(src_sz == 8 || src_sz == 32);
2686 
2687       if (src_sz == 8)
2688          bi_v2s8_to_v2s16_to(b, dst, s0);
2689       else
2690          bi_mov_i32_to(b, dst, s0);
2691       break;
2692 
2693    case nir_op_u2u16:
2694       assert(src_sz == 8 || src_sz == 32);
2695 
2696       if (src_sz == 8)
2697          bi_v2u8_to_v2u16_to(b, dst, s0);
2698       else
2699          bi_mov_i32_to(b, dst, s0);
2700       break;
2701 
2702    case nir_op_b2i8:
2703    case nir_op_b2i16:
2704    case nir_op_b2i32:
2705       bi_mux_to(b, sz, dst, bi_imm_u8(0), bi_imm_uintN(1, sz), s0,
2706                 BI_MUX_INT_ZERO);
2707       break;
2708 
2709    case nir_op_ieq8:
2710    case nir_op_ine8:
2711    case nir_op_ilt8:
2712    case nir_op_ige8:
2713    case nir_op_ieq16:
2714    case nir_op_ine16:
2715    case nir_op_ilt16:
2716    case nir_op_ige16:
2717    case nir_op_ieq32:
2718    case nir_op_ine32:
2719    case nir_op_ilt32:
2720    case nir_op_ige32:
2721       bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, bi_translate_cmpf(instr->op),
2722                  BI_RESULT_TYPE_M1);
2723       break;
2724 
2725    case nir_op_ult8:
2726    case nir_op_uge8:
2727    case nir_op_ult16:
2728    case nir_op_uge16:
2729    case nir_op_ult32:
2730    case nir_op_uge32:
2731       bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1,
2732                  bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1);
2733       break;
2734 
2735    case nir_op_feq32:
2736    case nir_op_feq16:
2737    case nir_op_flt32:
2738    case nir_op_flt16:
2739    case nir_op_fge32:
2740    case nir_op_fge16:
2741    case nir_op_fneu32:
2742    case nir_op_fneu16:
2743       bi_fcmp_to(b, sz, dst, s0, s1, bi_translate_cmpf(instr->op),
2744                  BI_RESULT_TYPE_M1);
2745       break;
2746 
2747    case nir_op_fround_even:
2748    case nir_op_fceil:
2749    case nir_op_ffloor:
2750    case nir_op_ftrunc:
2751       bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op));
2752       break;
2753 
2754    case nir_op_fmin:
2755       bi_fmin_to(b, sz, dst, s0, s1);
2756       break;
2757 
2758    case nir_op_fmax:
2759       bi_fmax_to(b, sz, dst, s0, s1);
2760       break;
2761 
2762    case nir_op_iadd:
2763       bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false);
2764       break;
2765 
2766    case nir_op_iadd_sat:
2767       bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true);
2768       break;
2769 
2770    case nir_op_uadd_sat:
2771       bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true);
2772       break;
2773 
2774    case nir_op_ihadd:
2775       bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN);
2776       break;
2777 
2778    case nir_op_irhadd:
2779       bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP);
2780       break;
2781 
2782    case nir_op_uhadd:
2783       bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTN);
2784       break;
2785 
2786    case nir_op_urhadd:
2787       bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTP);
2788       break;
2789 
2790    case nir_op_ineg:
2791       bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false);
2792       break;
2793 
2794    case nir_op_isub:
2795       bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false);
2796       break;
2797 
2798    case nir_op_isub_sat:
2799       bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true);
2800       break;
2801 
2802    case nir_op_usub_sat:
2803       bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true);
2804       break;
2805 
2806    case nir_op_imul:
2807       bi_imul_to(b, sz, dst, s0, s1);
2808       break;
2809 
2810    case nir_op_iabs:
2811       bi_iabs_to(b, sz, dst, s0);
2812       break;
2813 
2814    case nir_op_iand:
2815       bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));
2816       break;
2817 
2818    case nir_op_ior:
2819       bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));
2820       break;
2821 
2822    case nir_op_ixor:
2823       bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));
2824       break;
2825 
2826    case nir_op_inot:
2827       bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));
2828       break;
2829 
2830    case nir_op_frsq:
2831       if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2832          bi_lower_frsq_32(b, dst, s0);
2833       else
2834          bi_frsq_to(b, sz, dst, s0);
2835       break;
2836 
2837    case nir_op_frcp:
2838       if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2839          bi_lower_frcp_32(b, dst, s0);
2840       else
2841          bi_frcp_to(b, sz, dst, s0);
2842       break;
2843 
2844    case nir_op_uclz:
2845       bi_clz_to(b, sz, dst, s0, false);
2846       break;
2847 
2848    case nir_op_bit_count:
2849       assert(sz == 32 && src_sz == 32 && "should've been lowered");
2850       bi_popcount_i32_to(b, dst, s0);
2851       break;
2852 
2853    case nir_op_bitfield_reverse:
2854       assert(sz == 32 && src_sz == 32 && "should've been lowered");
2855       bi_bitrev_i32_to(b, dst, s0);
2856       break;
2857 
2858    case nir_op_ufind_msb: {
2859       bi_index clz = bi_clz(b, src_sz, s0, false);
2860 
2861       if (sz == 8)
2862          clz = bi_byte(clz, 0);
2863       else if (sz == 16)
2864          clz = bi_half(clz, false);
2865 
2866       bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false);
2867       break;
2868    }
2869 
2870    default:
2871       fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
2872       unreachable("Unknown ALU op");
2873    }
2874 }
2875 
2876 /* Returns dimension with 0 special casing cubemaps. Shamelessly copied from
2877  * Midgard */
2878 static unsigned
2879 bifrost_tex_format(enum glsl_sampler_dim dim)
2880 {
2881    switch (dim) {
2882    case GLSL_SAMPLER_DIM_1D:
2883    case GLSL_SAMPLER_DIM_BUF:
2884       return 1;
2885 
2886    case GLSL_SAMPLER_DIM_2D:
2887    case GLSL_SAMPLER_DIM_MS:
2888    case GLSL_SAMPLER_DIM_EXTERNAL:
2889    case GLSL_SAMPLER_DIM_RECT:
2890       return 2;
2891 
2892    case GLSL_SAMPLER_DIM_3D:
2893       return 3;
2894 
2895    case GLSL_SAMPLER_DIM_CUBE:
2896       return 0;
2897 
2898    default:
2899       DBG("Unknown sampler dim type\n");
2900       assert(0);
2901       return 0;
2902    }
2903 }
2904 
2905 static enum bi_dimension
2906 valhall_tex_dimension(enum glsl_sampler_dim dim)
2907 {
2908    switch (dim) {
2909    case GLSL_SAMPLER_DIM_1D:
2910    case GLSL_SAMPLER_DIM_BUF:
2911       return BI_DIMENSION_1D;
2912 
2913    case GLSL_SAMPLER_DIM_2D:
2914    case GLSL_SAMPLER_DIM_MS:
2915    case GLSL_SAMPLER_DIM_EXTERNAL:
2916    case GLSL_SAMPLER_DIM_RECT:
2917       return BI_DIMENSION_2D;
2918 
2919    case GLSL_SAMPLER_DIM_3D:
2920       return BI_DIMENSION_3D;
2921 
2922    case GLSL_SAMPLER_DIM_CUBE:
2923       return BI_DIMENSION_CUBE;
2924 
2925    default:
2926       unreachable("Unknown sampler dim type");
2927    }
2928 }
2929 
2930 static enum bifrost_texture_format_full
2931 bi_texture_format(nir_alu_type T, enum bi_clamp clamp)
2932 {
2933    switch (T) {
2934    case nir_type_float16:
2935       return BIFROST_TEXTURE_FORMAT_F16 + clamp;
2936    case nir_type_float32:
2937       return BIFROST_TEXTURE_FORMAT_F32 + clamp;
2938    case nir_type_uint16:
2939       return BIFROST_TEXTURE_FORMAT_U16;
2940    case nir_type_int16:
2941       return BIFROST_TEXTURE_FORMAT_S16;
2942    case nir_type_uint32:
2943       return BIFROST_TEXTURE_FORMAT_U32;
2944    case nir_type_int32:
2945       return BIFROST_TEXTURE_FORMAT_S32;
2946    default:
2947       unreachable("Invalid type for texturing");
2948    }
2949 }
2950 
2951 /* Array indices are specified as 32-bit uints and need to be converted; they
2952  * arrive in the .z component from NIR */
2953 static bi_index
2954 bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T)
2955 {
2956    /* For (u)int we can just passthrough */
2957    nir_alu_type base = nir_alu_type_get_base_type(T);
2958    if (base == nir_type_int || base == nir_type_uint)
2959       return idx;
2960 
2961    /* Otherwise we convert */
2962    assert(T == nir_type_float32);
2963 
2964    /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and
2965     * Texel Selection") defines the layer to be taken from clamp(RNE(r),
2966     * 0, dt - 1). So we use round RTE, clamping is handled at the data
2967     * structure level */
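   /* Added example: a layer coordinate of 2.6 rounds to 3 here; the clamp to
    * the valid layer range happens at the data structure level as noted
    * above. */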
2968 
2969    bi_instr *I = bi_f32_to_u32_to(b, bi_temp(b->shader), idx);
2970    I->round = BI_ROUND_NONE;
2971    return I->dest[0];
2972 }
2973 
2974 /* TEXC's explicit and bias LOD modes require the LOD to be transformed to a
2975  * 16-bit 8:8 fixed-point format. We lower as:
2976  *
2977  * F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF =
2978  * MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0)
2979  */
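/* Worked example (added for illustration): an LOD of 2.5 encodes as
 * 2.5 * 256 = 640 = 0x0280 (integer part 2, fraction 0x80 = 0.5), and an LOD
 * of -0.5 encodes as -128 & 0xFFFF = 0xFF80.
 */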
2980 
2981 static bi_index
2982 bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16)
2983 {
2984    /* Precompute for constant LODs to avoid general constant folding */
2985    if (lod.type == BI_INDEX_CONSTANT) {
2986       uint32_t raw = lod.value;
2987       float x = fp16 ? _mesa_half_to_float(raw) : uif(raw);
2988       int32_t s32 = CLAMP(x, -16.0f, 16.0f) * 256.0f;
2989       return bi_imm_u32(s32 & 0xFFFF);
2990    }
2991 
2992    /* Sort of arbitrary. Must be less than 128.0, greater than or equal to
2993     * the max LOD (16 since we cap at 2^16 texture dimensions), and
2994     * preferably small to minimize precision loss */
2995    const float max_lod = 16.0;
2996 
2997    bi_instr *fsat =
2998       bi_fma_f32_to(b, bi_temp(b->shader), fp16 ? bi_half(lod, false) : lod,
2999                     bi_imm_f32(1.0f / max_lod), bi_negzero());
3000 
3001    fsat->clamp = BI_CLAMP_CLAMP_M1_1;
3002 
3003    bi_index fmul =
3004       bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f), bi_negzero());
3005 
3006    return bi_mkvec_v2i16(b, bi_half(bi_f32_to_s32(b, fmul), false),
3007                          bi_imm_u16(0));
3008 }
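
/*
 * For illustration only, two sample evaluations of the 8:8 transform above
 * (arbitrary LOD values):
 *
 *    x =  2.5: clamp( 2.5, -16.0, 16.0) * 256.0 =  640 -> 0x0280
 *    x = -1.5: clamp(-1.5, -16.0, 16.0) * 256.0 = -384 -> 0xFE80 (low 16 bits)
 *
 * The FMA-friendly form agrees, e.g. for x = 2.5:
 * clamp(2.5 * 1/16, -1.0, 1.0) * (16.0 * 256.0) = 0.15625 * 4096 = 640.
 */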
3009 
3010 /* FETCH takes a 32-bit staging register containing the LOD as an integer in
3011  * the bottom 16-bits and (if present) the cube face index in the top 16-bits.
3012  * TODO: Cube face.
3013  */
3014 
3015 static bi_index
3016 bi_emit_texc_lod_cube(bi_builder *b, bi_index lod)
3017 {
3018    return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8));
3019 }
3020 
3021 /* The hardware specifies texel offsets and multisample indices together as a
3022  * u8vec4 <offset, ms index>. By default all are zero, so if we have either a
3023  * nonzero texel offset or a nonzero multisample index, we build a u8vec4 with
3024  * the bits we need and return that to be passed as a staging register. Else we
3025  * return 0 to avoid allocating a data register when everything is zero. */
3026 
3027 static bi_index
3028 bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr)
3029 {
3030    bi_index dest = bi_zero();
3031 
3032    int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
3033    if (offs_idx >= 0 && (!nir_src_is_const(instr->src[offs_idx].src) ||
3034                          nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
3035       unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
3036       bi_index idx = bi_src_index(&instr->src[offs_idx].src);
3037       dest = bi_mkvec_v4i8(
3038          b, (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
3039          (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0),
3040          (nr > 2) ? bi_byte(bi_extract(b, idx, 2), 0) : bi_imm_u8(0),
3041          bi_imm_u8(0));
3042    }
3043 
3044    int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
3045    if (ms_idx >= 0 && (!nir_src_is_const(instr->src[ms_idx].src) ||
3046                        nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
3047       dest = bi_lshift_or_i32(b, bi_src_index(&instr->src[ms_idx].src), dest,
3048                               bi_imm_u8(24));
3049    }
3050 
3051    return dest;
3052 }
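
/*
 * For illustration (made-up values): a texel offset of (1, -2) packs as
 * <0x01, 0xFE, 0x00, 0x00>, i.e. 0x0000FE01 in the staging register, while a
 * lone multisample index of 3 lands in the top byte as 0x03000000.
 */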
3053 
3054 /*
3055  * Valhall specifies texel offsets, multisample indices, and (for
3056  * fetches) LOD together as a u8vec4 <offset.xyz, LOD>, where the third
3057  * component is either offset.z or multisample index depending on context. Build
3058  * this register.
3059  */
3060 static bi_index
3061 bi_emit_valhall_offsets(bi_builder *b, nir_tex_instr *instr)
3062 {
3063    bi_index dest = bi_zero();
3064 
3065    int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
3066    int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
3067    int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
3068 
3069    /* Components 0-2: offsets */
3070    if (offs_idx >= 0 && (!nir_src_is_const(instr->src[offs_idx].src) ||
3071                          nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
3072       unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
3073       bi_index idx = bi_src_index(&instr->src[offs_idx].src);
3074 
3075       /* No multisample index with 3D */
3076       assert((nr <= 2) || (ms_idx < 0));
3077 
3078       /* Zero extend the Z byte so we can use it with MKVEC.v2i8 */
3079       bi_index z = (nr > 2)
3080                       ? bi_mkvec_v2i8(b, bi_byte(bi_extract(b, idx, 2), 0),
3081                                       bi_imm_u8(0), bi_zero())
3082                       : bi_zero();
3083 
3084       dest = bi_mkvec_v2i8(
3085          b, (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
3086          (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0), z);
3087    }
3088 
3089    /* Component 2: multisample index */
3090    if (ms_idx >= 0 && (!nir_src_is_const(instr->src[ms_idx].src) ||
3091                        nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
3092       dest = bi_mkvec_v2i16(b, dest, bi_src_index(&instr->src[ms_idx].src));
3093    }
3094 
3095    /* Component 3: 8-bit LOD */
3096    if (lod_idx >= 0 &&
3097        (!nir_src_is_const(instr->src[lod_idx].src) ||
3098         nir_src_as_uint(instr->src[lod_idx].src) != 0) &&
3099        nir_tex_instr_src_type(instr, lod_idx) != nir_type_float) {
3100       dest = bi_lshift_or_i32(b, bi_src_index(&instr->src[lod_idx].src), dest,
3101                               bi_imm_u8(24));
3102    }
3103 
3104    return dest;
3105 }
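
/*
 * A hypothetical packing under the layout above: a 3D fetch with texel offset
 * (2, -1, 1) and an integer LOD of 3 builds the u8vec4 <0x02, 0xFF, 0x01, 0x03>,
 * i.e. a staging register value of 0x0301FF02.
 */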
3106 
3107 static void
3108 bi_emit_cube_coord(bi_builder *b, bi_index coord, bi_index *face, bi_index *s,
3109                    bi_index *t)
3110 {
3111    /* Compute max { |x|, |y|, |z| } */
3112    bi_index maxxyz = bi_temp(b->shader);
3113    *face = bi_temp(b->shader);
3114 
3115    bi_index cx = bi_extract(b, coord, 0), cy = bi_extract(b, coord, 1),
3116             cz = bi_extract(b, coord, 2);
3117 
3118    /* Use a pseudo op on Bifrost due to tuple restrictions */
3119    if (b->shader->arch <= 8) {
3120       bi_cubeface_to(b, maxxyz, *face, cx, cy, cz);
3121    } else {
3122       bi_cubeface1_to(b, maxxyz, cx, cy, cz);
3123       bi_cubeface2_v9_to(b, *face, cx, cy, cz);
3124    }
3125 
3126    /* Select coordinates */
3127    bi_index ssel =
3128       bi_cube_ssel(b, bi_extract(b, coord, 2), bi_extract(b, coord, 0), *face);
3129    bi_index tsel =
3130       bi_cube_tsel(b, bi_extract(b, coord, 1), bi_extract(b, coord, 2), *face);
3131 
3132    /* The OpenGL ES specification requires us to transform an input vector
3133     * (x, y, z) to the coordinate, given the selected S/T:
3134     *
3135     * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1))
3136     *
3137     * We implement (s shown, t similar) in a form friendlier to FMA
3138     * instructions, and clamp coordinates at the end for correct
3139     * NaN/infinity handling:
3140     *
3141     * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5)
3142     *
3143     * Take the reciprocal of max{x, y, z}
3144     */
3145    bi_index rcp = bi_frcp_f32(b, maxxyz);
3146 
3147    /* Calculate 0.5 * (1.0 / max{x, y, z}) */
3148    bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero());
3149 
3150    /* Transform the coordinates */
3151    *s = bi_temp(b->shader);
3152    *t = bi_temp(b->shader);
3153 
3154    bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f));
3155    bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f));
3156 
3157    S->clamp = BI_CLAMP_CLAMP_0_1;
3158    T->clamp = BI_CLAMP_CLAMP_0_1;
3159 }
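
/*
 * As a quick numeric check of the rewrite above (made-up values): with s = 0.3
 * and max{x, y, z} = 1.0, the spec form gives 1/2 * ((0.3 / 1.0) + 1) = 0.65
 * and the FMA form gives fsat(0.3 * (0.5 * 1.0) + 0.5) = 0.65; at the face
 * edge, where s = max{x, y, z}, both evaluate to 1.0.
 */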
3160 
3161 /* Emits a cube map descriptor, returning lower 32-bits and putting upper
3162  * 32-bits in passed pointer t. The packing of the face with the S coordinate
3163  * exploits the redundancy of floating points with the range restriction of
3164  * CUBEFACE output.
3165  *
3166  *     struct cube_map_descriptor {
3167  *         float s : 29;
3168  *         unsigned face : 3;
3169  *         float t : 32;
3170  *     }
3171  *
3172  * Since the cube face index is preshifted, this is easy to pack with a bitwise
3173  * MUX.i32 and a fixed mask, selecting the lower 29 bits from s and the upper 3
3174  * bits from face.
3175  */
3176 
3177 static bi_index
3178 bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t)
3179 {
3180    bi_index face, s;
3181    bi_emit_cube_coord(b, coord, &face, &s, t);
3182    bi_index mask = bi_imm_u32(BITFIELD_MASK(29));
3183    return bi_mux_i32(b, s, face, mask, BI_MUX_BIT);
3184 }
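
/*
 * Worked example of the packing above (values invented for illustration,
 * assuming the face index occupies the top three bits as in the struct):
 * with s = 0.65f (IEEE-754 0x3F266666) and a preshifted face index of 2
 * (0x40000000), keeping the low 29 bits of s and the top 3 bits of the face
 * gives 0x1F266666 | 0x40000000 = 0x5F266666 for the lower descriptor word,
 * while t passes through unmodified as the upper word.
 */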
3185 
3186 /* Map to the main texture op used. Some of these (txd in particular) will
3187  * lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in
3188  * sequence). We assume that lowering is handled elsewhere.
3189  */
3190 
3191 static enum bifrost_tex_op
3192 bi_tex_op(nir_texop op)
3193 {
3194    switch (op) {
3195    case nir_texop_tex:
3196    case nir_texop_txb:
3197    case nir_texop_txl:
3198    case nir_texop_txd:
3199       return BIFROST_TEX_OP_TEX;
3200    case nir_texop_txf:
3201    case nir_texop_txf_ms:
3202    case nir_texop_tg4:
3203       return BIFROST_TEX_OP_FETCH;
3204    case nir_texop_txs:
3205    case nir_texop_lod:
3206    case nir_texop_query_levels:
3207    case nir_texop_texture_samples:
3208    case nir_texop_samples_identical:
3209       unreachable("should've been lowered");
3210    default:
3211       unreachable("unsupported tex op");
3212    }
3213 }
3214 
3215 /* Data registers required by texturing in the order they appear. All are
3216  * optional, the texture operation descriptor determines which are present.
3217  * Note that since 3D arrays are not permitted at an API level, Z_COORD and
3218  * ARRAY/SHADOW are exclusive, so TEXC in practice reads at most 8 registers */
3219 
3220 enum bifrost_tex_dreg {
3221    BIFROST_TEX_DREG_Z_COORD = 0,
3222    BIFROST_TEX_DREG_Y_DELTAS = 1,
3223    BIFROST_TEX_DREG_LOD = 2,
3224    BIFROST_TEX_DREG_GRDESC_HI = 3,
3225    BIFROST_TEX_DREG_SHADOW = 4,
3226    BIFROST_TEX_DREG_ARRAY = 5,
3227    BIFROST_TEX_DREG_OFFSETMS = 6,
3228    BIFROST_TEX_DREG_SAMPLER = 7,
3229    BIFROST_TEX_DREG_TEXTURE = 8,
3230    BIFROST_TEX_DREG_COUNT,
3231 };
3232 
3233 static void
3234 bi_emit_texc(bi_builder *b, nir_tex_instr *instr)
3235 {
3236    struct bifrost_texture_operation desc = {
3237       .op = bi_tex_op(instr->op),
3238       .offset_or_bias_disable = false, /* TODO */
3239       .shadow_or_clamp_disable = instr->is_shadow,
3240       .array = instr->is_array,
3241       .dimension = bifrost_tex_format(instr->sampler_dim),
3242       .format = bi_texture_format(instr->dest_type | instr->def.bit_size,
3243                                   BI_CLAMP_NONE), /* TODO */
3244       .mask = 0xF,
3245    };
3246 
3247    switch (desc.op) {
3248    case BIFROST_TEX_OP_TEX:
3249       desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE;
3250       break;
3251    case BIFROST_TEX_OP_FETCH:
3252       desc.lod_or_fetch = (enum bifrost_lod_mode)(
3253          instr->op == nir_texop_tg4
3254             ? BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component
3255             : BIFROST_TEXTURE_FETCH_TEXEL);
3256       break;
3257    default:
3258       unreachable("texture op unsupported");
3259    }
3260 
3261    /* 32-bit indices to be allocated as consecutive staging registers */
3262    bi_index dregs[BIFROST_TEX_DREG_COUNT] = {};
3263    bi_index cx = bi_null(), cy = bi_null();
3264 
3265    for (unsigned i = 0; i < instr->num_srcs; ++i) {
3266       bi_index index = bi_src_index(&instr->src[i].src);
3267       unsigned sz = nir_src_bit_size(instr->src[i].src);
3268       unsigned components = nir_src_num_components(instr->src[i].src);
3269       ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i);
3270       nir_alu_type T = base | sz;
3271 
3272       switch (instr->src[i].src_type) {
3273       case nir_tex_src_coord:
3274          if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3275             cx = bi_emit_texc_cube_coord(b, index, &cy);
3276          } else {
3277             /* Copy XY (for 2D+) or XX (for 1D) */
3278             cx = bi_extract(b, index, 0);
3279             cy = bi_extract(b, index, MIN2(1, components - 1));
3280 
3281             assert(components >= 1 && components <= 3);
3282 
3283             if (components == 3 && !desc.array) {
3284                /* 3D */
3285                dregs[BIFROST_TEX_DREG_Z_COORD] = bi_extract(b, index, 2);
3286             }
3287          }
3288 
3289          if (desc.array) {
3290             dregs[BIFROST_TEX_DREG_ARRAY] = bi_emit_texc_array_index(
3291                b, bi_extract(b, index, components - 1), T);
3292          }
3293 
3294          break;
3295 
3296       case nir_tex_src_lod:
3297          if (desc.op == BIFROST_TEX_OP_TEX &&
3298              nir_src_is_const(instr->src[i].src) &&
3299              nir_src_as_uint(instr->src[i].src) == 0) {
3300             desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO;
3301          } else if (desc.op == BIFROST_TEX_OP_TEX) {
3302             assert(base == nir_type_float);
3303 
3304             assert(sz == 16 || sz == 32);
3305             dregs[BIFROST_TEX_DREG_LOD] =
3306                bi_emit_texc_lod_88(b, index, sz == 16);
3307             desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT;
3308          } else {
3309             assert(desc.op == BIFROST_TEX_OP_FETCH);
3310             assert(base == nir_type_uint || base == nir_type_int);
3311             assert(sz == 16 || sz == 32);
3312 
3313             dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_cube(b, index);
3314          }
3315 
3316          break;
3317 
3318       case nir_tex_src_bias:
3319          /* Upper 16-bits interpreted as a clamp, leave zero */
3320          assert(desc.op == BIFROST_TEX_OP_TEX);
3321          assert(base == nir_type_float);
3322          assert(sz == 16 || sz == 32);
3323          dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16);
3324          desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS;
3325          break;
3326 
3327       case nir_tex_src_ms_index:
3328       case nir_tex_src_offset:
3329          if (desc.offset_or_bias_disable)
3330             break;
3331 
3332          dregs[BIFROST_TEX_DREG_OFFSETMS] =
3333             bi_emit_texc_offset_ms_index(b, instr);
3334          if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero()))
3335             desc.offset_or_bias_disable = true;
3336          break;
3337 
3338       case nir_tex_src_comparator:
3339          dregs[BIFROST_TEX_DREG_SHADOW] = index;
3340          break;
3341 
3342       case nir_tex_src_texture_offset:
3343          dregs[BIFROST_TEX_DREG_TEXTURE] = index;
3344          break;
3345 
3346       case nir_tex_src_sampler_offset:
3347          dregs[BIFROST_TEX_DREG_SAMPLER] = index;
3348          break;
3349 
3350       default:
3351          unreachable("Unhandled src type in texc emit");
3352       }
3353    }
3354 
3355    if (desc.op == BIFROST_TEX_OP_FETCH &&
3356        bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) {
3357       dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_cube(b, bi_zero());
3358    }
3359 
3360    /* Choose an index mode */
3361 
3362    bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]);
3363    bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]);
3364    bool direct = direct_tex && direct_samp;
3365 
3366    desc.immediate_indices = direct && (instr->sampler_index < 16);
3367 
3368    if (desc.immediate_indices) {
3369       desc.sampler_index_or_mode = instr->sampler_index;
3370       desc.index = instr->texture_index;
3371    } else {
3372       unsigned mode = 0;
3373 
3374       if (direct && instr->sampler_index == instr->texture_index) {
3375          mode = BIFROST_INDEX_IMMEDIATE_SHARED;
3376          desc.index = instr->texture_index;
3377       } else if (direct) {
3378          mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
3379          desc.index = instr->sampler_index;
3380          dregs[BIFROST_TEX_DREG_TEXTURE] =
3381             bi_mov_i32(b, bi_imm_u32(instr->texture_index));
3382       } else if (direct_tex) {
3383          assert(!direct_samp);
3384          mode = BIFROST_INDEX_IMMEDIATE_TEXTURE;
3385          desc.index = instr->texture_index;
3386       } else if (direct_samp) {
3387          assert(!direct_tex);
3388          mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
3389          desc.index = instr->sampler_index;
3390       } else {
3391          mode = BIFROST_INDEX_REGISTER;
3392       }
3393 
3394       mode |= (BIFROST_TEXTURE_OPERATION_SINGLE << 2);
3395       desc.sampler_index_or_mode = mode;
3396    }
3397 
3398    /* Allocate staging registers contiguously by compacting the array. */
3399    unsigned sr_count = 0;
3400 
3401    for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) {
3402       if (!bi_is_null(dregs[i]))
3403          dregs[sr_count++] = dregs[i];
3404    }
3405 
3406    unsigned res_size = instr->def.bit_size == 16 ? 2 : 4;
3407 
3408    bi_index sr = sr_count ? bi_temp(b->shader) : bi_null();
3409    bi_index dst = bi_temp(b->shader);
3410 
3411    if (sr_count)
3412       bi_emit_collect_to(b, sr, dregs, sr_count);
3413 
3414    uint32_t desc_u = 0;
3415    memcpy(&desc_u, &desc, sizeof(desc_u));
3416    bi_instr *I =
3417       bi_texc_to(b, dst, sr, cx, cy, bi_imm_u32(desc_u),
3418                  !nir_tex_instr_has_implicit_derivative(instr), sr_count, 0);
3419    I->register_format = bi_reg_fmt_for_nir(instr->dest_type);
3420 
3421    bi_index w[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
3422    bi_emit_split_i32(b, w, dst, res_size);
3423    bi_emit_collect_to(b, bi_def_index(&instr->def), w,
3424                       DIV_ROUND_UP(instr->def.num_components * res_size, 4));
3425 }
3426 
3427 /* Staging registers required by texturing in the order they appear (Valhall) */
3428 
3429 enum valhall_tex_sreg {
3430    VALHALL_TEX_SREG_X_COORD = 0,
3431    VALHALL_TEX_SREG_Y_COORD = 1,
3432    VALHALL_TEX_SREG_Z_COORD = 2,
3433    VALHALL_TEX_SREG_Y_DELTAS = 3,
3434    VALHALL_TEX_SREG_ARRAY = 4,
3435    VALHALL_TEX_SREG_SHADOW = 5,
3436    VALHALL_TEX_SREG_OFFSETMS = 6,
3437    VALHALL_TEX_SREG_LOD = 7,
3438    VALHALL_TEX_SREG_GRDESC = 8,
3439    VALHALL_TEX_SREG_COUNT,
3440 };
3441 
3442 static void
3443 bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr)
3444 {
3445    bool explicit_offset = false;
3446    enum bi_va_lod_mode lod_mode = BI_VA_LOD_MODE_COMPUTED_LOD;
3447 
3448    bool has_lod_mode = (instr->op == nir_texop_tex) ||
3449                        (instr->op == nir_texop_txl) ||
3450                        (instr->op == nir_texop_txb);
3451 
3452    /* 32-bit indices to be allocated as consecutive staging registers */
3453    bi_index sregs[VALHALL_TEX_SREG_COUNT] = {};
3454    bi_index sampler = bi_imm_u32(instr->sampler_index);
3455    bi_index texture = bi_imm_u32(instr->texture_index);
3456    uint32_t tables = (PAN_TABLE_SAMPLER << 11) | (PAN_TABLE_TEXTURE << 27);
3457 
3458    for (unsigned i = 0; i < instr->num_srcs; ++i) {
3459       bi_index index = bi_src_index(&instr->src[i].src);
3460       unsigned sz = nir_src_bit_size(instr->src[i].src);
3461       unsigned components = nir_src_num_components(instr->src[i].src);
3462 
3463       switch (instr->src[i].src_type) {
3464       case nir_tex_src_coord:
3465          if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3466             sregs[VALHALL_TEX_SREG_X_COORD] = bi_emit_texc_cube_coord(
3467                b, index, &sregs[VALHALL_TEX_SREG_Y_COORD]);
3468          } else {
3469             assert(components >= 1 && components <= 3);
3470 
3471             /* Copy XY (for 2D+) or XX (for 1D) */
3472             sregs[VALHALL_TEX_SREG_X_COORD] = index;
3473 
3474             if (components >= 2)
3475                sregs[VALHALL_TEX_SREG_Y_COORD] = bi_extract(b, index, 1);
3476 
3477             if (components == 3 && !instr->is_array) {
3478                sregs[VALHALL_TEX_SREG_Z_COORD] = bi_extract(b, index, 2);
3479             }
3480          }
3481 
3482          if (instr->is_array) {
3483             sregs[VALHALL_TEX_SREG_ARRAY] =
3484                bi_extract(b, index, components - 1);
3485          }
3486 
3487          break;
3488 
3489       case nir_tex_src_lod:
3490          if (nir_src_is_const(instr->src[i].src) &&
3491              nir_src_as_uint(instr->src[i].src) == 0) {
3492             lod_mode = BI_VA_LOD_MODE_ZERO_LOD;
3493          } else if (has_lod_mode) {
3494             lod_mode = BI_VA_LOD_MODE_EXPLICIT;
3495 
3496             assert(sz == 16 || sz == 32);
3497             sregs[VALHALL_TEX_SREG_LOD] =
3498                bi_emit_texc_lod_88(b, index, sz == 16);
3499          }
3500          break;
3501 
3502       case nir_tex_src_bias:
3503          /* Upper 16-bits interpreted as a clamp, leave zero */
3504          assert(sz == 16 || sz == 32);
3505          sregs[VALHALL_TEX_SREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16);
3506 
3507          lod_mode = BI_VA_LOD_MODE_COMPUTED_BIAS;
3508          break;
3509       case nir_tex_src_ms_index:
3510       case nir_tex_src_offset:
3511          /* Handled below */
3512          break;
3513 
3514       case nir_tex_src_comparator:
3515          sregs[VALHALL_TEX_SREG_SHADOW] = index;
3516          break;
3517 
3518       case nir_tex_src_texture_offset:
3519          texture = index;
3520          break;
3521 
3522       case nir_tex_src_sampler_offset:
3523          sampler = index;
3524          break;
3525 
3526       default:
3527          unreachable("Unhandled src type in tex emit");
3528       }
3529    }
3530 
3531    /* Generate packed offset + ms index + LOD register. These default to
3532     * zero so we only need to encode if these features are actually in use.
3533     */
3534    bi_index offsets = bi_emit_valhall_offsets(b, instr);
3535 
3536    if (!bi_is_equiv(offsets, bi_zero())) {
3537       sregs[VALHALL_TEX_SREG_OFFSETMS] = offsets;
3538       explicit_offset = true;
3539    }
3540 
3541    /* Allocate staging registers contiguously by compacting the array. */
3542    unsigned sr_count = 0;
3543 
3544    for (unsigned i = 0; i < ARRAY_SIZE(sregs); ++i) {
3545       if (!bi_is_null(sregs[i]))
3546          sregs[sr_count++] = sregs[i];
3547    }
3548 
3549    bi_index idx = sr_count ? bi_temp(b->shader) : bi_null();
3550 
3551    if (sr_count)
3552       bi_make_vec_to(b, idx, sregs, NULL, sr_count, 32);
3553 
3554    bi_index image_src = bi_imm_u32(tables);
3555    image_src = bi_lshift_or_i32(b, sampler, image_src, bi_imm_u8(0));
3556    image_src = bi_lshift_or_i32(b, texture, image_src, bi_imm_u8(16));
3557 
3558    /* Only write the components that we actually read */
3559    unsigned mask = nir_def_components_read(&instr->def);
3560    unsigned comps_per_reg = instr->def.bit_size == 16 ? 2 : 1;
3561    unsigned res_size = DIV_ROUND_UP(util_bitcount(mask), comps_per_reg);
3562 
3563    enum bi_register_format regfmt = bi_reg_fmt_for_nir(instr->dest_type);
3564    enum bi_dimension dim = valhall_tex_dimension(instr->sampler_dim);
3565    bi_index dest = bi_temp(b->shader);
3566 
3567    switch (instr->op) {
3568    case nir_texop_tex:
3569    case nir_texop_txl:
3570    case nir_texop_txb:
3571       bi_tex_single_to(b, dest, idx, image_src, bi_zero(), instr->is_array, dim,
3572                        regfmt, instr->is_shadow, explicit_offset, lod_mode,
3573                        mask, sr_count);
3574       break;
3575    case nir_texop_txf:
3576    case nir_texop_txf_ms:
3577       bi_tex_fetch_to(b, dest, idx, image_src, bi_zero(), instr->is_array, dim,
3578                       regfmt, explicit_offset, mask, sr_count);
3579       break;
3580    case nir_texop_tg4:
3581       bi_tex_gather_to(b, dest, idx, image_src, bi_zero(), instr->is_array, dim,
3582                        instr->component, false, regfmt, instr->is_shadow,
3583                        explicit_offset, mask, sr_count);
3584       break;
3585    default:
3586       unreachable("Unhandled Valhall texture op");
3587    }
3588 
3589    /* The hardware will write only what we read, and it will write into
3590     * contiguous registers without gaps (different from Bifrost). NIR
3591     * expects the gaps, so fill in the holes (they'll be copypropped and
3592     * DCE'd away later).
3593     */
3594    bi_index unpacked[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
3595 
3596    bi_emit_cached_split_i32(b, dest, res_size);
3597 
3598    /* Index into the packed component array */
3599    unsigned j = 0;
3600    unsigned comps[4] = {0};
3601    unsigned nr_components = instr->def.num_components;
3602 
3603    for (unsigned i = 0; i < nr_components; ++i) {
3604       if (mask & BITFIELD_BIT(i)) {
3605          unpacked[i] = dest;
3606          comps[i] = j++;
3607       } else {
3608          unpacked[i] = bi_zero();
3609       }
3610    }
3611 
3612    bi_make_vec_to(b, bi_def_index(&instr->def), unpacked, comps,
3613                   instr->def.num_components, instr->def.bit_size);
3614 }
3615 
3616 /* Simple texture ops correspond to NIR tex or txl with LOD = 0 on 2D/cube
3617  * textures with sufficiently small immediate indices. Anything else
3618  * needs a complete texture op. */
3619 
3620 static void
3621 bi_emit_texs(bi_builder *b, nir_tex_instr *instr)
3622 {
3623    int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
3624    assert(coord_idx >= 0);
3625    bi_index coords = bi_src_index(&instr->src[coord_idx].src);
3626 
3627    if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3628       bi_index face, s, t;
3629       bi_emit_cube_coord(b, coords, &face, &s, &t);
3630 
3631       bi_texs_cube_to(b, instr->def.bit_size, bi_def_index(&instr->def), s, t,
3632                       face, instr->sampler_index, instr->texture_index);
3633    } else {
3634       bi_texs_2d_to(b, instr->def.bit_size, bi_def_index(&instr->def),
3635                     bi_extract(b, coords, 0), bi_extract(b, coords, 1),
3636                     instr->op != nir_texop_tex, /* zero LOD */
3637                     instr->sampler_index, instr->texture_index);
3638    }
3639 
3640    bi_split_def(b, &instr->def);
3641 }
3642 
3643 static bool
3644 bi_is_simple_tex(nir_tex_instr *instr)
3645 {
3646    if (instr->op != nir_texop_tex && instr->op != nir_texop_txl)
3647       return false;
3648 
3649    if (instr->dest_type != nir_type_float32 &&
3650        instr->dest_type != nir_type_float16)
3651       return false;
3652 
3653    if (instr->is_shadow || instr->is_array)
3654       return false;
3655 
3656    switch (instr->sampler_dim) {
3657    case GLSL_SAMPLER_DIM_2D:
3658    case GLSL_SAMPLER_DIM_EXTERNAL:
3659    case GLSL_SAMPLER_DIM_RECT:
3660       break;
3661 
3662    case GLSL_SAMPLER_DIM_CUBE:
3663       /* LOD can't be specified with TEXS_CUBE */
3664       if (instr->op == nir_texop_txl)
3665          return false;
3666       break;
3667 
3668    default:
3669       return false;
3670    }
3671 
3672    for (unsigned i = 0; i < instr->num_srcs; ++i) {
3673       if (instr->src[i].src_type != nir_tex_src_lod &&
3674           instr->src[i].src_type != nir_tex_src_coord)
3675          return false;
3676    }
3677 
3678    /* Indices need to fit in provided bits */
3679    unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 2 : 3;
3680    if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits))
3681       return false;
3682 
3683    int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
3684    if (lod_idx < 0)
3685       return true;
3686 
3687    nir_src lod = instr->src[lod_idx].src;
3688    return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0;
3689 }
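
/*
 * For instance (hypothetical cases), a plain texture(sampler2D, uv) lookup
 * returning fp32 with texture/sampler index 0 passes every check above and
 * can use TEXS_2D, whereas the same lookup against a shadow sampler, or with
 * an index of 9 (too large for the 3-bit field), falls back to the full TEXC
 * path.
 */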
3690 
3691 static void
3692 bi_emit_tex(bi_builder *b, nir_tex_instr *instr)
3693 {
3694    /* If txf is used, we assume there is a valid sampler bound at index 0. Use
3695     * it for txf operations, since there may be no other valid samplers. This is
3696     * a workaround: txf does not require a sampler in NIR (so sampler_index is
3697     * undefined) but we need one in the hardware. This is ABI with the driver.
3698     */
3699    if (!nir_tex_instr_need_sampler(instr))
3700       instr->sampler_index = 0;
3701 
3702    if (b->shader->arch >= 9)
3703       bi_emit_tex_valhall(b, instr);
3704    else if (bi_is_simple_tex(instr))
3705       bi_emit_texs(b, instr);
3706    else
3707       bi_emit_texc(b, instr);
3708 }
3709 
3710 static void
3711 bi_emit_phi(bi_builder *b, nir_phi_instr *instr)
3712 {
3713    unsigned nr_srcs = exec_list_length(&instr->srcs);
3714    bi_instr *I = bi_phi_to(b, bi_def_index(&instr->def), nr_srcs);
3715 
3716    /* Deferred */
3717    I->phi = instr;
3718 }
3719 
3720 /* Look up the bi_block corresponding to a given NIR block. Used when
3721  * translating phi nodes after emitting all blocks.
3722  */
3723 static bi_block *
3724 bi_from_nir_block(bi_context *ctx, nir_block *block)
3725 {
3726    return ctx->indexed_nir_blocks[block->index];
3727 }
3728 
3729 static void
3730 bi_emit_phi_deferred(bi_context *ctx, bi_block *block, bi_instr *I)
3731 {
3732    nir_phi_instr *phi = I->phi;
3733 
3734    /* Guaranteed by lower_phis_to_scalar */
3735    assert(phi->def.num_components == 1);
3736 
3737    nir_foreach_phi_src(src, phi) {
3738       bi_block *pred = bi_from_nir_block(ctx, src->pred);
3739       unsigned i = bi_predecessor_index(block, pred);
3740       assert(i < I->nr_srcs);
3741 
3742       I->src[i] = bi_src_index(&src->src);
3743    }
3744 
3745    I->phi = NULL;
3746 }
3747 
3748 static void
3749 bi_emit_phis_deferred(bi_context *ctx)
3750 {
3751    bi_foreach_block(ctx, block) {
3752       bi_foreach_instr_in_block(block, I) {
3753          if (I->op == BI_OPCODE_PHI)
3754             bi_emit_phi_deferred(ctx, block, I);
3755       }
3756    }
3757 }
3758 
3759 static void
3760 bi_emit_instr(bi_builder *b, struct nir_instr *instr)
3761 {
3762    switch (instr->type) {
3763    case nir_instr_type_load_const:
3764       bi_emit_load_const(b, nir_instr_as_load_const(instr));
3765       break;
3766 
3767    case nir_instr_type_intrinsic:
3768       bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
3769       break;
3770 
3771    case nir_instr_type_alu:
3772       bi_emit_alu(b, nir_instr_as_alu(instr));
3773       break;
3774 
3775    case nir_instr_type_tex:
3776       bi_emit_tex(b, nir_instr_as_tex(instr));
3777       break;
3778 
3779    case nir_instr_type_jump:
3780       bi_emit_jump(b, nir_instr_as_jump(instr));
3781       break;
3782 
3783    case nir_instr_type_phi:
3784       bi_emit_phi(b, nir_instr_as_phi(instr));
3785       break;
3786 
3787    default:
3788       unreachable("should've been lowered");
3789    }
3790 }
3791 
3792 static bi_block *
3793 create_empty_block(bi_context *ctx)
3794 {
3795    bi_block *blk = rzalloc(ctx, bi_block);
3796 
3797    util_dynarray_init(&blk->predecessors, blk);
3798 
3799    return blk;
3800 }
3801 
3802 static bi_block *
3803 emit_block(bi_context *ctx, nir_block *block)
3804 {
3805    if (ctx->after_block) {
3806       ctx->current_block = ctx->after_block;
3807       ctx->after_block = NULL;
3808    } else {
3809       ctx->current_block = create_empty_block(ctx);
3810    }
3811 
3812    list_addtail(&ctx->current_block->link, &ctx->blocks);
3813    list_inithead(&ctx->current_block->instructions);
3814 
3815    bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
3816 
3817    ctx->indexed_nir_blocks[block->index] = ctx->current_block;
3818 
3819    nir_foreach_instr(instr, block) {
3820       bi_emit_instr(&_b, instr);
3821    }
3822 
3823    return ctx->current_block;
3824 }
3825 
3826 static void
3827 emit_if(bi_context *ctx, nir_if *nif)
3828 {
3829    bi_block *before_block = ctx->current_block;
3830 
3831    /* Speculatively emit the branch, but we can't fill it in until later */
3832    bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
3833    bi_instr *then_branch =
3834       bi_branchz_i16(&_b, bi_half(bi_src_index(&nif->condition), false),
3835                      bi_zero(), BI_CMPF_EQ);
3836 
3837    /* Emit the two subblocks. */
3838    bi_block *then_block = emit_cf_list(ctx, &nif->then_list);
3839    bi_block *end_then_block = ctx->current_block;
3840 
3841    /* Emit second block */
3842 
3843    bi_block *else_block = emit_cf_list(ctx, &nif->else_list);
3844    bi_block *end_else_block = ctx->current_block;
3845    ctx->after_block = create_empty_block(ctx);
3846 
3847    /* Now that we have the subblocks emitted, fix up the branches */
3848 
3849    assert(then_block);
3850    assert(else_block);
3851 
3852    then_branch->branch_target = else_block;
3853 
3854    /* Emit a jump from the end of the then block to the end of the else */
3855    _b.cursor = bi_after_block(end_then_block);
3856    bi_instr *then_exit = bi_jump(&_b, bi_zero());
3857    then_exit->branch_target = ctx->after_block;
3858 
3859    bi_block_add_successor(end_then_block, then_exit->branch_target);
3860    bi_block_add_successor(end_else_block, ctx->after_block); /* fallthrough */
3861 
3862    bi_block_add_successor(before_block,
3863                           then_branch->branch_target); /* then_branch */
3864    bi_block_add_successor(before_block, then_block);   /* fallthrough */
3865 }
3866 
3867 static void
3868 emit_loop(bi_context *ctx, nir_loop *nloop)
3869 {
3870    assert(!nir_loop_has_continue_construct(nloop));
3871 
3872    /* Remember where we are */
3873    bi_block *start_block = ctx->current_block;
3874 
3875    bi_block *saved_break = ctx->break_block;
3876    bi_block *saved_continue = ctx->continue_block;
3877 
3878    ctx->continue_block = create_empty_block(ctx);
3879    ctx->break_block = create_empty_block(ctx);
3880    ctx->after_block = ctx->continue_block;
3881 
3882    /* Emit the body itself */
3883    emit_cf_list(ctx, &nloop->body);
3884 
3885    /* Branch back to the loop header */
3886    bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
3887    bi_instr *I = bi_jump(&_b, bi_zero());
3888    I->branch_target = ctx->continue_block;
3889    bi_block_add_successor(start_block, ctx->continue_block);
3890    bi_block_add_successor(ctx->current_block, ctx->continue_block);
3891 
3892    ctx->after_block = ctx->break_block;
3893 
3894    /* Pop off */
3895    ctx->break_block = saved_break;
3896    ctx->continue_block = saved_continue;
3897    ++ctx->loop_count;
3898 }
3899 
3900 static bi_block *
3901 emit_cf_list(bi_context *ctx, struct exec_list *list)
3902 {
3903    bi_block *start_block = NULL;
3904 
3905    foreach_list_typed(nir_cf_node, node, node, list) {
3906       switch (node->type) {
3907       case nir_cf_node_block: {
3908          bi_block *block = emit_block(ctx, nir_cf_node_as_block(node));
3909 
3910          if (!start_block)
3911             start_block = block;
3912 
3913          break;
3914       }
3915 
3916       case nir_cf_node_if:
3917          emit_if(ctx, nir_cf_node_as_if(node));
3918          break;
3919 
3920       case nir_cf_node_loop:
3921          emit_loop(ctx, nir_cf_node_as_loop(node));
3922          break;
3923 
3924       default:
3925          unreachable("Unknown control flow");
3926       }
3927    }
3928 
3929    return start_block;
3930 }
3931 
3932 /* shader-db stuff */
3933 
3934 struct bi_stats {
3935    unsigned nr_clauses, nr_tuples, nr_ins;
3936    unsigned nr_arith, nr_texture, nr_varying, nr_ldst;
3937 };
3938 
3939 static void
3940 bi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats)
3941 {
3942    /* Count instructions */
3943    stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 1 : 0);
3944 
3945    /* Non-message passing tuples are always arithmetic */
3946    if (tuple->add != clause->message) {
3947       stats->nr_arith++;
3948       return;
3949    }
3950 
3951    /* Message + FMA we'll count as arithmetic _and_ message */
3952    if (tuple->fma)
3953       stats->nr_arith++;
3954 
3955    switch (clause->message_type) {
3956    case BIFROST_MESSAGE_VARYING:
3957       /* Check components interpolated */
3958       stats->nr_varying +=
3959          (clause->message->vecsize + 1) *
3960          (bi_is_regfmt_16(clause->message->register_format) ? 1 : 2);
3961       break;
3962 
3963    case BIFROST_MESSAGE_VARTEX:
3964       /* 2 coordinates, fp32 each */
3965       stats->nr_varying += (2 * 2);
3966       FALLTHROUGH;
3967    case BIFROST_MESSAGE_TEX:
3968       stats->nr_texture++;
3969       break;
3970 
3971    case BIFROST_MESSAGE_ATTRIBUTE:
3972    case BIFROST_MESSAGE_LOAD:
3973    case BIFROST_MESSAGE_STORE:
3974    case BIFROST_MESSAGE_ATOMIC:
3975       stats->nr_ldst++;
3976       break;
3977 
3978    case BIFROST_MESSAGE_NONE:
3979    case BIFROST_MESSAGE_BARRIER:
3980    case BIFROST_MESSAGE_BLEND:
3981    case BIFROST_MESSAGE_TILE:
3982    case BIFROST_MESSAGE_Z_STENCIL:
3983    case BIFROST_MESSAGE_ATEST:
3984    case BIFROST_MESSAGE_JOB:
3985    case BIFROST_MESSAGE_64BIT:
3986       /* Nothing to do */
3987       break;
3988    };
3989 }
3990 
3991 /*
3992  * v7 allows preloading LD_VAR or VAR_TEX messages that must complete before the
3993  * shader completes. These costs are not accounted for in the general cycle
3994  * counts, so this function calculates the effective cost of these messages, as
3995  * if they were executed by shader code.
3996  */
3997 static unsigned
3998 bi_count_preload_cost(bi_context *ctx)
3999 {
4000    /* Units: 1/16 of a normalized cycle, assuming that we may interpolate
4001     * 16 fp16 varying components per cycle or fetch two texels per cycle.
4002     */
4003    unsigned cost = 0;
4004 
4005    for (unsigned i = 0; i < ARRAY_SIZE(ctx->info.bifrost->messages); ++i) {
4006       struct bifrost_message_preload msg = ctx->info.bifrost->messages[i];
4007 
4008       if (msg.enabled && msg.texture) {
4009          /* 2 coordinate, 2 half-words each, plus texture */
4010          cost += 12;
4011       } else if (msg.enabled) {
4012          cost += (msg.num_components * (msg.fp16 ? 1 : 2));
4013       }
4014    }
4015 
4016    return cost;
4017 }
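
/*
 * For example (made-up preload mix): one VAR_TEX preload costs 12 units, or
 * 12/16 = 0.75 of a normalized cycle, while an LD_VAR preload of a vec4 fp32
 * varying costs 4 * 2 = 8 units, i.e. half a cycle.
 */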
4018 
4019 static const char *
4020 bi_shader_stage_name(bi_context *ctx)
4021 {
4022    if (ctx->idvs == BI_IDVS_VARYING)
4023       return "MESA_SHADER_VARYING";
4024    else if (ctx->idvs == BI_IDVS_POSITION)
4025       return "MESA_SHADER_POSITION";
4026    else if (ctx->inputs->is_blend)
4027       return "MESA_SHADER_BLEND";
4028    else
4029       return gl_shader_stage_name(ctx->stage);
4030 }
4031 
4032 static char *
4033 bi_print_stats(bi_context *ctx, unsigned size)
4034 {
4035    struct bi_stats stats = {0};
4036 
4037    /* Count instructions, clauses, and tuples. Also attempt to construct
4038     * normalized execution engine cycle counts, using the following ratio:
4039     *
4040     * 24 arith tuples/cycle
4041     * 2 texture messages/cycle
4042     * 16 x 16-bit varying channels interpolated/cycle
4043     * 1 load store message/cycle
4044     *
4045     * These numbers seem to match Arm Mobile Studio's heuristic. The real
4046     * cycle counts are surely more complicated.
4047     */
4048 
4049    bi_foreach_block(ctx, block) {
4050       bi_foreach_clause_in_block(block, clause) {
4051          stats.nr_clauses++;
4052          stats.nr_tuples += clause->tuple_count;
4053 
4054          for (unsigned i = 0; i < clause->tuple_count; ++i)
4055             bi_count_tuple_stats(clause, &clause->tuples[i], &stats);
4056       }
4057    }
4058 
4059    float cycles_arith = ((float)stats.nr_arith) / 24.0;
4060    float cycles_texture = ((float)stats.nr_texture) / 2.0;
4061    float cycles_varying = ((float)stats.nr_varying) / 16.0;
4062    float cycles_ldst = ((float)stats.nr_ldst) / 1.0;
4063 
4064    float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst);
4065    float cycles_bound = MAX2(cycles_arith, cycles_message);
4066 
4067    /* Thread count and register pressure are traded off only on v7 */
4068    bool full_threads = (ctx->arch == 7 && ctx->info.work_reg_count <= 32);
4069    unsigned nr_threads = full_threads ? 2 : 1;
4070 
4071    /* Dump stats */
4072    char *str = ralloc_asprintf(
4073       NULL,
4074       "%s shader: "
4075       "%u inst, %u tuples, %u clauses, "
4076       "%f cycles, %f arith, %f texture, %f vary, %f ldst, "
4077       "%u quadwords, %u threads",
4078       bi_shader_stage_name(ctx), stats.nr_ins, stats.nr_tuples,
4079       stats.nr_clauses, cycles_bound, cycles_arith, cycles_texture,
4080       cycles_varying, cycles_ldst, size / 16, nr_threads);
4081 
4082    if (ctx->arch == 7) {
4083       ralloc_asprintf_append(&str, ", %u preloads", bi_count_preload_cost(ctx));
4084    }
4085 
4086    ralloc_asprintf_append(&str, ", %u loops, %u:%u spills:fills",
4087                           ctx->loop_count, ctx->spills, ctx->fills);
4088 
4089    return str;
4090 }
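
/*
 * A made-up example of the heuristic above: a shader with 48 arithmetic
 * tuples, 3 texture messages, 40 16-bit varying channels and 2 load/store
 * messages scores 48/24 = 2.0, 3/2 = 1.5, 40/16 = 2.5 and 2/1 = 2.0 cycles
 * respectively, so the reported bound is max(2.0, max(1.5, 2.5, 2.0)) = 2.5
 * cycles, i.e. varying-interpolation bound.
 */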
4091 
4092 static char *
4093 va_print_stats(bi_context *ctx, unsigned size)
4094 {
4095    unsigned nr_ins = 0;
4096    struct va_stats stats = {0};
4097 
4098    /* Count instructions */
4099    bi_foreach_instr_global(ctx, I) {
4100       nr_ins++;
4101       va_count_instr_stats(I, &stats);
4102    }
4103 
4104    /* Mali G78 peak performance:
4105     *
4106     * 64 FMA instructions per cycle
4107     * 64 CVT instructions per cycle
4108     * 16 SFU instructions per cycle
4109     * 8 x 32-bit varying channels interpolated per cycle
4110     * 4 texture instructions per cycle
4111     * 1 load/store operation per cycle
4112     */
4113 
4114    float cycles_fma = ((float)stats.fma) / 64.0;
4115    float cycles_cvt = ((float)stats.cvt) / 64.0;
4116    float cycles_sfu = ((float)stats.sfu) / 16.0;
4117    float cycles_v = ((float)stats.v) / 16.0;
4118    float cycles_t = ((float)stats.t) / 4.0;
4119    float cycles_ls = ((float)stats.ls) / 1.0;
4120 
4121    /* Calculate the bound */
4122    float cycles = MAX2(MAX3(cycles_fma, cycles_cvt, cycles_sfu),
4123                        MAX3(cycles_v, cycles_t, cycles_ls));
4124 
4125    /* Thread count and register pressure are traded off */
4126    unsigned nr_threads = (ctx->info.work_reg_count <= 32) ? 2 : 1;
4127 
4128    /* Dump stats */
4129    return ralloc_asprintf(NULL,
4130                           "%s shader: "
4131                           "%u inst, %f cycles, %f fma, %f cvt, %f sfu, %f v, "
4132                           "%f t, %f ls, %u quadwords, %u threads, %u loops, "
4133                           "%u:%u spills:fills",
4134                           bi_shader_stage_name(ctx), nr_ins, cycles, cycles_fma,
4135                           cycles_cvt, cycles_sfu, cycles_v, cycles_t, cycles_ls,
4136                           size / 16, nr_threads, ctx->loop_count, ctx->spills,
4137                           ctx->fills);
4138 }
4139 
4140 static int
4141 glsl_type_size(const struct glsl_type *type, bool bindless)
4142 {
4143    return glsl_count_attribute_slots(type, false);
4144 }
4145 
4146 /* Split stores to memory. We don't split stores to vertex outputs, since
4147  * nir_lower_io_to_temporaries will ensure there's only a single write.
4148  */
4149 
4150 static bool
4151 should_split_wrmask(const nir_instr *instr, UNUSED const void *data)
4152 {
4153    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
4154 
4155    switch (intr->intrinsic) {
4156    case nir_intrinsic_store_ssbo:
4157    case nir_intrinsic_store_shared:
4158    case nir_intrinsic_store_global:
4159    case nir_intrinsic_store_scratch:
4160       return true;
4161    default:
4162       return false;
4163    }
4164 }
4165 
4166 /*
4167  * Some operations are only available as 32-bit instructions. 64-bit floats are
4168  * unsupported and ints are lowered with nir_lower_int64.  Certain 8-bit and
4169  * 16-bit instructions, however, are lowered here.
4170  */
4171 static unsigned
4172 bi_lower_bit_size(const nir_instr *instr, UNUSED void *data)
4173 {
4174    if (instr->type != nir_instr_type_alu)
4175       return 0;
4176 
4177    nir_alu_instr *alu = nir_instr_as_alu(instr);
4178 
4179    switch (alu->op) {
4180    case nir_op_fexp2:
4181    case nir_op_flog2:
4182    case nir_op_fpow:
4183    case nir_op_fsin:
4184    case nir_op_fcos:
4185    case nir_op_bit_count:
4186    case nir_op_bitfield_reverse:
4187       return (nir_src_bit_size(alu->src[0].src) == 32) ? 0 : 32;
4188    default:
4189       return 0;
4190    }
4191 }
4192 
4193 /* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4,
4194  * transcendentals are an exception, as are shifts due to the lane size mismatch
4195  * (8-bit in Bifrost, 32-bit in NIR TODO - workaround!). Some conversions need
4196  * to be scalarized due to type size. */
4197 
4198 static uint8_t
4199 bi_vectorize_filter(const nir_instr *instr, const void *data)
4200 {
4201    /* Defaults work for everything else */
4202    if (instr->type != nir_instr_type_alu)
4203       return 0;
4204 
4205    const nir_alu_instr *alu = nir_instr_as_alu(instr);
4206 
4207    switch (alu->op) {
4208    case nir_op_frcp:
4209    case nir_op_frsq:
4210    case nir_op_ishl:
4211    case nir_op_ishr:
4212    case nir_op_ushr:
4213    case nir_op_f2i16:
4214    case nir_op_f2u16:
4215    case nir_op_extract_u8:
4216    case nir_op_extract_i8:
4217    case nir_op_extract_u16:
4218    case nir_op_extract_i16:
4219    case nir_op_insert_u16:
4220       return 1;
4221    default:
4222       break;
4223    }
4224 
4225    /* Vectorized instructions cannot write more than 32-bit */
4226    int dst_bit_size = alu->def.bit_size;
4227    if (dst_bit_size == 16)
4228       return 2;
4229    else
4230       return 1;
4231 }
4232 
4233 static bool
4234 bi_scalarize_filter(const nir_instr *instr, const void *data)
4235 {
4236    if (instr->type != nir_instr_type_alu)
4237       return false;
4238 
4239    const nir_alu_instr *alu = nir_instr_as_alu(instr);
4240 
4241    switch (alu->op) {
4242    case nir_op_pack_uvec2_to_uint:
4243    case nir_op_pack_uvec4_to_uint:
4244       return false;
4245    default:
4246       return true;
4247    }
4248 }
4249 
4250 /* Ensure we write exactly 4 components */
4251 static nir_def *
4252 bifrost_nir_valid_channel(nir_builder *b, nir_def *in, unsigned channel,
4253                           unsigned first, unsigned mask)
4254 {
4255    if (!(mask & BITFIELD_BIT(channel)))
4256       channel = first;
4257 
4258    return nir_channel(b, in, channel);
4259 }
4260 
4261 /* Lower fragment store_output instructions to always write 4 components,
4262  * matching the hardware semantic. This may require additional moves. Skipping
4263  * these moves is possible in theory, but invokes undefined behaviour in the
4264  * compiler. The DDK inserts these moves, so we will as well. */
4265 
4266 static bool
4267 bifrost_nir_lower_blend_components(struct nir_builder *b,
4268                                    nir_intrinsic_instr *intr, void *data)
4269 {
4270    if (intr->intrinsic != nir_intrinsic_store_output)
4271       return false;
4272 
4273    nir_def *in = intr->src[0].ssa;
4274    unsigned first = nir_intrinsic_component(intr);
4275    unsigned mask = nir_intrinsic_write_mask(intr);
4276 
4277    assert(first == 0 && "shouldn't get nonzero components");
4278 
4279    /* Nothing to do */
4280    if (mask == BITFIELD_MASK(4))
4281       return false;
4282 
4283    b->cursor = nir_before_instr(&intr->instr);
4284 
4285    /* Replicate the first valid component instead */
4286    nir_def *replicated =
4287       nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask),
4288                bifrost_nir_valid_channel(b, in, 1, first, mask),
4289                bifrost_nir_valid_channel(b, in, 2, first, mask),
4290                bifrost_nir_valid_channel(b, in, 3, first, mask));
4291 
4292    /* Rewrite to use our replicated version */
4293    nir_src_rewrite(&intr->src[0], replicated);
4294    nir_intrinsic_set_component(intr, 0);
4295    nir_intrinsic_set_write_mask(intr, 0xF);
4296    intr->num_components = 4;
4297 
4298    return true;
4299 }
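
/*
 * For example, a fragment store_output writing only .xy (write mask 0x3) is
 * rewritten above into a full vec4(x, y, x, x) store with write mask 0xF; the
 * replicated lanes exist only to satisfy the hardware's always-write-four
 * semantic.
 */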
4300 
4301 static nir_mem_access_size_align
4302 mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
4303                          uint8_t input_bit_size, uint32_t align,
4304                          uint32_t align_offset, bool offset_is_const,
4305                          const void *cb_data)
4306 {
4307    align = nir_combined_align(align, align_offset);
4308    assert(util_is_power_of_two_nonzero(align));
4309 
4310    /* If the number of bytes is a multiple of 4, use 32-bit loads. Else if it's
4311     * a multiple of 2, use 16-bit loads. Else use 8-bit loads.
4312     */
4313    unsigned bit_size = (bytes & 1) ? 8 : (bytes & 2) ? 16 : 32;
4314 
4315    /* But if we're only aligned to 1 byte, use 8-bit loads. If we're only
4316     * aligned to 2 bytes, use 16-bit loads, unless we needed 8-bit loads due to
4317     * the size.
4318     */
4319    if (align == 1)
4320       bit_size = 8;
4321    else if (align == 2)
4322       bit_size = MIN2(bit_size, 16);
4323 
4324    return (nir_mem_access_size_align){
4325       .num_components = MIN2(bytes / (bit_size / 8), 4),
4326       .bit_size = bit_size,
4327       .align = bit_size / 8,
4328    };
4329 }
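
/*
 * A few sample decisions from the callback above (hypothetical accesses):
 * 12 bytes at 4-byte alignment becomes a 3 x 32-bit access; 6 bytes at 4-byte
 * alignment becomes 3 x 16-bit, since the size is not a multiple of 4; and
 * 8 bytes known only to be byte-aligned becomes 4 x 8-bit, with the lowering
 * pass emitting further accesses for whatever remains.
 */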
4330 
4331 static void
4332 bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
4333 {
4334    bool progress;
4335 
4336    do {
4337       progress = false;
4338 
4339       NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
4340       NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL);
4341 
4342       NIR_PASS(progress, nir, nir_copy_prop);
4343       NIR_PASS(progress, nir, nir_opt_remove_phis);
4344       NIR_PASS(progress, nir, nir_opt_dce);
4345       NIR_PASS(progress, nir, nir_opt_dead_cf);
4346       NIR_PASS(progress, nir, nir_opt_cse);
4347       NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
4348       NIR_PASS(progress, nir, nir_opt_algebraic);
4349       NIR_PASS(progress, nir, nir_opt_constant_folding);
4350 
4351       NIR_PASS(progress, nir, nir_opt_undef);
4352       NIR_PASS(progress, nir, nir_lower_undef_to_zero);
4353 
4354       NIR_PASS(progress, nir, nir_opt_shrink_vectors);
4355       NIR_PASS(progress, nir, nir_opt_loop_unroll);
4356    } while (progress);
4357 
4358    /* TODO: Why is 64-bit getting rematerialized?
4359     * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */
4360    NIR_PASS(progress, nir, nir_lower_int64);
4361 
4362    /* We need to clean up after each iteration of late algebraic
4363     * optimizations, since otherwise NIR can produce weird edge cases
4364     * (like fneg of a constant) which we don't handle */
4365    bool late_algebraic = true;
4366    while (late_algebraic) {
4367       late_algebraic = false;
4368       NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
4369       NIR_PASS(progress, nir, nir_opt_constant_folding);
4370       NIR_PASS(progress, nir, nir_copy_prop);
4371       NIR_PASS(progress, nir, nir_opt_dce);
4372       NIR_PASS(progress, nir, nir_opt_cse);
4373    }
4374 
4375    /* This opt currently helps on Bifrost but not Valhall */
4376    if (gpu_id < 0x9000)
4377       NIR_PASS(progress, nir, bifrost_nir_opt_boolean_bitwise);
4378 
4379    NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
4380    NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL);
4381    NIR_PASS(progress, nir, nir_lower_bool_to_bitsize);
4382 
4383    /* Prepass to simplify instruction selection */
4384    late_algebraic = false;
4385    NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late);
4386 
4387    while (late_algebraic) {
4388       late_algebraic = false;
4389       NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
4390       NIR_PASS(progress, nir, nir_opt_constant_folding);
4391       NIR_PASS(progress, nir, nir_copy_prop);
4392       NIR_PASS(progress, nir, nir_opt_dce);
4393       NIR_PASS(progress, nir, nir_opt_cse);
4394    }
4395 
4396    NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
4397    NIR_PASS(progress, nir, nir_opt_dce);
4398 
4399    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
4400       NIR_PASS_V(nir, nir_shader_intrinsics_pass,
4401                  bifrost_nir_lower_blend_components,
4402                  nir_metadata_block_index | nir_metadata_dominance, NULL);
4403    }
4404 
4405    /* Backend scheduler is purely local, so do some global optimizations
4406     * to reduce register pressure. */
4407    nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
4408                                nir_move_load_input | nir_move_comparisons |
4409                                nir_move_copies | nir_move_load_ssbo;
4410 
4411    NIR_PASS_V(nir, nir_opt_sink, move_all);
4412    NIR_PASS_V(nir, nir_opt_move, move_all);
4413 
4414    /* We might lower attribute, varying, and image indirects. Use the
4415     * gathered info to skip the extra analysis in the happy path. */
4416    bool any_indirects = nir->info.inputs_read_indirectly ||
4417                         nir->info.outputs_accessed_indirectly ||
4418                         nir->info.patch_inputs_read_indirectly ||
4419                         nir->info.patch_outputs_accessed_indirectly ||
4420                         nir->info.images_used[0];
4421 
4422    if (any_indirects) {
4423       nir_convert_to_lcssa(nir, true, true);
4424       NIR_PASS_V(nir, nir_divergence_analysis);
4425       NIR_PASS_V(nir, bi_lower_divergent_indirects,
4426                  pan_subgroup_size(gpu_id >> 12));
4427    }
4428 }
4429 
4430 static void
4431 bi_opt_post_ra(bi_context *ctx)
4432 {
4433    bi_foreach_instr_global_safe(ctx, ins) {
4434       if (ins->op == BI_OPCODE_MOV_I32 &&
4435           bi_is_equiv(ins->dest[0], ins->src[0]))
4436          bi_remove_instruction(ins);
4437    }
4438 }
4439 
4440 /* Dead code elimination for branches at the end of a block - only one branch
4441  * per block is legal semantically, but unreachable jumps can be generated.
4442  * Likewise on Bifrost we can generate jumps to the terminal block which need
4443  * to be lowered away to a jump to #0x0, which induces successful termination.
4444  * That trick doesn't work on Valhall, which needs a NOP inserted in the
4445  * terminal block instead.
4446  */
4447 static void
4448 bi_lower_branch(bi_context *ctx, bi_block *block)
4449 {
4450    bool cull_terminal = (ctx->arch <= 8);
4451    bool branched = false;
4452 
4453    bi_foreach_instr_in_block_safe(block, ins) {
4454       if (!ins->branch_target)
4455          continue;
4456 
4457       if (branched) {
4458          bi_remove_instruction(ins);
4459          continue;
4460       }
4461 
4462       branched = true;
4463 
4464       if (!bi_is_terminal_block(ins->branch_target))
4465          continue;
4466 
4467       if (cull_terminal)
4468          ins->branch_target = NULL;
4469       else if (ins->branch_target)
4470          ins->branch_target->needs_nop = true;
4471    }
4472 }
4473 
4474 static void
4475 bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary, unsigned offset)
4476 {
4477    unsigned final_clause = bi_pack(ctx, binary);
4478 
4479    /* If we need to wait for ATEST or BLEND in the first clause, pass the
4480     * corresponding bits through to the renderer state descriptor */
4481    bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
4482    bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL);
4483 
4484    unsigned first_deps = first_clause ? first_clause->dependencies : 0;
4485    ctx->info.bifrost->wait_6 = (first_deps & (1 << 6));
4486    ctx->info.bifrost->wait_7 = (first_deps & (1 << 7));
4487 
4488    /* Pad the shader with enough zero bytes to trick the prefetcher,
4489     * unless we're compiling an empty shader (in which case we don't pad
4490     * so the size remains 0) */
4491    unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause;
4492 
4493    if (binary->size - offset) {
4494       memset(util_dynarray_grow(binary, uint8_t, prefetch_size), 0,
4495              prefetch_size);
4496    }
4497 }
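/* Worked example for the padding above (illustrative sizes): with
 * BIFROST_SHADER_PREFETCH = 128, if the final clause packs to 48 bytes, then
 * 128 - 48 = 80 zero bytes are appended, so the prefetcher only ever reads
 * valid instructions or zeros past the end of the shader.
 */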
4498 
4499 /*
4500  * Build a bit mask of varyings (by location) that are flatshaded. This
4501  * information is needed by lower_mediump_io, as we don't yet support 16-bit
4502  * flat varyings.
4503  *
4504  * Also varyings that are used as texture coordinates should be kept at fp32 so
4505  * the texture instruction may be promoted to VAR_TEX. In general this is a good
4506  * idea, as fp16 texture coordinates are not supported by the hardware and are
4507  * usually inappropriate. (There are both relevant CTS bugs here, even.)
4508  *
4509  * TODO: If we compacted the varyings with some fixup code in the vertex shader,
4510  * we could implement 16-bit flat varyings. Consider if this case matters.
4511  *
4512  * TODO: The texture coordinate handling could be less heavyhanded.
4513  */
4514 static bool
4515 bi_gather_texcoords(nir_builder *b, nir_instr *instr, void *data)
4516 {
4517    uint64_t *mask = data;
4518 
4519    if (instr->type != nir_instr_type_tex)
4520       return false;
4521 
4522    nir_tex_instr *tex = nir_instr_as_tex(instr);
4523 
4524    int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
4525    if (coord_idx < 0)
4526       return false;
4527 
4528    nir_src src = tex->src[coord_idx].src;
4529    nir_scalar x = nir_scalar_resolved(src.ssa, 0);
4530    nir_scalar y = nir_scalar_resolved(src.ssa, 1);
4531 
4532    if (x.def != y.def)
4533       return false;
4534 
4535    nir_instr *parent = x.def->parent_instr;
4536 
4537    if (parent->type != nir_instr_type_intrinsic)
4538       return false;
4539 
4540    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
4541 
4542    if (intr->intrinsic != nir_intrinsic_load_interpolated_input)
4543       return false;
4544 
4545    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
4546    *mask |= BITFIELD64_BIT(sem.location);
4547    return false;
4548 }
4549 
4550 static uint64_t
4551 bi_fp32_varying_mask(nir_shader *nir)
4552 {
4553    uint64_t mask = 0;
4554 
4555    assert(nir->info.stage == MESA_SHADER_FRAGMENT);
4556 
4557    nir_foreach_shader_in_variable(var, nir) {
4558       if (var->data.interpolation == INTERP_MODE_FLAT)
4559          mask |= BITFIELD64_BIT(var->data.location);
4560    }
4561 
4562    nir_shader_instructions_pass(nir, bi_gather_texcoords, nir_metadata_all,
4563                                 &mask);
4564 
4565    return mask;
4566 }
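/* Example for bi_fp32_varying_mask (illustrative GLSL, hypothetical slot
 * assignment): given a fragment shader with
 *
 *    flat in vec4 v_id;     // flat: bit for its VARYING_SLOT_VAR* is set
 *    in vec2 v_uv;          // only used as texture(s, v_uv): bit is set
 *    in vec4 v_color;       // smooth, not a texcoord: bit stays clear
 *
 * only v_color remains eligible under the ~mask passed to
 * nir_lower_mediump_io below, so it may be demoted to 16-bit while the
 * others stay at fp32.
 */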
4567 
4568 static bool
4569 bi_lower_sample_mask_writes(nir_builder *b, nir_intrinsic_instr *intr,
4570                             void *data)
4571 {
4572    if (intr->intrinsic != nir_intrinsic_store_output)
4573       return false;
4574 
4575    assert(b->shader->info.stage == MESA_SHADER_FRAGMENT);
4576    if (nir_intrinsic_io_semantics(intr).location != FRAG_RESULT_SAMPLE_MASK)
4577       return false;
4578 
4579    b->cursor = nir_before_instr(&intr->instr);
4580 
4581    nir_def *orig = nir_load_sample_mask(b);
4582 
4583    nir_src_rewrite(&intr->src[0],
4584                    nir_b32csel(b, nir_load_multisampled_pan(b),
4585                                nir_iand(b, orig, intr->src[0].ssa), orig));
4586    return true;
4587 }
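/* Example for the lowering above (illustrative NIR): a fragment shader write
 * of gl_SampleMask,
 *
 *    store_output(value)   // FRAG_RESULT_SAMPLE_MASK
 *
 * is rewritten to
 *
 *    orig = load_sample_mask()
 *    store_output(multisampled ? iand(orig, value) : orig)
 *
 * so the shader-written mask only takes effect when actually multisampling.
 */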
4588 
4589 static bool
4590 bi_lower_load_output(nir_builder *b, nir_intrinsic_instr *intr,
4591                      UNUSED void *data)
4592 {
4593    if (intr->intrinsic != nir_intrinsic_load_output)
4594       return false;
4595 
4596    unsigned loc = nir_intrinsic_io_semantics(intr).location;
4597    assert(loc >= FRAG_RESULT_DATA0);
4598    unsigned rt = loc - FRAG_RESULT_DATA0;
4599 
4600    b->cursor = nir_before_instr(&intr->instr);
4601 
4602    nir_def *conversion = nir_load_rt_conversion_pan(
4603       b, .base = rt, .src_type = nir_intrinsic_dest_type(intr));
4604 
4605    nir_def *lowered = nir_load_converted_output_pan(
4606       b, intr->def.num_components, intr->def.bit_size, conversion,
4607       .dest_type = nir_intrinsic_dest_type(intr),
4608       .io_semantics = nir_intrinsic_io_semantics(intr));
4609 
4610    nir_def_rewrite_uses(&intr->def, lowered);
4611    return true;
4612 }
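/* Example for the lowering above (illustrative NIR): a framebuffer fetch of
 * render target 1,
 *
 *    x = load_output()   // location FRAG_RESULT_DATA1, so rt = 1
 *
 * becomes
 *
 *    conv = load_rt_conversion_pan(.base = 1, .src_type = <the load's type>)
 *    x    = load_converted_output_pan(conv, ...)
 *
 * so the conversion descriptor for that render target converts the
 * tilebuffer contents to the type the shader expects.
 */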
4613 
4614 bool
4615 bifrost_nir_lower_load_output(nir_shader *nir)
4616 {
4617    assert(nir->info.stage == MESA_SHADER_FRAGMENT);
4618 
4619    return nir_shader_intrinsics_pass(
4620       nir, bi_lower_load_output,
4621       nir_metadata_block_index | nir_metadata_dominance, NULL);
4622 }
4623 
4624 void
4625 bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id)
4626 {
4627    /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
4628     * (so we don't accidentally duplicate the epilogue since mesa/st has
4629     * messed with our I/O quite a bit already) */
4630 
4631    NIR_PASS_V(nir, nir_lower_vars_to_ssa);
4632 
4633    if (nir->info.stage == MESA_SHADER_VERTEX) {
4634       NIR_PASS_V(nir, nir_lower_viewport_transform);
4635       NIR_PASS_V(nir, nir_lower_point_size, 1.0, 0.0);
4636 
4637       nir_variable *psiz = nir_find_variable_with_location(
4638          nir, nir_var_shader_out, VARYING_SLOT_PSIZ);
4639       if (psiz != NULL)
4640          psiz->data.precision = GLSL_PRECISION_MEDIUM;
4641    }
4642 
4643    /* lower MSAA load/stores to 3D load/stores */
4644    NIR_PASS_V(nir, pan_nir_lower_image_ms);
4645 
4646    /* Get rid of any global vars before we lower to scratch. */
4647    NIR_PASS_V(nir, nir_lower_global_vars_to_local);
4648 
4649    /* Valhall introduces packed thread local storage, which improves cache
4650     * locality of TLS access. However, access to packed TLS cannot
4651     * straddle 16-byte boundaries. As such, when packed TLS is in use
4652     * (currently unconditional for Valhall), we force vec4 alignment for
4653     * scratch access.
4654     */
4655    bool packed_tls = (gpu_id >= 0x9000);
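   /* Illustration of the straddle rule above (hypothetical offsets): an
    * 8-byte scratch access at byte offset 12 would cover bytes 12..19 and
    * cross the 16-byte boundary at byte 16, which packed TLS cannot do;
    * with vec4 size/alignment each access starts at a 16-byte-aligned
    * offset and stays inside a single 16-byte slot.
    */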
4656 
4657    /* Lower large arrays to scratch and small arrays to bcsel */
4658    NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
4659               packed_tls ? glsl_get_vec4_size_align_bytes
4660                          : glsl_get_natural_size_align_bytes);
4661    NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
4662 
4663    NIR_PASS_V(nir, nir_split_var_copies);
4664    NIR_PASS_V(nir, nir_lower_var_copies);
4665    NIR_PASS_V(nir, nir_lower_vars_to_ssa);
4666    NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
4667               glsl_type_size, 0);
4668 
4669    /* nir_lower[_explicit]_io is lazy and emits mul+add chains even for
4670     * offsets it could figure out are constant.  Do some constant folding
4671     * before bifrost_nir_lower_store_component below.
4672     */
4673    NIR_PASS_V(nir, nir_opt_constant_folding);
4674 
4675    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
4676       NIR_PASS_V(nir, nir_lower_mediump_io,
4677                  nir_var_shader_in | nir_var_shader_out,
4678                  ~bi_fp32_varying_mask(nir), false);
4679 
4680       NIR_PASS_V(nir, nir_shader_intrinsics_pass, bi_lower_sample_mask_writes,
4681                  nir_metadata_block_index | nir_metadata_dominance, NULL);
4682 
4683       NIR_PASS_V(nir, bifrost_nir_lower_load_output);
4684    } else if (nir->info.stage == MESA_SHADER_VERTEX) {
4685       if (gpu_id >= 0x9000) {
4686          NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out,
4687                     BITFIELD64_BIT(VARYING_SLOT_PSIZ), false);
4688       }
4689 
4690       NIR_PASS_V(nir, pan_nir_lower_store_component);
4691    }
4692 
4693    nir_lower_mem_access_bit_sizes_options mem_size_options = {
4694       .modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_constant |
4695                nir_var_mem_task_payload | nir_var_shader_temp |
4696                nir_var_function_temp | nir_var_mem_global | nir_var_mem_shared,
4697       .callback = mem_access_size_align_cb,
4698    };
4699    NIR_PASS_V(nir, nir_lower_mem_access_bit_sizes, &mem_size_options);
4700 
4701    NIR_PASS_V(nir, nir_lower_ssbo);
4702    NIR_PASS_V(nir, pan_lower_sample_pos);
4703    NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL);
4704    NIR_PASS_V(nir, nir_lower_64bit_phis);
4705    NIR_PASS_V(nir, pan_lower_helper_invocation);
4706    NIR_PASS_V(nir, nir_lower_int64);
4707 
4708    NIR_PASS_V(nir, nir_opt_idiv_const, 8);
4709    NIR_PASS_V(nir, nir_lower_idiv,
4710               &(nir_lower_idiv_options){.allow_fp16 = true});
4711 
4712    NIR_PASS_V(nir, nir_lower_tex,
4713               &(nir_lower_tex_options){
4714                  .lower_txs_lod = true,
4715                  .lower_txp = ~0,
4716                  .lower_tg4_broadcom_swizzle = true,
4717                  .lower_txd = true,
4718                  .lower_invalid_implicit_lod = true,
4719                  .lower_index_to_offset = true,
4720               });
4721 
4722    NIR_PASS_V(nir, nir_lower_image_atomics_to_global);
4723    NIR_PASS_V(nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
4724    NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
4725    NIR_PASS_V(nir, nir_lower_phis_to_scalar, true);
4726    NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false /* always_precise */);
4727    NIR_PASS_V(nir, nir_lower_var_copies);
4728    NIR_PASS_V(nir, nir_lower_alu);
4729    NIR_PASS_V(nir, nir_lower_frag_coord_to_pixel_coord);
4730 }
4731 
4732 static bi_context *
4733 bi_compile_variant_nir(nir_shader *nir,
4734                        const struct panfrost_compile_inputs *inputs,
4735                        struct util_dynarray *binary, struct bi_shader_info info,
4736                        enum bi_idvs_mode idvs)
4737 {
4738    bi_context *ctx = rzalloc(NULL, bi_context);
4739 
4740    /* There may be another program in the dynarray, start at the end */
4741    unsigned offset = binary->size;
4742 
4743    ctx->inputs = inputs;
4744    ctx->nir = nir;
4745    ctx->stage = nir->info.stage;
4746    ctx->quirks = bifrost_get_quirks(inputs->gpu_id);
4747    ctx->arch = inputs->gpu_id >> 12;
4748    ctx->info = info;
4749    ctx->idvs = idvs;
4750    ctx->malloc_idvs = (ctx->arch >= 9) && !inputs->no_idvs;
4751 
4752    if (idvs != BI_IDVS_NONE) {
4753       /* Specializing shaders for IDVS is destructive, so we need to
4754        * clone. However, the last (second) IDVS shader does not need
4755        * to be preserved so we can skip cloning that one.
4756        */
4757       if (offset == 0)
4758          ctx->nir = nir = nir_shader_clone(ctx, nir);
4759 
4760       NIR_PASS_V(nir, nir_shader_instructions_pass, bifrost_nir_specialize_idvs,
4761                  nir_metadata_block_index | nir_metadata_dominance, &idvs);
4762 
4763       /* After specializing, clean up the mess */
4764       bool progress = true;
4765 
4766       while (progress) {
4767          progress = false;
4768 
4769          NIR_PASS(progress, nir, nir_opt_dce);
4770          NIR_PASS(progress, nir, nir_opt_dead_cf);
4771       }
4772    }
4773 
4774    /* If nothing is pushed, all UBOs need to be uploaded */
4775    ctx->ubo_mask = ~0;
4776 
4777    list_inithead(&ctx->blocks);
4778 
4779    bool skip_internal = nir->info.internal;
4780    skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL);
4781 
4782    if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
4783       nir_print_shader(nir, stdout);
4784    }
4785 
4786    ctx->allocated_vec = _mesa_hash_table_u64_create(ctx);
4787 
4788    nir_foreach_function_impl(impl, nir) {
4789       nir_index_blocks(impl);
4790 
4791       ctx->indexed_nir_blocks =
4792          rzalloc_array(ctx, bi_block *, impl->num_blocks);
4793 
4794       ctx->ssa_alloc += impl->ssa_alloc;
4795 
4796       emit_cf_list(ctx, &impl->body);
4797       bi_emit_phis_deferred(ctx);
4798       break; /* TODO: Multi-function shaders */
4799    }
4800 
4801    /* Index blocks now that we're done emitting */
4802    bi_foreach_block(ctx, block) {
4803       block->index = ctx->num_blocks++;
4804    }
4805 
4806    bi_validate(ctx, "NIR -> BIR");
4807 
4808    /* If the shader doesn't write any colour or depth outputs, it may
4809     * still need an ATEST at the very end! */
4810    bool need_dummy_atest = (ctx->stage == MESA_SHADER_FRAGMENT) &&
4811                            !ctx->emitted_atest && !bi_skip_atest(ctx, false);
4812 
4813    if (need_dummy_atest) {
4814       bi_block *end = list_last_entry(&ctx->blocks, bi_block, link);
4815       bi_builder b = bi_init_builder(ctx, bi_after_block(end));
4816       bi_emit_atest(&b, bi_zero());
4817    }
4818 
4819    bool optimize = !(bifrost_debug & BIFROST_DBG_NOOPT);
4820 
4821    /* Runs before constant folding */
4822    bi_lower_swizzle(ctx);
4823    bi_validate(ctx, "Early lowering");
4824 
4825    /* Runs before copy prop */
4826    if (optimize && !ctx->inputs->no_ubo_to_push) {
4827       bi_opt_push_ubo(ctx);
4828    }
4829 
4830    if (likely(optimize)) {
4831       bi_opt_copy_prop(ctx);
4832 
4833       while (bi_opt_constant_fold(ctx))
4834          bi_opt_copy_prop(ctx);
4835 
4836       bi_opt_mod_prop_forward(ctx);
4837       bi_opt_mod_prop_backward(ctx);
4838 
4839       /* Push LD_VAR_IMM/VAR_TEX instructions. Must run after
4840        * mod_prop_backward to fuse VAR_TEX */
4841       if (ctx->arch == 7 && ctx->stage == MESA_SHADER_FRAGMENT &&
4842           !(bifrost_debug & BIFROST_DBG_NOPRELOAD)) {
4843          bi_opt_dead_code_eliminate(ctx);
4844          bi_opt_message_preload(ctx);
4845          bi_opt_copy_prop(ctx);
4846       }
4847 
4848       bi_opt_dead_code_eliminate(ctx);
4849       bi_opt_cse(ctx);
4850       bi_opt_dead_code_eliminate(ctx);
4851       if (!ctx->inputs->no_ubo_to_push)
4852          bi_opt_reorder_push(ctx);
4853       bi_validate(ctx, "Optimization passes");
4854    }
4855 
4856    bi_lower_opt_instructions(ctx);
4857 
4858    if (ctx->arch >= 9) {
4859       va_optimize(ctx);
4860       va_lower_isel(ctx);
4861 
4862       bi_foreach_instr_global_safe(ctx, I) {
4863          /* Phis become single moves so shouldn't be affected */
4864          if (I->op == BI_OPCODE_PHI)
4865             continue;
4866 
4867          va_lower_constants(ctx, I);
4868 
4869          bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
4870          va_repair_fau(&b, I);
4871       }
4872 
4873       /* We need to clean up after constant lowering */
4874       if (likely(optimize)) {
4875          bi_opt_cse(ctx);
4876          bi_opt_dead_code_eliminate(ctx);
4877       }
4878 
4879       bi_validate(ctx, "Valhall passes");
4880    }
4881 
4882    bi_foreach_block(ctx, block) {
4883       bi_lower_branch(ctx, block);
4884    }
4885 
4886    if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
4887       bi_print_shader(ctx, stdout);
4888 
4889    /* Analyze before register allocation to avoid false dependencies. The
4890     * skip bit is a function of only the data flow graph and is invariant
4891     * under valid scheduling. Helpers are only defined for fragment
4892     * shaders, so this analysis is only required in fragment shaders.
4893     */
4894    if (ctx->stage == MESA_SHADER_FRAGMENT)
4895       bi_analyze_helper_requirements(ctx);
4896 
4897    /* Fuse TEXC after analyzing helper requirements so the analysis
4898     * doesn't have to know about dual textures */
4899    if (likely(optimize)) {
4900       bi_opt_fuse_dual_texture(ctx);
4901    }
4902 
4903    /* Lower FAU after fusing dual texture, because fusing dual texture
4904     * creates new immediates that themselves may need lowering.
4905     */
4906    if (ctx->arch <= 8) {
4907       bi_lower_fau(ctx);
4908    }
4909 
4910    /* Lowering FAU can create redundant moves. Run CSE+DCE to clean up. */
4911    if (likely(optimize)) {
4912       bi_opt_cse(ctx);
4913       bi_opt_dead_code_eliminate(ctx);
4914    }
4915 
4916    bi_validate(ctx, "Late lowering");
4917 
4918    if (likely(!(bifrost_debug & BIFROST_DBG_NOPSCHED))) {
4919       bi_pressure_schedule(ctx);
4920       bi_validate(ctx, "Pre-RA scheduling");
4921    }
4922 
4923    bi_register_allocate(ctx);
4924 
4925    if (likely(optimize))
4926       bi_opt_post_ra(ctx);
4927 
4928    if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
4929       bi_print_shader(ctx, stdout);
4930 
4931    if (ctx->arch >= 9) {
4932       va_assign_slots(ctx);
4933       va_insert_flow_control_nops(ctx);
4934       va_merge_flow(ctx);
4935       va_mark_last(ctx);
4936    } else {
4937       bi_schedule(ctx);
4938       bi_assign_scoreboard(ctx);
4939 
4940       /* Analyze after scheduling since we depend on instruction
4941        * order. Valhall does this as part of va_insert_flow_control_nops,
4942        * since the handling for clauses differs from that for instructions.
4943        */
4944       bi_analyze_helper_terminate(ctx);
4945       bi_mark_clauses_td(ctx);
4946    }
4947 
4948    if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
4949       bi_print_shader(ctx, stdout);
4950 
4951    if (ctx->arch <= 8) {
4952       bi_pack_clauses(ctx, binary, offset);
4953    } else {
4954       bi_pack_valhall(ctx, binary);
4955    }
4956 
4957    if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
4958       if (ctx->arch <= 8) {
4959          disassemble_bifrost(stdout, binary->data + offset,
4960                              binary->size - offset,
4961                              bifrost_debug & BIFROST_DBG_VERBOSE);
4962       } else {
4963          disassemble_valhall(stdout, binary->data + offset,
4964                              binary->size - offset,
4965                              bifrost_debug & BIFROST_DBG_VERBOSE);
4966       }
4967 
4968       fflush(stdout);
4969    }
4970 
4971    if (!skip_internal &&
4972        ((bifrost_debug & BIFROST_DBG_SHADERDB) || inputs->debug)) {
4973       char *shaderdb;
4974 
4975       if (ctx->arch >= 9) {
4976          shaderdb = va_print_stats(ctx, binary->size - offset);
4977       } else {
4978          shaderdb = bi_print_stats(ctx, binary->size - offset);
4979       }
4980 
4981       if (bifrost_debug & BIFROST_DBG_SHADERDB)
4982          fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
4983 
4984       if (inputs->debug)
4985          util_debug_message(inputs->debug, SHADER_INFO, "%s", shaderdb);
4986 
4987       ralloc_free(shaderdb);
4988    }
4989 
4990    return ctx;
4991 }
4992 
4993 static void
4994 bi_compile_variant(nir_shader *nir,
4995                    const struct panfrost_compile_inputs *inputs,
4996                    struct util_dynarray *binary, struct pan_shader_info *info,
4997                    enum bi_idvs_mode idvs)
4998 {
4999    struct bi_shader_info local_info = {
5000       .push = &info->push,
5001       .bifrost = &info->bifrost,
5002       .tls_size = info->tls_size,
5003       .push_offset = info->push.count,
5004    };
5005 
5006    unsigned offset = binary->size;
5007 
5008    /* If there is no position shader (gl_Position is not written), then
5009     * there is no need to build a varying shader either. This case is hit
5010     * for transform-feedback-only vertex shaders, which only make sense with
5011     * rasterizer discard.
5012     */
5013    if ((offset == 0) && (idvs == BI_IDVS_VARYING))
5014       return;
5015 
5016    /* Software invariant: Only a secondary shader can appear at a nonzero
5017     * offset, to keep the ABI simple. */
5018    assert((offset == 0) ^ (idvs == BI_IDVS_VARYING));
5019 
5020    bi_context *ctx =
5021       bi_compile_variant_nir(nir, inputs, binary, local_info, idvs);
5022 
5023    /* A register is preloaded <==> it is live before the first block */
5024    bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
5025    uint64_t preload = first_block->reg_live_in;
5026 
5027    /* If multisampling is used with a blend shader, the blend shader needs
5028     * to access the sample coverage mask in r60 and the sample ID in r61.
5029     * Blend shaders run in the same context as fragment shaders, so if a
5030     * blend shader could run, we need to preload these registers
5031     * conservatively. The cost of doing so is believed to be small, so we
5032     * always preload them, avoiding extra variants of the preload descriptor.
5033     *
5034     * We only do this on Valhall, as Bifrost has to update the RSD for
5035     * multisampling w/ blend shader anyway, so this is handled in the
5036     * driver. We could unify the paths if the cost is acceptable.
5037     */
5038    if (nir->info.stage == MESA_SHADER_FRAGMENT && ctx->arch >= 9)
5039       preload |= BITFIELD64_BIT(60) | BITFIELD64_BIT(61);
5040 
5041    info->ubo_mask |= ctx->ubo_mask;
5042    info->tls_size = MAX2(info->tls_size, ctx->info.tls_size);
5043 
5044    if (idvs == BI_IDVS_VARYING) {
5045       info->vs.secondary_enable = (binary->size > offset);
5046       info->vs.secondary_offset = offset;
5047       info->vs.secondary_preload = preload;
5048       info->vs.secondary_work_reg_count = ctx->info.work_reg_count;
5049    } else {
5050       info->preload = preload;
5051       info->work_reg_count = ctx->info.work_reg_count;
5052    }
5053 
5054    if (idvs == BI_IDVS_POSITION && !nir->info.internal &&
5055        nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ)) {
5056       /* Find the psiz write */
5057       bi_instr *write = NULL;
5058 
5059       bi_foreach_instr_global(ctx, I) {
5060          if (I->op == BI_OPCODE_STORE_I16 && I->seg == BI_SEG_POS) {
5061             write = I;
5062             break;
5063          }
5064       }
5065 
5066       assert(write != NULL);
5067 
5068       /* NOP it out, preserving its flow control. TODO: maybe DCE */
5069       if (write->flow) {
5070          bi_builder b = bi_init_builder(ctx, bi_before_instr(write));
5071          bi_instr *nop = bi_nop(&b);
5072          nop->flow = write->flow;
5073       }
5074 
5075       bi_remove_instruction(write);
5076 
5077       info->vs.no_psiz_offset = binary->size;
5078       bi_pack_valhall(ctx, binary);
5079    }
5080 
5081    ralloc_free(ctx);
5082 }
5083 
5084 /* Decide if Index-Driven Vertex Shading should be used for a given shader */
5085 static bool
5086 bi_should_idvs(nir_shader *nir, const struct panfrost_compile_inputs *inputs)
5087 {
5088    /* Opt-out */
5089    if (inputs->no_idvs || bifrost_debug & BIFROST_DBG_NOIDVS)
5090       return false;
5091 
5092    /* IDVS splits up vertex shaders, not defined on other shader stages */
5093    if (nir->info.stage != MESA_SHADER_VERTEX)
5094       return false;
5095 
5096    /* Bifrost cannot write gl_PointSize during IDVS */
5097    if ((inputs->gpu_id < 0x9000) &&
5098        nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ))
5099       return false;
5100 
5101    /* Otherwise, IDVS is usually better */
5102    return true;
5103 }
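/* Example (illustrative): a vertex shader that writes gl_PointSize compiles
 * as a single BI_IDVS_NONE variant on Bifrost (gpu_id < 0x9000), since
 * Bifrost IDVS cannot write point size, while on Valhall the same shader is
 * split into BI_IDVS_POSITION and BI_IDVS_VARYING variants by the caller
 * below.
 */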
5104 
5105 void
5106 bifrost_compile_shader_nir(nir_shader *nir,
5107                            const struct panfrost_compile_inputs *inputs,
5108                            struct util_dynarray *binary,
5109                            struct pan_shader_info *info)
5110 {
5111    bifrost_debug = debug_get_option_bifrost_debug();
5112 
5113    /* Combine stores late, to give the driver a chance to lower dual-source
5114     * blending as regular store_output intrinsics.
5115     */
5116    NIR_PASS_V(nir, pan_nir_lower_zs_store);
5117 
5118    bi_optimize_nir(nir, inputs->gpu_id, inputs->is_blend);
5119 
5120    info->tls_size = nir->scratch_size;
5121    info->vs.idvs = bi_should_idvs(nir, inputs);
5122 
5123    pan_nir_collect_varyings(nir, info);
5124 
5125    if (info->vs.idvs) {
5126       bi_compile_variant(nir, inputs, binary, info, BI_IDVS_POSITION);
5127       bi_compile_variant(nir, inputs, binary, info, BI_IDVS_VARYING);
5128    } else {
5129       bi_compile_variant(nir, inputs, binary, info, BI_IDVS_NONE);
5130    }
5131 
5132    if (gl_shader_stage_is_compute(nir->info.stage)) {
5133       /* Workgroups may be merged if the structure of the workgroup is
5134        * not software visible. This is true if neither shared memory
5135        * nor barriers are used. The hardware may be able to optimize
5136        * compute shaders that set this flag.
5137        */
5138       info->cs.allow_merging_workgroups = (nir->info.shared_size == 0) &&
5139                                           !nir->info.uses_control_barrier &&
5140                                           !nir->info.uses_memory_barrier;
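      /* For example (illustrative GLSL, not tied to a specific shader): a
       * compute shader with no shared variables and no barrier() or memory
       * barrier calls sets this flag; one that uses barrier() for a
       * workgroup reduction does not.
       */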
5141    }
5142 
5143    info->ubo_mask &= (1 << nir->info.num_ubos) - 1;
5144 }
5145