1 /*
2  * Copyright (C) 2020 Collabora Ltd.
3  * Copyright (C) 2022 Alyssa Rosenzweig <alyssa@rosenzweig.io>
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  *
24  * Authors (Collabora):
25  *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
26  */
27 
28 #include "compiler/glsl/glsl_to_nir.h"
29 #include "compiler/nir_types.h"
30 #include "compiler/nir/nir_builder.h"
31 #include "compiler/nir/nir_schedule.h"
32 #include "util/u_debug.h"
33 
34 #include "disassemble.h"
35 #include "valhall/va_compiler.h"
36 #include "valhall/disassemble.h"
37 #include "bifrost_compile.h"
38 #include "compiler.h"
39 #include "valhall/va_compiler.h"
40 #include "bi_quirks.h"
41 #include "bi_builder.h"
42 #include "bifrost_nir.h"
43 
44 static const struct debug_named_value bifrost_debug_options[] = {
45         {"msgs",      BIFROST_DBG_MSGS,		"Print debug messages"},
46         {"shaders",   BIFROST_DBG_SHADERS,	"Dump shaders in NIR and MIR"},
47         {"shaderdb",  BIFROST_DBG_SHADERDB,	"Print statistics"},
48         {"verbose",   BIFROST_DBG_VERBOSE,	"Disassemble verbosely"},
49         {"internal",  BIFROST_DBG_INTERNAL,	"Dump even internal shaders"},
50         {"nosched",   BIFROST_DBG_NOSCHED, 	"Force trivial bundling"},
51         {"nopsched",  BIFROST_DBG_NOPSCHED,     "Disable scheduling for pressure"},
52         {"inorder",   BIFROST_DBG_INORDER, 	"Force in-order bundling"},
53         {"novalidate",BIFROST_DBG_NOVALIDATE,   "Skip IR validation"},
54         {"noopt",     BIFROST_DBG_NOOPT,        "Skip optimization passes"},
55         {"noidvs",    BIFROST_DBG_NOIDVS,       "Disable IDVS"},
56         {"nosb",      BIFROST_DBG_NOSB,         "Disable scoreboarding"},
57         {"nopreload", BIFROST_DBG_NOPRELOAD,    "Disable message preloading"},
58         {"spill",     BIFROST_DBG_SPILL,        "Test register spilling"},
59         DEBUG_NAMED_VALUE_END
60 };
61 
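/* Lazily parse BIFROST_MESA_DEBUG (a comma-separated list of the flag names
 * above) into a bitmask the first time it is queried. */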
62 DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG", bifrost_debug_options, 0)
63 
64 /* How many bytes are prefetched by the Bifrost shader core. From the final
65  * clause of the shader, this range must be valid instructions or zero. */
66 #define BIFROST_SHADER_PREFETCH 128
67 
68 int bifrost_debug = 0;
69 
70 #define DBG(fmt, ...) \
71 		do { if (bifrost_debug & BIFROST_DBG_MSGS) \
72 			fprintf(stderr, "%s:%d: "fmt, \
73 				__FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
74 
75 static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);
76 
77 static bi_index
78 bi_preload(bi_builder *b, unsigned reg)
79 {
80         if (bi_is_null(b->shader->preloaded[reg])) {
81                 /* Insert at the beginning of the shader */
82                 bi_builder b_ = *b;
83                 b_.cursor = bi_before_block(bi_start_block(&b->shader->blocks));
84 
85                 /* Cache the result */
86                 b->shader->preloaded[reg] = bi_mov_i32(&b_, bi_register(reg));
87         }
88 
89         return b->shader->preloaded[reg];
90 }
91 
92 static bi_index
93 bi_coverage(bi_builder *b)
94 {
95         if (bi_is_null(b->shader->coverage))
96                 b->shader->coverage = bi_preload(b, 60);
97 
98         return b->shader->coverage;
99 }
100 
101 /*
102  * Vertex ID and Instance ID are preloaded registers. The registers they are
103  * preloaded into changed from Bifrost to Valhall. Provide helpers that
104  * smooth over the architectural difference.
105  */
106 static inline bi_index
107 bi_vertex_id(bi_builder *b)
108 {
109         return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61);
110 }
111 
112 static inline bi_index
113 bi_instance_id(bi_builder *b)
114 {
115         return bi_preload(b, (b->shader->arch >= 9) ? 61 : 62);
116 }
117 
118 static void
119 bi_emit_jump(bi_builder *b, nir_jump_instr *instr)
120 {
121         bi_instr *branch = bi_jump(b, bi_zero());
122 
123         switch (instr->type) {
124         case nir_jump_break:
125                 branch->branch_target = b->shader->break_block;
126                 break;
127         case nir_jump_continue:
128                 branch->branch_target = b->shader->continue_block;
129                 break;
130         default:
131                 unreachable("Unhandled jump type");
132         }
133 
134         bi_block_add_successor(b->shader->current_block, branch->branch_target);
135         b->shader->current_block->unconditional_jumps = true;
136 }
137 
138 /* Builds a 64-bit hash table key for an index */
139 static uint64_t
140 bi_index_to_key(bi_index idx)
141 {
142         static_assert(sizeof(idx) <= sizeof(uint64_t), "too much padding");
143 
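        /* Copy the raw bytes of the index into a zero-initialized 64-bit key;
         * memcpy avoids type-punning the struct through an integer pointer. */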
144         uint64_t key = 0;
145         memcpy(&key, &idx, sizeof(idx));
146         return key;
147 }
148 
149 /*
150  * Extract a single channel out of a vector source. We split vectors with SPLIT
151  * so we can use the split components directly, without emitting an extract.
152  * This has advantages for RA, as the split can usually be optimized away.
153  */
154 static bi_index
155 bi_extract(bi_builder *b, bi_index vec, unsigned channel)
156 {
157         /* Extract caching relies on SSA form. It is incorrect for nir_register.
158          * Bypass the cache and emit an explicit split for registers.
159          */
160         if (vec.reg) {
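                /* Emit a split whose only live destination is the requested
                 * channel; the other split outputs stay null and are unused. */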
161                 bi_instr *I = bi_split_i32_to(b, bi_null(), vec);
162                 I->nr_dests = channel + 1;
163                 I->dest[channel] = bi_temp(b->shader);
164                 return I->dest[channel];
165         }
166 
167         bi_index *components =
168                 _mesa_hash_table_u64_search(b->shader->allocated_vec,
169                                             bi_index_to_key(vec));
170 
171         /* No extract needed for scalars.
172          *
173          * This is a bit imprecise, but actual bugs (missing splits for vectors)
174          * should be caught by the following assertion. It is too difficult to
175          * ensure bi_extract is only called for real vectors.
176          */
177         if (components == NULL && channel == 0)
178                 return vec;
179 
180         assert(components != NULL && "missing bi_cache_collect()");
181         return components[channel];
182 }
183 
184 static void
185 bi_cache_collect(bi_builder *b, bi_index dst, bi_index *s, unsigned n)
186 {
187         /* Lifetime of a hash table entry has to be at least as long as the table */
188         bi_index *channels = ralloc_array(b->shader, bi_index, n);
189         memcpy(channels, s, sizeof(bi_index) * n);
190 
191         _mesa_hash_table_u64_insert(b->shader->allocated_vec,
192                                     bi_index_to_key(dst), channels);
193 }
194 
195 /*
196  * Splits an n-component vector (vec) into n scalar destinations (dests) using a
197  * split pseudo-instruction.
198  *
199  * Pre-condition: dests is filled with bi_null().
200  */
201 static void
202 bi_emit_split_i32(bi_builder *b, bi_index dests[4], bi_index vec, unsigned n)
203 {
204         /* Setup the destinations */
205         for (unsigned i = 0; i < n; ++i) {
206                 dests[i] = bi_temp(b->shader);
207         }
208 
209         /* Emit the split */
210         if (n == 1) {
211                 bi_mov_i32_to(b, dests[0], vec);
212         } else {
213                 bi_instr *I = bi_split_i32_to(b, dests[0], vec);
214                 I->nr_dests = n;
215 
216                 for (unsigned j = 1; j < n; ++j)
217                         I->dest[j] = dests[j];
218         }
219 }
220 
221 static void
222 bi_emit_cached_split_i32(bi_builder *b, bi_index vec, unsigned n)
223 {
224         bi_index dests[4] = { bi_null(), bi_null(), bi_null(), bi_null() };
225         bi_emit_split_i32(b, dests, vec, n);
226         bi_cache_collect(b, vec, dests, n);
227 }
228 
229 /*
230  * Emit and cache a split for a vector of a given bitsize. The vector may not be
231  * composed of 32-bit words, but it will be split at 32-bit word boundaries.
232  */
233 static void
234 bi_emit_cached_split(bi_builder *b, bi_index vec, unsigned bits)
235 {
236         bi_emit_cached_split_i32(b, vec, DIV_ROUND_UP(bits, 32));
237 }
238 
239 static void
240 bi_split_dest(bi_builder *b, nir_dest dest)
241 {
242         bi_emit_cached_split(b, bi_dest_index(&dest),
243                                 nir_dest_bit_size(dest) *
244                                 nir_dest_num_components(dest));
245 }
246 
247 static bi_instr *
248 bi_emit_collect_to(bi_builder *b, bi_index dst, bi_index *chan, unsigned n)
249 {
250         /* Special case: COLLECT of a single value is a scalar move */
251         if (n == 1)
252                 return bi_mov_i32_to(b, dst, chan[0]);
253 
254         bi_instr *I = bi_collect_i32_to(b, dst);
255         I->nr_srcs = n;
256 
257         for (unsigned i = 0; i < n; ++i)
258                 I->src[i] = chan[i];
259 
260         bi_cache_collect(b, dst, chan, n);
261         return I;
262 }
263 
264 static bi_instr *
265 bi_collect_v2i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1)
266 {
267         return bi_emit_collect_to(b, dst, (bi_index[]) { s0, s1 }, 2);
268 }
269 
270 static bi_instr *
271 bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1, bi_index s2)
272 {
273         return bi_emit_collect_to(b, dst, (bi_index[]) { s0, s1, s2 }, 3);
274 }
275 
276 static bi_index
277 bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1)
278 {
279         bi_index dst = bi_temp(b->shader);
280         bi_collect_v2i32_to(b, dst, s0, s1);
281         return dst;
282 }
283 
284 static bi_index
285 bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
286 {
287         switch (intr->intrinsic) {
288         case nir_intrinsic_load_barycentric_centroid:
289         case nir_intrinsic_load_barycentric_sample:
290                 return bi_preload(b, 61);
291 
292         /* Need to put the sample ID in the top 16 bits */
293         case nir_intrinsic_load_barycentric_at_sample:
294                 return bi_mkvec_v2i16(b, bi_half(bi_dontcare(b), false),
295                                 bi_half(bi_src_index(&intr->src[0]), false));
296 
297         /* Interpret as 8:8 signed fixed point positions in pixels along X and
298          * Y axes respectively, relative to top-left of pixel. In NIR, (0, 0)
299          * is the center of the pixel so we first fixup and then convert. For
300          * fp16 input:
301          *
302          * f2i16(((x, y) + (0.5, 0.5)) * 2**8) =
303          * f2i16((256 * (x, y)) + (128, 128)) =
304          * V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128))
305          *
306          * For fp32 input, that lacks enough precision for MSAA 16x, but the
307          * idea is the same. FIXME: still doesn't pass
308          */
309         case nir_intrinsic_load_barycentric_at_offset: {
310                 bi_index offset = bi_src_index(&intr->src[0]);
311                 bi_index f16 = bi_null();
312                 unsigned sz = nir_src_bit_size(intr->src[0]);
313 
314                 if (sz == 16) {
315                         f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0),
316                                         bi_imm_f16(128.0));
317                 } else {
318                         assert(sz == 32);
319                         bi_index f[2];
320                         for (unsigned i = 0; i < 2; ++i) {
321                                 f[i] = bi_fadd_rscale_f32(b,
322                                                 bi_extract(b, offset, i),
323                                                 bi_imm_f32(0.5), bi_imm_u32(8),
324                                                 BI_SPECIAL_NONE);
325                         }
326 
327                         f16 = bi_v2f32_to_v2f16(b, f[0], f[1]);
328                 }
329 
330                 return bi_v2f16_to_v2s16(b, f16);
331         }
332 
333         case nir_intrinsic_load_barycentric_pixel:
334         default:
335                 return b->shader->arch >= 9 ? bi_preload(b, 61) : bi_dontcare(b);
336         }
337 }
338 
339 static enum bi_sample
340 bi_interp_for_intrinsic(nir_intrinsic_op op)
341 {
342         switch (op) {
343         case nir_intrinsic_load_barycentric_centroid:
344                 return BI_SAMPLE_CENTROID;
345         case nir_intrinsic_load_barycentric_sample:
346         case nir_intrinsic_load_barycentric_at_sample:
347                 return BI_SAMPLE_SAMPLE;
348         case nir_intrinsic_load_barycentric_at_offset:
349                 return BI_SAMPLE_EXPLICIT;
350         case nir_intrinsic_load_barycentric_pixel:
351         default:
352                 return BI_SAMPLE_CENTER;
353         }
354 }
355 
356 /* auto, 64-bit omitted */
357 static enum bi_register_format
358 bi_reg_fmt_for_nir(nir_alu_type T)
359 {
360         switch (T) {
361         case nir_type_float16: return BI_REGISTER_FORMAT_F16;
362         case nir_type_float32: return BI_REGISTER_FORMAT_F32;
363         case nir_type_int16:   return BI_REGISTER_FORMAT_S16;
364         case nir_type_uint16:  return BI_REGISTER_FORMAT_U16;
365         case nir_type_int32:   return BI_REGISTER_FORMAT_S32;
366         case nir_type_uint32:  return BI_REGISTER_FORMAT_U32;
367         default: unreachable("Invalid type for register format");
368         }
369 }
370 
371 /* Checks if the _IMM variant of an intrinsic can be used, returning in imm the
372  * immediate to be used (which applies even if _IMM can't be used) */
373 
374 static bool
375 bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate, unsigned max)
376 {
377         nir_src *offset = nir_get_io_offset_src(instr);
378 
379         if (!nir_src_is_const(*offset))
380                 return false;
381 
382         *immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
383         return (*immediate) < max;
384 }
385 
386 static void
387 bi_make_vec_to(bi_builder *b, bi_index final_dst,
388                 bi_index *src,
389                 unsigned *channel,
390                 unsigned count,
391                 unsigned bitsize);
392 
393 /* Bifrost's load instructions lack a component offset despite operating in
394  * terms of vec4 slots. Usually I/O vectorization avoids nonzero components,
395  * but they may be unavoidable with separate shaders in use. To solve this, we
396  * lower to a larger load and an explicit copy of the desired components. */
397 
398 static void
399 bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp)
400 {
401         unsigned component = nir_intrinsic_component(instr);
402         unsigned nr = instr->num_components;
403         unsigned total = nr + component;
404         unsigned bitsize = nir_dest_bit_size(instr->dest);
405 
406         assert(total <= 4 && "should be vec4");
407         bi_emit_cached_split(b, tmp, total * bitsize);
408 
409         if (component == 0)
410                 return;
411 
412         bi_index srcs[] = { tmp, tmp, tmp };
413         unsigned channels[] = { component, component + 1, component + 2 };
414 
415         bi_make_vec_to(b, bi_dest_index(&instr->dest),
416                        srcs, channels, nr, nir_dest_bit_size(instr->dest));
417 }
418 
419 static void
420 bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
421 {
422         nir_alu_type T = nir_intrinsic_dest_type(instr);
423         enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
424         nir_src *offset = nir_get_io_offset_src(instr);
425         unsigned component = nir_intrinsic_component(instr);
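        /* VECSIZE is encoded as the component count minus one; the load must
         * cover every component up to and including the last one used. */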
426         enum bi_vecsize vecsize = (instr->num_components + component - 1);
427         unsigned imm_index = 0;
428         unsigned base = nir_intrinsic_base(instr);
429         bool constant = nir_src_is_const(*offset);
430         bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
431         bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader);
432         bi_instr *I;
433 
434         if (immediate) {
435                 I = bi_ld_attr_imm_to(b, dest, bi_vertex_id(b),
436                                       bi_instance_id(b), regfmt, vecsize,
437                                       imm_index);
438         } else {
439                 bi_index idx = bi_src_index(&instr->src[0]);
440 
441                 if (constant)
442                         idx = bi_imm_u32(imm_index);
443                 else if (base != 0)
444                         idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
445 
446                 I = bi_ld_attr_to(b, dest, bi_vertex_id(b), bi_instance_id(b),
447                                   idx, regfmt, vecsize);
448         }
449 
450         if (b->shader->arch >= 9)
451                 I->table = PAN_TABLE_ATTRIBUTE;
452 
453         bi_copy_component(b, instr, dest);
454 }
455 
456 /*
457  * ABI: Special (desktop GL) slots come first, tightly packed. General varyings
458  * come later, sparsely packed. This handles both linked and separable shaders
459  * with a common code path, with minimal keying only for desktop GL. Each slot
460  * consumes 16 bytes (TODO: fp16, partial vectors).
461  */
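/* For example, with two special slots set in fixed_varying_mask, VAR0 begins
 * at byte 16 * 2 = 32 and VAR3 at byte 16 * (2 + 3) = 80. */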
462 static unsigned
463 bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr)
464 {
465         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
466         uint32_t mask = ctx->inputs->fixed_varying_mask;
467 
468         if (sem.location >= VARYING_SLOT_VAR0) {
469                 unsigned nr_special = util_bitcount(mask);
470                 unsigned general_index = (sem.location - VARYING_SLOT_VAR0);
471 
472                 return 16 * (nr_special + general_index);
473         } else {
474                 return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location)));
475         }
476 }
477 
478 /*
479  * Compute the offset in bytes of a varying with an immediate offset, adding the
480  * offset to the base computed above. Convenience method.
481  */
482 static unsigned
483 bi_varying_offset(bi_context *ctx, nir_intrinsic_instr *intr)
484 {
485         nir_src *src = nir_get_io_offset_src(intr);
486         assert(nir_src_is_const(*src) && "assumes immediate offset");
487 
488         return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16);
489 }
490 
491 static void
492 bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
493 {
494         enum bi_sample sample = BI_SAMPLE_CENTER;
495         enum bi_update update = BI_UPDATE_STORE;
496         enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
497         bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input;
498         bi_index src0 = bi_null();
499 
500         unsigned component = nir_intrinsic_component(instr);
501         enum bi_vecsize vecsize = (instr->num_components + component - 1);
502         bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader);
503 
504         unsigned sz = nir_dest_bit_size(instr->dest);
505 
506         if (smooth) {
507                 nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
508                 assert(parent);
509 
510                 sample = bi_interp_for_intrinsic(parent->intrinsic);
511                 src0 = bi_varying_src0_for_barycentric(b, parent);
512 
513                 assert(sz == 16 || sz == 32);
514                 regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16
515                         : BI_REGISTER_FORMAT_F32;
516         } else {
517                 assert(sz == 32);
518                 regfmt = BI_REGISTER_FORMAT_U32;
519 
520                 /* Valhall can't have bi_null() here, although the source is
521                  * logically unused for flat varyings
522                  */
523                 if (b->shader->arch >= 9)
524                         src0 = bi_preload(b, 61);
525 
526                 /* Gather info as we go */
527                 b->shader->info.bifrost->uses_flat_shading = true;
528         }
529 
530         enum bi_source_format source_format =
531                 smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32;
532 
533         nir_src *offset = nir_get_io_offset_src(instr);
534         unsigned imm_index = 0;
535         bool immediate = bi_is_intr_immediate(instr, &imm_index, 20);
536         bi_instr *I = NULL;
537 
538         if (b->shader->malloc_idvs && immediate) {
539                 /* Immediate index given in bytes. */
540                 bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt,
541                                      sample, source_format, update, vecsize,
542                                      bi_varying_offset(b->shader, instr));
543         } else if (immediate && smooth) {
544                 I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update,
545                                      vecsize, imm_index);
546         } else if (immediate && !smooth) {
547                 I = bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt,
548                                           vecsize, imm_index);
549         } else {
550                 bi_index idx = bi_src_index(offset);
551                 unsigned base = nir_intrinsic_base(instr);
552 
553                 if (b->shader->malloc_idvs) {
554                         /* Index needs to be in bytes, but NIR gives the index
555                          * in slots. For now assume 16 bytes per element.
556                          */
557                         bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
558                         unsigned vbase = bi_varying_base_bytes(b->shader, instr);
559 
560                         if (vbase != 0)
561                                 idx_bytes = bi_iadd_u32(b, idx_bytes, bi_imm_u32(vbase), false);
562 
563                         bi_ld_var_buf_to(b, sz, dest, src0, idx_bytes, regfmt,
564                                          sample, source_format, update,
565                                          vecsize);
566                 } else if (smooth) {
567                         if (base != 0)
568                                 idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
569 
570                         I = bi_ld_var_to(b, dest, src0, idx, regfmt, sample,
571                                          update, vecsize);
572                 } else {
573                         if (base != 0)
574                                 idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
575 
576                         I = bi_ld_var_flat_to(b, dest, idx,
577                                               BI_FUNCTION_NONE, regfmt,
578                                               vecsize);
579                 }
580         }
581 
582         /* Valhall usually uses machine-allocated IDVS. If this is disabled, use
583          * a simple Midgard-style ABI.
584          */
585         if (b->shader->arch >= 9 && I != NULL)
586                 I->table = PAN_TABLE_ATTRIBUTE;
587 
588         bi_copy_component(b, instr, dest);
589 }
590 
591 static void
592 bi_make_vec16_to(bi_builder *b, bi_index dst, bi_index *src,
593                 unsigned *channel, unsigned count)
594 {
595         bi_index srcs[BI_MAX_VEC];
596 
597         for (unsigned i = 0; i < count; i += 2) {
598                 bool next = (i + 1) < count;
599 
600                 unsigned chan = channel ? channel[i] : 0;
601                 unsigned nextc = next && channel ? channel[i + 1] : 0;
602 
603                 bi_index w0 = bi_extract(b, src[i], chan >> 1);
604                 bi_index w1 = next ? bi_extract(b, src[i + 1], nextc >> 1) : bi_zero();
605 
606                 bi_index h0 = bi_half(w0, chan & 1);
607                 bi_index h1 = bi_half(w1, nextc & 1);
608 
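                /* If both halves come from the same 32-bit word, a plain move
                 * (natural order) or a single swizzle suffices; otherwise
                 * combine halves of two words with MKVEC.v2i16. */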
609                 if (bi_is_word_equiv(w0, w1) && (chan & 1) == 0 && ((nextc & 1) == 1))
610                         srcs[i >> 1] = bi_mov_i32(b, w0);
611                 else if (bi_is_word_equiv(w0, w1))
612                         srcs[i >> 1] = bi_swz_v2i16(b, bi_swz_16(w0, chan & 1, nextc & 1));
613                 else
614                         srcs[i >> 1] = bi_mkvec_v2i16(b, h0, h1);
615         }
616 
617         bi_emit_collect_to(b, dst, srcs, DIV_ROUND_UP(count, 2));
618 }
619 
620 static void
621 bi_make_vec_to(bi_builder *b, bi_index dst,
622                 bi_index *src,
623                 unsigned *channel,
624                 unsigned count,
625                 unsigned bitsize)
626 {
627         if (bitsize == 32) {
628                 bi_index srcs[BI_MAX_VEC];
629 
630                 for (unsigned i = 0; i < count; ++i)
631                         srcs[i] = bi_extract(b, src[i], channel ? channel[i] : 0);
632 
633                 bi_emit_collect_to(b, dst, srcs, count);
634         } else if (bitsize == 16) {
635                 bi_make_vec16_to(b, dst, src, channel, count);
636         } else if (bitsize == 8 && count == 1) {
637                 bi_swz_v4i8_to(b, dst, bi_byte(
638                                         bi_extract(b, src[0], channel[0] >> 2),
639                                         channel[0] & 3));
640         } else {
641                 unreachable("8-bit mkvec not yet supported");
642         }
643 }
644 
645 static inline bi_instr *
646 bi_load_ubo_to(bi_builder *b, unsigned bitsize, bi_index dest0, bi_index src0,
647                 bi_index src1)
648 {
649         bi_instr *I;
650 
651         if (b->shader->arch >= 9) {
652                 I = bi_ld_buffer_to(b, bitsize, dest0, src0, src1);
653                 I->seg = BI_SEG_UBO;
654         } else {
655                 I = bi_load_to(b, bitsize, dest0, src0, src1, BI_SEG_UBO, 0);
656         }
657 
658         bi_emit_cached_split(b, dest0, bitsize);
659         return I;
660 }
661 
662 static bi_instr *
663 bi_load_sysval_to(bi_builder *b, bi_index dest, int sysval,
664                 unsigned nr_components, unsigned offset)
665 {
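        /* Sysvals are packed into their own UBO, either at a driver-fixed
         * index or appended after the shader's ordinary UBOs; each sysval
         * occupies a 16-byte (vec4) slot. */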
666         unsigned sysval_ubo = b->shader->inputs->fixed_sysval_ubo >= 0 ?
667                               b->shader->inputs->fixed_sysval_ubo :
668                               b->shader->nir->info.num_ubos;
669         unsigned uniform =
670                 pan_lookup_sysval(b->shader->sysval_to_id,
671                                   b->shader->info.sysvals,
672                                   sysval);
673         unsigned idx = (uniform * 16) + offset;
674 
675         return bi_load_ubo_to(b, nr_components * 32, dest,
676                               bi_imm_u32(idx), bi_imm_u32(sysval_ubo));
677 }
678 
679 static void
680 bi_load_sysval_nir(bi_builder *b, nir_intrinsic_instr *intr,
681                 unsigned nr_components, unsigned offset)
682 {
683         bi_load_sysval_to(b, bi_dest_index(&intr->dest),
684                         panfrost_sysval_for_instr(&intr->instr, NULL),
685                         nr_components, offset);
686 }
687 
688 static bi_index
689 bi_load_sysval(bi_builder *b, int sysval,
690                 unsigned nr_components, unsigned offset)
691 {
692         bi_index tmp = bi_temp(b->shader);
693         bi_load_sysval_to(b, tmp, sysval, nr_components, offset);
694         return tmp;
695 }
696 
697 static void
698 bi_load_sample_id_to(bi_builder *b, bi_index dst)
699 {
700         /* r61[16:23] contains the sample ID; mask it out. Upper bits
701          * seem to read garbage (despite being architecturally defined
702          * as zero), so use a 5-bit mask instead of an 8-bit one */
703 
704         bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f),
705                                 bi_imm_u8(16), false);
706 }
707 
708 static bi_index
709 bi_load_sample_id(bi_builder *b)
710 {
711         bi_index sample_id = bi_temp(b->shader);
712         bi_load_sample_id_to(b, sample_id);
713         return sample_id;
714 }
715 
716 static bi_index
717 bi_pixel_indices(bi_builder *b, unsigned rt)
718 {
719         /* We want to load the current pixel. */
720         struct bifrost_pixel_indices pix = {
721                 .y = BIFROST_CURRENT_PIXEL,
722                 .rt = rt
723         };
724 
725         uint32_t indices_u32 = 0;
726         memcpy(&indices_u32, &pix, sizeof(indices_u32));
727         bi_index indices = bi_imm_u32(indices_u32);
728 
729         /* Sample index above is left as zero. For multisampling, we need to
730          * fill in the actual sample ID in the lower byte */
731 
732         if (b->shader->inputs->blend.nr_samples > 1)
733                 indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false);
734 
735         return indices;
736 }
737 
738 /* Source color is passed through r0-r3, or r4-r7 for the second source when
739  * dual-source blending. Preload the corresponding vector.
740  */
741 static void
742 bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr)
743 {
744         nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
745         unsigned base = (sem.location == VARYING_SLOT_VAR0) ? 4 : 0;
746         unsigned size = nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr));
747         assert(size == 16 || size == 32);
748 
749         bi_index srcs[] = {
750                 bi_preload(b, base + 0), bi_preload(b, base + 1),
751                 bi_preload(b, base + 2), bi_preload(b, base + 3)
752         };
753 
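        /* Half-float colors pack two components per 32-bit register, so only
         * two of the four preloaded registers are needed. */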
754         bi_emit_collect_to(b, bi_dest_index(&instr->dest), srcs, size == 32 ? 4 : 2);
755 }
756 
757 static void
758 bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T,
759                  bi_index rgba2, nir_alu_type T2, unsigned rt)
760 {
761         /* On Valhall, BLEND does not encode the return address */
762         bool bifrost = b->shader->arch <= 8;
763 
764         /* Reads 2 or 4 staging registers to cover the input */
765         unsigned size = nir_alu_type_get_type_size(T);
766         unsigned size_2 = nir_alu_type_get_type_size(T2);
767         unsigned sr_count = (size <= 16) ? 2 : 4;
768         unsigned sr_count_2 = (size_2 <= 16) ? 2 : 4;
769         const struct panfrost_compile_inputs *inputs = b->shader->inputs;
770         uint64_t blend_desc = inputs->blend.bifrost_blend_desc;
771         enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
772 
773         if (inputs->is_blend && inputs->blend.nr_samples > 1) {
774                 /* Conversion descriptor comes from the compile inputs, pixel
775                  * indices derived at run time based on sample ID */
776                 bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_coverage(b),
777                                 bi_imm_u32(blend_desc >> 32),
778                                 regfmt, BI_VECSIZE_V4);
779         } else if (b->shader->inputs->is_blend) {
780                 uint64_t blend_desc = b->shader->inputs->blend.bifrost_blend_desc;
781 
782                 /* Blend descriptor comes from the compile inputs */
783                 /* Put the result in r0 */
784 
785                 bi_blend_to(b, bifrost ? bi_temp(b->shader) : bi_null(), rgba,
786                                 bi_coverage(b),
787                                 bi_imm_u32(blend_desc),
788                                 bi_imm_u32(blend_desc >> 32),
789                                 bi_null(), regfmt, sr_count, 0);
790         } else {
791                 /* Blend descriptor comes from the FAU RAM. By convention, the
792                  * return address on Bifrost is stored in r48 and will be used
793                  * by the blend shader to jump back to the fragment shader */
794 
795                 bi_blend_to(b, bifrost ? bi_temp(b->shader) : bi_null(), rgba,
796                                 bi_coverage(b),
797                                 bi_fau(BIR_FAU_BLEND_0 + rt, false),
798                                 bi_fau(BIR_FAU_BLEND_0 + rt, true),
799                                 rgba2, regfmt, sr_count, sr_count_2);
800         }
801 
802         assert(rt < 8);
803         b->shader->info.bifrost->blend[rt].type = T;
804 
805         if (T2)
806                 b->shader->info.bifrost->blend_src1_type = T2;
807 }
808 
809 /* Blend shaders do not need to run ATEST since they are dependent on a
810  * fragment shader that runs it. Blit shaders may not need to run ATEST, since
811  * ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and
812  * there are no writes to the coverage mask. The latter two are satisfied for
813  * all blit shaders, so we just care about early-z, which blit shaders force
814  * iff they do not write depth or stencil */
815 
816 static bool
817 bi_skip_atest(bi_context *ctx, bool emit_zs)
818 {
819         return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend;
820 }
821 
822 static void
823 bi_emit_atest(bi_builder *b, bi_index alpha)
824 {
825         bi_instr *atest = bi_atest_to(b, bi_temp(b->shader), bi_coverage(b), alpha);
826         b->shader->emitted_atest = true;
827         b->shader->coverage = atest->dest[0];
828 
829         /* Pseudo-source to encode in the tuple */
830         atest->src[2] = bi_fau(BIR_FAU_ATEST_PARAM, false);
831 }
832 
833 static void
834 bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)
835 {
836         bool combined = instr->intrinsic ==
837                 nir_intrinsic_store_combined_output_pan;
838 
839         unsigned writeout = combined ? nir_intrinsic_component(instr) :
840                 PAN_WRITEOUT_C;
841 
842         bool emit_blend = writeout & (PAN_WRITEOUT_C);
843         bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S);
844 
845         const nir_variable *var =
846                 nir_find_variable_with_driver_location(b->shader->nir,
847                                                        nir_var_shader_out, nir_intrinsic_base(instr));
848 
849         unsigned loc = var ? var->data.location : 0;
850 
851         bi_index src0 = bi_src_index(&instr->src[0]);
852 
853         /* By ISA convention, the coverage mask is stored in R60. The store
854          * itself will be handled by a subsequent ATEST instruction */
855         if (loc == FRAG_RESULT_SAMPLE_MASK) {
856                 bi_index orig = bi_coverage(b);
857                 bi_index msaa = bi_load_sysval(b, PAN_SYSVAL_MULTISAMPLED, 1, 0);
858                 bi_index new = bi_lshift_and_i32(b, orig, bi_extract(b, src0, 0), bi_imm_u8(0));
859 
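                /* Only apply the written mask when multisampling is enabled;
                 * otherwise keep the original coverage. */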
860                 b->shader->coverage =
861                         bi_mux_i32(b, orig, new, msaa, BI_MUX_INT_ZERO);
862                 return;
863         }
864 
865         /* Emit ATEST if we have to, note ATEST requires a floating-point alpha
866          * value, but render target #0 might not be floating point. However the
867          * alpha value is only used for alpha-to-coverage, a stage which is
868          * skipped for pure integer framebuffers, so the issue is moot. */
869 
870         if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) {
871                 nir_alu_type T = nir_intrinsic_src_type(instr);
872 
873                 bi_index rgba = bi_src_index(&instr->src[0]);
874                 bi_index alpha =
875                         (T == nir_type_float16) ? bi_half(bi_extract(b, rgba, 1), true) :
876                         (T == nir_type_float32) ? bi_extract(b, rgba, 3) :
877                         bi_dontcare(b);
878 
879                 /* Don't read out-of-bounds */
880                 if (nir_src_num_components(instr->src[0]) < 4)
881                         alpha = bi_imm_f32(1.0);
882 
883                 bi_emit_atest(b, alpha);
884         }
885 
886         if (emit_zs) {
887                 bi_index z = bi_dontcare(b), s = bi_dontcare(b);
888 
889                 if (writeout & PAN_WRITEOUT_Z)
890                         z = bi_src_index(&instr->src[2]);
891 
892                 if (writeout & PAN_WRITEOUT_S)
893                         s = bi_src_index(&instr->src[3]);
894 
895                 b->shader->coverage = bi_zs_emit(b, z, s, bi_coverage(b),
896                                                  writeout & PAN_WRITEOUT_S,
897                                                  writeout & PAN_WRITEOUT_Z);
898         }
899 
900         if (emit_blend) {
901                 unsigned rt = loc ? (loc - FRAG_RESULT_DATA0) : 0;
902                 bool dual = (writeout & PAN_WRITEOUT_2);
903                 bi_index color = bi_src_index(&instr->src[0]);
904                 bi_index color2 = dual ? bi_src_index(&instr->src[4]) : bi_null();
905                 nir_alu_type T2 = dual ? nir_intrinsic_dest_type(instr) : 0;
906 
907                 /* Explicit copy since BLEND inputs are precoloured to R0-R3,
908                  * TODO: maybe schedule around this or implement in RA as a
909                  * spill */
910                 bool has_mrt = false;
911 
912                 nir_foreach_shader_out_variable(var, b->shader->nir)
913                         has_mrt |= (var->data.location > FRAG_RESULT_DATA0);
914 
915                 if (has_mrt) {
916                         bi_index srcs[4] = { color, color, color, color };
917                         unsigned channels[4] = { 0, 1, 2, 3 };
918                         color = bi_temp(b->shader);
919                         bi_make_vec_to(b, color, srcs, channels,
920                                        nir_src_num_components(instr->src[0]),
921                                        nir_alu_type_get_type_size(nir_intrinsic_src_type(instr)));
922                 }
923 
924                 bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr),
925                                     color2, T2, rt);
926         }
927 
928         if (b->shader->inputs->is_blend) {
929                 /* Jump back to the fragment shader, return address is stored
930                  * in r48 (see above). On Valhall, only jump if the address is
931                  * nonzero. The check is free there and it implements the "jump
932                  * to 0 terminates the blend shader" that's automatic on
933                  * Bifrost.
934                  */
935                 if (b->shader->arch >= 8)
936                         bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE);
937                 else
938                         bi_jump(b, bi_preload(b, 48));
939         }
940 }
941 
942 /**
943  * In a vertex shader, is the specified variable a position output? These kinds
944  * of outputs are written from position shaders when IDVS is enabled. All other
945  * outputs are written from the varying shader.
946  */
947 static bool
948 bi_should_remove_store(nir_intrinsic_instr *intr, enum bi_idvs_mode idvs)
949 {
950         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
951 
952         switch (sem.location) {
953         case VARYING_SLOT_POS:
954         case VARYING_SLOT_PSIZ:
955                 return idvs == BI_IDVS_VARYING;
956         default:
957                 return idvs == BI_IDVS_POSITION;
958         }
959 }
960 
961 static bool
962 bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data)
963 {
964         enum bi_idvs_mode *idvs = data;
965 
966         if (instr->type != nir_instr_type_intrinsic)
967                 return false;
968 
969         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
970 
971         if (intr->intrinsic != nir_intrinsic_store_output)
972                 return false;
973 
974         if (bi_should_remove_store(intr, *idvs)) {
975                 nir_instr_remove(instr);
976                 return true;
977         }
978 
979         return false;
980 }
981 
982 static void
983 bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
984 {
985         /* In principle we can do better for 16-bit. At the moment we require
986          * 32-bit to permit the use of .auto, in order to force .u32 for flat
987          * varyings, to handle internal TGSI shaders that set flat in the VS
988          * but smooth in the FS */
989 
990         ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr);
991         ASSERTED unsigned T_size = nir_alu_type_get_type_size(T);
992         assert(T_size == 32 || (b->shader->arch >= 9 && T_size == 16));
993         enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
994 
995         unsigned imm_index = 0;
996         bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
997 
998         /* Only look at the total components needed. In effect, we fill in all
999          * the intermediate "holes" in the write mask, since we can't mask off
1000          * stores. Since nir_lower_io_to_temporaries ensures each varying is
1001          * written at most once, anything that's masked out is undefined, so it
1002          * doesn't matter what we write there. So we may as well do the
1003          * simplest thing possible. */
1004         unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr));
1005         assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0));
1006 
1007         bi_index data = bi_src_index(&instr->src[0]);
1008 
1009         /* To keep the vector dimensions consistent, we need to drop some
1010          * components. This should be coalesced.
1011          *
1012          * TODO: This is ugly and maybe inefficient. Would we rather
1013          * introduce a TRIM.i32 pseudoinstruction?
1014          */
1015         if (nr < nir_intrinsic_src_components(instr, 0)) {
1016                 assert(T_size == 32 && "todo: 16-bit trim");
1017 
1018                 bi_instr *split = bi_split_i32_to(b, bi_null(), data);
1019                 split->nr_dests = nir_intrinsic_src_components(instr, 0);
1020 
1021                 bi_index tmp = bi_temp(b->shader);
1022                 bi_instr *collect = bi_collect_i32_to(b, tmp);
1023                 collect->nr_srcs = nr;
1024 
1025                 for (unsigned w = 0; w < nr; ++w) {
1026                         split->dest[w] = bi_temp(b->shader);
1027                         collect->src[w] = split->dest[w];
1028                 }
1029 
1030                 data = tmp;
1031         }
1032 
1033         bool psiz = (nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PSIZ);
1034 
1035         bi_index a[4] = { bi_null() };
1036 
1037         if (b->shader->arch <= 8 && b->shader->idvs == BI_IDVS_POSITION) {
1038                 /* Bifrost position shaders have a fast path */
1039                 assert(T == nir_type_float16 || T == nir_type_float32);
1040                 unsigned regfmt = (T == nir_type_float16) ? 0 : 1;
1041                 unsigned identity = (b->shader->arch == 6) ? 0x688 : 0;
1042                 unsigned snap4 = 0x5E;
1043                 uint32_t format = identity | (snap4 << 12) | (regfmt << 24);
1044 
1045                 bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59),
1046                           bi_imm_u32(format), regfmt, nr - 1);
1047         } else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) {
1048                 bi_index index = bi_preload(b, 59);
1049 
1050                 if (psiz) {
1051                         assert(T_size == 16 && "should've been lowered");
1052                         index = bi_iadd_imm_i32(b, index, 4);
1053                 }
1054 
1055                 bi_index address = bi_lea_buf_imm(b, index);
1056                 bi_emit_split_i32(b, a, address, 2);
1057 
1058                 bool varying = (b->shader->idvs == BI_IDVS_VARYING);
1059 
1060                 bi_store(b, nr * nir_src_bit_size(instr->src[0]),
1061                          data, a[0], a[1],
1062                          varying ? BI_SEG_VARY : BI_SEG_POS,
1063                          varying ? bi_varying_offset(b->shader, instr) : 0);
1064         } else if (immediate) {
1065                 bi_index address = bi_lea_attr_imm(b,
1066                                           bi_vertex_id(b), bi_instance_id(b),
1067                                           regfmt, imm_index);
1068                 bi_emit_split_i32(b, a, address, 3);
1069 
1070                 bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
1071         } else {
1072                 bi_index idx =
1073                         bi_iadd_u32(b,
1074                                     bi_src_index(nir_get_io_offset_src(instr)),
1075                                     bi_imm_u32(nir_intrinsic_base(instr)),
1076                                     false);
1077                 bi_index address = bi_lea_attr(b,
1078                                       bi_vertex_id(b), bi_instance_id(b),
1079                                       idx, regfmt);
1080                 bi_emit_split_i32(b, a, address, 3);
1081 
1082                 bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
1083         }
1084 }
1085 
1086 static void
1087 bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
1088 {
1089         nir_src *offset = nir_get_io_offset_src(instr);
1090 
1091         bool offset_is_const = nir_src_is_const(*offset);
1092         bi_index dyn_offset = bi_src_index(offset);
1093         uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0;
1094         bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input);
1095 
1096         bi_load_ubo_to(b, instr->num_components * nir_dest_bit_size(instr->dest),
1097                         bi_dest_index(&instr->dest), offset_is_const ?
1098                         bi_imm_u32(const_offset) : dyn_offset,
1099                         kernel_input ? bi_zero() : bi_src_index(&instr->src[0]));
1100 }
1101 
1102 static void
1103 bi_emit_load_push_constant(bi_builder *b, nir_intrinsic_instr *instr)
1104 {
1105         assert(b->shader->inputs->no_ubo_to_push && "can't mix push constant forms");
1106 
1107         nir_src *offset = &instr->src[0];
1108         assert(nir_src_is_const(*offset) && "no indirect push constants");
1109         uint32_t base = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
1110         assert((base & 3) == 0 && "unaligned push constants");
1111 
1112         unsigned bits = nir_dest_bit_size(instr->dest) *
1113                         nir_dest_num_components(instr->dest);
1114 
1115         unsigned n = DIV_ROUND_UP(bits, 32);
1116         assert(n <= 4);
1117         bi_index channels[4] = { bi_null() };
1118 
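        /* Each FAU uniform slot is 64 bits wide, so 32-bit word w lives in
         * slot w >> 1, half w & 1. */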
1119         for (unsigned i = 0; i < n; ++i) {
1120                 unsigned word = (base >> 2) + i;
1121 
1122                 channels[i] = bi_fau(BIR_FAU_UNIFORM | (word >> 1), word & 1);
1123         }
1124 
1125         bi_emit_collect_to(b, bi_dest_index(&instr->dest), channels, n);
1126 }
1127 
1128 static bi_index
1129 bi_addr_high(bi_builder *b, nir_src *src)
1130 {
1131 	return (nir_src_bit_size(*src) == 64) ?
1132 		bi_extract(b, bi_src_index(src), 1) : bi_zero();
1133 }
1134 
1135 static void
1136 bi_handle_segment(bi_builder *b, bi_index *addr_lo, bi_index *addr_hi, enum bi_seg seg, int16_t *offset)
1137 {
1138         /* Not needed on Bifrost or for global accesses */
1139         if (b->shader->arch < 9 || seg == BI_SEG_NONE)
1140                 return;
1141 
1142         /* There is no segment modifier on Valhall. Instead, we need to
1143          * emit the arithmetic ourselves. We do have an offset
1144          * available, which saves an instruction for constant offsets.
1145          */
1146         bool wls = (seg == BI_SEG_WLS);
1147         assert(wls || (seg == BI_SEG_TL));
1148 
1149         enum bir_fau fau = wls ? BIR_FAU_WLS_PTR : BIR_FAU_TLS_PTR;
1150 
1151         bi_index base_lo = bi_fau(fau, false);
1152 
1153         if (offset && addr_lo->type == BI_INDEX_CONSTANT && addr_lo->value == (int16_t) addr_lo->value) {
1154                 *offset = addr_lo->value;
1155                 *addr_lo = base_lo;
1156         } else {
1157                 *addr_lo = bi_iadd_u32(b, base_lo, *addr_lo, false);
1158         }
1159 
1160         /* Do not allow overflow for WLS or TLS */
1161         *addr_hi = bi_fau(fau, true);
1162 }
1163 
1164 static void
1165 bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
1166 {
1167         int16_t offset = 0;
1168         unsigned bits = instr->num_components * nir_dest_bit_size(instr->dest);
1169         bi_index dest = bi_dest_index(&instr->dest);
1170         bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[0]), 0);
1171         bi_index addr_hi = bi_addr_high(b, &instr->src[0]);
1172 
1173         bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
1174 
1175         bi_load_to(b, bits, dest, addr_lo, addr_hi, seg, offset);
1176         bi_emit_cached_split(b, dest, bits);
1177 }
1178 
1179 static void
1180 bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
1181 {
1182         /* Require contiguous masks, guaranteed by nir_lower_wrmasks */
1183         assert(nir_intrinsic_write_mask(instr) ==
1184                         BITFIELD_MASK(instr->num_components));
1185 
1186         int16_t offset = 0;
1187         bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[1]), 0);
1188         bi_index addr_hi = bi_addr_high(b, &instr->src[1]);
1189 
1190         bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
1191 
1192         bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]),
1193                  bi_src_index(&instr->src[0]),
1194                  addr_lo, addr_hi, seg, offset);
1195 }
1196 
1197 /* Exchanges the staging register with memory */
1198 
1199 static void
1200 bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg, enum bi_seg seg)
1201 {
1202         assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
1203 
1204         unsigned sz = nir_src_bit_size(*arg);
1205         assert(sz == 32 || sz == 64);
1206 
1207         bi_index data = bi_src_index(arg);
1208 
1209         bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
1210 
1211         if (b->shader->arch >= 9)
1212                 bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
1213         else if (seg == BI_SEG_WLS)
1214                 addr_hi = bi_zero();
1215 
1216         bi_axchg_to(b, sz, dst, data, bi_extract(b, addr, 0), addr_hi, seg);
1217 }
1218 
1219 /* Exchanges the second staging register with memory if comparison with first
1220  * staging register passes */
1221 
1222 static void
1223 bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1, nir_src *arg_2, enum bi_seg seg)
1224 {
1225         assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
1226 
1227         /* hardware is swapped from NIR */
1228         bi_index src0 = bi_src_index(arg_2);
1229         bi_index src1 = bi_src_index(arg_1);
1230 
1231         unsigned sz = nir_src_bit_size(*arg_1);
1232         assert(sz == 32 || sz == 64);
1233 
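        /* Staging operand layout: two words for a 32-bit exchange, four for a
         * 64-bit one. Only the first 2 * (sz / 32) entries below are actually
         * collected and passed to the instruction. */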
1234         bi_index data_words[] = {
1235                 bi_extract(b, src0, 0),
1236                 sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src0, 1),
1237 
1238                 /* 64-bit */
1239                 bi_extract(b, src1, 0),
1240                 sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src1, 1),
1241         };
1242 
1243         bi_index in = bi_temp(b->shader);
1244         bi_emit_collect_to(b, in, data_words, 2 * (sz / 32));
1245         bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
1246 
1247         if (b->shader->arch >= 9)
1248                 bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
1249         else if (seg == BI_SEG_WLS)
1250                 addr_hi = bi_zero();
1251 
1252         bi_index out = bi_acmpxchg(b, sz, in, bi_extract(b, addr, 0), addr_hi, seg);
1253         bi_emit_cached_split(b, out, sz);
1254 
1255         bi_index inout_words[] = {
1256                 bi_extract(b, out, 0),
1257                 sz == 64 ? bi_extract(b, out, 1) : bi_null()
1258         };
1259 
1260         bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
1261 }
1262 
1263 /* Extracts an atomic opcode */
1264 
1265 static enum bi_atom_opc
1266 bi_atom_opc_for_nir(nir_intrinsic_op op)
1267 {
1268         switch (op) {
1269         case nir_intrinsic_global_atomic_add:
1270         case nir_intrinsic_shared_atomic_add:
1271         case nir_intrinsic_image_atomic_add:
1272                 return BI_ATOM_OPC_AADD;
1273 
1274         case nir_intrinsic_global_atomic_imin:
1275         case nir_intrinsic_shared_atomic_imin:
1276         case nir_intrinsic_image_atomic_imin:
1277                 return BI_ATOM_OPC_ASMIN;
1278 
1279         case nir_intrinsic_global_atomic_umin:
1280         case nir_intrinsic_shared_atomic_umin:
1281         case nir_intrinsic_image_atomic_umin:
1282                 return BI_ATOM_OPC_AUMIN;
1283 
1284         case nir_intrinsic_global_atomic_imax:
1285         case nir_intrinsic_shared_atomic_imax:
1286         case nir_intrinsic_image_atomic_imax:
1287                 return BI_ATOM_OPC_ASMAX;
1288 
1289         case nir_intrinsic_global_atomic_umax:
1290         case nir_intrinsic_shared_atomic_umax:
1291         case nir_intrinsic_image_atomic_umax:
1292                 return BI_ATOM_OPC_AUMAX;
1293 
1294         case nir_intrinsic_global_atomic_and:
1295         case nir_intrinsic_shared_atomic_and:
1296         case nir_intrinsic_image_atomic_and:
1297                 return BI_ATOM_OPC_AAND;
1298 
1299         case nir_intrinsic_global_atomic_or:
1300         case nir_intrinsic_shared_atomic_or:
1301         case nir_intrinsic_image_atomic_or:
1302                 return BI_ATOM_OPC_AOR;
1303 
1304         case nir_intrinsic_global_atomic_xor:
1305         case nir_intrinsic_shared_atomic_xor:
1306         case nir_intrinsic_image_atomic_xor:
1307                 return BI_ATOM_OPC_AXOR;
1308 
1309         default:
1310                 unreachable("Unexpected computational atomic");
1311         }
1312 }
1313 
1314 /* Optimized unary atomics are available with an implied #1 argument */
1315 
1316 static bool
1317 bi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out)
1318 {
1319         /* Check we have a compatible constant */
1320         if (arg.type != BI_INDEX_CONSTANT)
1321                 return false;
1322 
1323         if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD)))
1324                 return false;
1325 
1326         /* Check for a compatible operation */
1327         switch (op) {
1328         case BI_ATOM_OPC_AADD:
1329                 *out = (arg.value == 1) ? BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC;
1330                 return true;
1331         case BI_ATOM_OPC_ASMAX:
1332                 *out = BI_ATOM_OPC_ASMAX1;
1333                 return true;
1334         case BI_ATOM_OPC_AUMAX:
1335                 *out = BI_ATOM_OPC_AUMAX1;
1336                 return true;
1337         case BI_ATOM_OPC_AOR:
1338                 *out = BI_ATOM_OPC_AOR1;
1339                 return true;
1340         default:
1341                 return false;
1342         }
1343 }
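/* E.g. at the source level, atomicAdd(x, 1) ends up as AINC, atomicAdd(x, -1)
 * as ADEC and atomicOr(x, 1) as AOR1; any other operand keeps the generic
 * two-source form. (Illustrative reading of the table above, assuming the
 * constant survives into the NIR source.) */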
1344 
1345 /*
1346  * Coordinates are 16-bit integers in Bifrost but 32-bit in NIR. We need to
1347  * translate between these forms (with MKVEC.v2i16).
1348  *
1349  * Additionally, on Valhall, cube maps in the attribute pipe are treated as 2D
1350  * arrays.  For uniform handling, we also treat 3D textures like 2D arrays.
1351  *
1352  * Our indexing needs to reflect this.
1353  */
1354 static bi_index
1355 bi_emit_image_coord(bi_builder *b, bi_index coord, unsigned src_idx,
1356                     unsigned coord_comps, bool is_array)
1357 {
1358         assert(coord_comps > 0 && coord_comps <= 3);
1359 
1360         if (src_idx == 0) {
1361                 if (coord_comps == 1 || (coord_comps == 2 && is_array))
1362                         return bi_extract(b, coord, 0);
1363                 else
1364                         return bi_mkvec_v2i16(b,
1365                                               bi_half(bi_extract(b, coord, 0), false),
1366                                               bi_half(bi_extract(b, coord, 1), false));
1367         } else {
1368                 if (coord_comps == 3 && b->shader->arch >= 9)
1369                         return bi_mkvec_v2i16(b, bi_imm_u16(0),
1370                                               bi_half(bi_extract(b, coord, 2), false));
1371                 else if (coord_comps == 3)
1372                         return bi_extract(b, coord, 2);
1373                 else if (coord_comps == 2 && is_array)
1374                         return bi_extract(b, coord, 1);
1375                 else
1376                         return bi_zero();
1377         }
1378 }
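/* For example, a 2D array image with 32-bit NIR coordinates (x, y, layer)
 * packs the pixel position as MKVEC.v2i16(x.h0, y.h0) in the first word.
 * On Bifrost the second word is the 32-bit layer index untouched; on Valhall
 * (arch >= 9) the third component instead lands in the high half of a v2i16,
 * matching the 2D-array treatment described above. */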
1379 
1380 static bi_index
1381 bi_emit_image_index(bi_builder *b, nir_intrinsic_instr *instr)
1382 {
1383         nir_src src = instr->src[0];
1384         bi_index index = bi_src_index(&src);
1385         bi_context *ctx = b->shader;
1386 
1387         /* Images come after vertex attributes, so handle an explicit offset */
1388         unsigned offset = (ctx->stage == MESA_SHADER_VERTEX) ?
1389                 util_bitcount64(ctx->nir->info.inputs_read) : 0;
1390 
1391         if (offset == 0)
1392                 return index;
1393         else if (nir_src_is_const(src))
1394                 return bi_imm_u32(nir_src_as_uint(src) + offset);
1395         else
1396                 return bi_iadd_u32(b, index, bi_imm_u32(offset), false);
1397 }
1398 
1399 static void
1400 bi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr)
1401 {
1402         enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
1403         unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
1404         bool array = nir_intrinsic_image_array(instr);
1405         ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);
1406 
1407         bi_index coords = bi_src_index(&instr->src[1]);
1408         bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array);
1409         bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array);
1410         bi_index dest = bi_dest_index(&instr->dest);
1411         enum bi_register_format regfmt = bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr));
1412         enum bi_vecsize vecsize = instr->num_components - 1;
1413 
1414         /* TODO: MSAA */
1415         assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");
1416 
1417         if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
1418                 bi_instr *I = bi_ld_tex_imm_to(b, dest, xy, zw, regfmt, vecsize,
1419                                                 nir_src_as_uint(instr->src[0]));
1420 
1421                 I->table = PAN_TABLE_IMAGE;
1422         } else if (b->shader->arch >= 9) {
1423                 unreachable("Indirect images on Valhall not yet supported");
1424         } else {
1425                 bi_ld_attr_tex_to(b, dest, xy, zw,
1426                                   bi_emit_image_index(b, instr), regfmt,
1427                                   vecsize);
1428         }
1429 
1430         bi_split_dest(b, instr->dest);
1431 }
1432 
1433 static bi_index
1434 bi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr)
1435 {
1436         enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
1437         bool array = nir_intrinsic_image_array(instr);
1438         ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);
1439         unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
1440 
1441         /* TODO: MSAA */
1442         assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");
1443 
1444         enum bi_register_format type = (instr->intrinsic == nir_intrinsic_image_store) ?
1445                 bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)) :
1446                 BI_REGISTER_FORMAT_AUTO;
1447 
1448         bi_index coords = bi_src_index(&instr->src[1]);
1449         bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array);
1450         bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array);
1451         bi_index dest = bi_temp(b->shader);
1452 
1453         if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
1454                 bi_instr *I = bi_lea_tex_imm_to(b, dest, xy, zw, false,
1455                                                 nir_src_as_uint(instr->src[0]));
1456 
1457                 I->table = PAN_TABLE_IMAGE;
1458         } else if (b->shader->arch >= 9) {
1459                 unreachable("Indirect images on Valhall not yet supported");
1460         } else {
1461                 bi_instr *I = bi_lea_attr_tex_to(b, dest, xy, zw,
1462                                 bi_emit_image_index(b, instr), type);
1463 
1464                 /* LEA_ATTR_TEX defaults to the secondary attribute table, but
1465                  * our ABI has all images in the primary attribute table
1466                  */
1467                 I->table = BI_TABLE_ATTRIBUTE_1;
1468         }
1469 
1470         bi_emit_cached_split(b, dest, 3 * 32);
1471         return dest;
1472 }
1473 
1474 static void
1475 bi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr)
1476 {
1477         bi_index a[4] = { bi_null() };
1478         bi_emit_split_i32(b, a, bi_emit_lea_image(b, instr), 3);
1479 
1480         bi_st_cvt(b, bi_src_index(&instr->src[3]), a[0], a[1], a[2],
1481                      bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)),
1482                      instr->num_components - 1);
1483 }
1484 
1485 static void
1486 bi_emit_atomic_i32_to(bi_builder *b, bi_index dst,
1487                 bi_index addr, bi_index arg, nir_intrinsic_op intrinsic)
1488 {
1489         enum bi_atom_opc opc = bi_atom_opc_for_nir(intrinsic);
1490         enum bi_atom_opc post_opc = opc;
1491         bool bifrost = b->shader->arch <= 8;
1492 
1493         /* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't
1494          * take any vector but can still output in RETURN mode */
1495         bi_index tmp_dest = bifrost ? bi_temp(b->shader) : dst;
1496         unsigned sr_count = bifrost ? 2 : 1;
1497 
1498         /* Generate either ATOM or ATOM1 as required */
1499         if (bi_promote_atom_c1(opc, arg, &opc)) {
1500                 bi_atom1_return_i32_to(b, tmp_dest, bi_extract(b, addr, 0),
1501                                        bi_extract(b, addr, 1), opc, sr_count);
1502         } else {
1503                 bi_atom_return_i32_to(b, tmp_dest, arg, bi_extract(b, addr, 0),
1504                                       bi_extract(b, addr, 1), opc, sr_count);
1505         }
1506 
1507         if (bifrost) {
1508                 /* Post-process it */
1509                 bi_emit_cached_split_i32(b, tmp_dest, 2);
1510                 bi_atom_post_i32_to(b, dst, bi_extract(b, tmp_dest, 0), bi_extract(b, tmp_dest, 1), post_opc);
1511         }
1512 }
1513 
1514 /* gl_FragCoord.xy = u16_to_f32(R59.xy) + 0.5
1515  * gl_FragCoord.z = ld_vary(fragz)
1516  * gl_FragCoord.w = ld_vary(fragw)
1517  */
1518 
1519 static void
1520 bi_emit_load_frag_coord(bi_builder *b, nir_intrinsic_instr *instr)
1521 {
1522         bi_index src[4] = {};
1523 
1524         for (unsigned i = 0; i < 2; ++i) {
1525                 src[i] = bi_fadd_f32(b,
1526                                 bi_u16_to_f32(b, bi_half(bi_preload(b, 59), i)),
1527                                 bi_imm_f32(0.5f));
1528         }
1529 
1530         for (unsigned i = 0; i < 2; ++i) {
1531                 src[2 + i] = bi_ld_var_special(b, bi_zero(),
1532                                 BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER,
1533                                 BI_UPDATE_CLOBBER,
1534                                 (i == 0) ? BI_VARYING_NAME_FRAG_Z :
1535                                         BI_VARYING_NAME_FRAG_W,
1536                                 BI_VECSIZE_NONE);
1537         }
1538 
1539         bi_make_vec_to(b, bi_dest_index(&instr->dest), src, NULL, 4, 32);
1540 }
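/* For example, the fragment covering integer pixel (3, 7) preloads
 * R59 = {3, 7} as packed u16s, so gl_FragCoord.xy evaluates to (3.5, 7.5),
 * the pixel centre; .zw come straight from the FRAG_Z/FRAG_W varyings. */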
1541 
1542 static void
1543 bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
1544 {
1545         bi_index dest = bi_dest_index(&instr->dest);
1546         nir_alu_type T = nir_intrinsic_dest_type(instr);
1547         enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
1548         unsigned rt = b->shader->inputs->blend.rt;
1549         unsigned size = nir_dest_bit_size(instr->dest);
1550         unsigned nr = instr->num_components;
1551 
1552         /* Get the render target */
1553         if (!b->shader->inputs->is_blend) {
1554                 const nir_variable *var =
1555                         nir_find_variable_with_driver_location(b->shader->nir,
1556                                         nir_var_shader_out, nir_intrinsic_base(instr));
1557                 unsigned loc = var->data.location;
1558                 assert(loc >= FRAG_RESULT_DATA0);
1559                 rt = (loc - FRAG_RESULT_DATA0);
1560         }
1561 
1562         bi_index desc = b->shader->inputs->is_blend ?
1563                 bi_imm_u32(b->shader->inputs->blend.bifrost_blend_desc >> 32) :
1564                 b->shader->inputs->bifrost.static_rt_conv ?
1565                 bi_imm_u32(b->shader->inputs->bifrost.rt_conv[rt]) :
1566                 bi_load_sysval(b, PAN_SYSVAL(RT_CONVERSION, rt | (size << 4)), 1, 0);
1567 
1568         bi_ld_tile_to(b, dest, bi_pixel_indices(b, rt), bi_coverage(b), desc,
1569                       regfmt, nr - 1);
1570         bi_emit_cached_split(b, dest, size * nr);
1571 }
1572 
1573 static void
1574 bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
1575 {
1576         bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ?
1577                 bi_dest_index(&instr->dest) : bi_null();
1578         gl_shader_stage stage = b->shader->stage;
1579 
1580         switch (instr->intrinsic) {
1581         case nir_intrinsic_load_barycentric_pixel:
1582         case nir_intrinsic_load_barycentric_centroid:
1583         case nir_intrinsic_load_barycentric_sample:
1584         case nir_intrinsic_load_barycentric_at_sample:
1585         case nir_intrinsic_load_barycentric_at_offset:
1586                 /* handled later via load_vary */
1587                 break;
1588         case nir_intrinsic_load_interpolated_input:
1589         case nir_intrinsic_load_input:
1590                 if (b->shader->inputs->is_blend)
1591                         bi_emit_load_blend_input(b, instr);
1592                 else if (stage == MESA_SHADER_FRAGMENT)
1593                         bi_emit_load_vary(b, instr);
1594                 else if (stage == MESA_SHADER_VERTEX)
1595                         bi_emit_load_attr(b, instr);
1596                 else
1597                         unreachable("Unsupported shader stage");
1598                 break;
1599 
1600         case nir_intrinsic_store_output:
1601                 if (stage == MESA_SHADER_FRAGMENT)
1602                         bi_emit_fragment_out(b, instr);
1603                 else if (stage == MESA_SHADER_VERTEX)
1604                         bi_emit_store_vary(b, instr);
1605                 else
1606                         unreachable("Unsupported shader stage");
1607                 break;
1608 
1609         case nir_intrinsic_store_combined_output_pan:
1610                 assert(stage == MESA_SHADER_FRAGMENT);
1611                 bi_emit_fragment_out(b, instr);
1612                 break;
1613 
1614         case nir_intrinsic_load_ubo:
1615         case nir_intrinsic_load_kernel_input:
1616                 bi_emit_load_ubo(b, instr);
1617                 break;
1618 
1619         case nir_intrinsic_load_push_constant:
1620                 bi_emit_load_push_constant(b, instr);
1621                 break;
1622 
1623         case nir_intrinsic_load_global:
1624         case nir_intrinsic_load_global_constant:
1625                 bi_emit_load(b, instr, BI_SEG_NONE);
1626                 break;
1627 
1628         case nir_intrinsic_store_global:
1629                 bi_emit_store(b, instr, BI_SEG_NONE);
1630                 break;
1631 
1632         case nir_intrinsic_load_scratch:
1633                 bi_emit_load(b, instr, BI_SEG_TL);
1634                 break;
1635 
1636         case nir_intrinsic_store_scratch:
1637                 bi_emit_store(b, instr, BI_SEG_TL);
1638                 break;
1639 
1640         case nir_intrinsic_load_shared:
1641                 bi_emit_load(b, instr, BI_SEG_WLS);
1642                 break;
1643 
1644         case nir_intrinsic_store_shared:
1645                 bi_emit_store(b, instr, BI_SEG_WLS);
1646                 break;
1647 
1648         /* The blob doesn't seem to do anything for memory barriers; note that
1649          * +BARRIER is illegal in fragment shaders */
1650         case nir_intrinsic_memory_barrier:
1651         case nir_intrinsic_memory_barrier_buffer:
1652         case nir_intrinsic_memory_barrier_image:
1653         case nir_intrinsic_memory_barrier_shared:
1654         case nir_intrinsic_group_memory_barrier:
1655                 break;
1656 
1657         case nir_intrinsic_control_barrier:
1658                 assert(b->shader->stage != MESA_SHADER_FRAGMENT);
1659                 bi_barrier(b);
1660                 break;
1661 
1662         case nir_intrinsic_shared_atomic_add:
1663         case nir_intrinsic_shared_atomic_imin:
1664         case nir_intrinsic_shared_atomic_umin:
1665         case nir_intrinsic_shared_atomic_imax:
1666         case nir_intrinsic_shared_atomic_umax:
1667         case nir_intrinsic_shared_atomic_and:
1668         case nir_intrinsic_shared_atomic_or:
1669         case nir_intrinsic_shared_atomic_xor: {
1670                 assert(nir_src_bit_size(instr->src[1]) == 32);
1671 
1672                 bi_index addr = bi_src_index(&instr->src[0]);
1673                 bi_index addr_hi;
1674 
1675                 if (b->shader->arch >= 9) {
1676                         bi_handle_segment(b, &addr, &addr_hi, BI_SEG_WLS, NULL);
1677                         addr = bi_collect_v2i32(b, addr, addr_hi);
1678                 } else {
1679                         addr = bi_seg_add_i64(b, addr, bi_zero(), false, BI_SEG_WLS);
1680                         bi_emit_cached_split(b, addr, 64);
1681                 }
1682 
1683                 bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]),
1684                                 instr->intrinsic);
1685                 bi_split_dest(b, instr->dest);
1686                 break;
1687         }
1688 
1689         case nir_intrinsic_image_atomic_add:
1690         case nir_intrinsic_image_atomic_imin:
1691         case nir_intrinsic_image_atomic_umin:
1692         case nir_intrinsic_image_atomic_imax:
1693         case nir_intrinsic_image_atomic_umax:
1694         case nir_intrinsic_image_atomic_and:
1695         case nir_intrinsic_image_atomic_or:
1696         case nir_intrinsic_image_atomic_xor:
1697                 assert(nir_src_bit_size(instr->src[3]) == 32);
1698 
1699                 bi_emit_atomic_i32_to(b, dst,
1700                                 bi_emit_lea_image(b, instr),
1701                                 bi_src_index(&instr->src[3]),
1702                                 instr->intrinsic);
1703                 bi_split_dest(b, instr->dest);
1704                 break;
1705 
1706         case nir_intrinsic_global_atomic_add:
1707         case nir_intrinsic_global_atomic_imin:
1708         case nir_intrinsic_global_atomic_umin:
1709         case nir_intrinsic_global_atomic_imax:
1710         case nir_intrinsic_global_atomic_umax:
1711         case nir_intrinsic_global_atomic_and:
1712         case nir_intrinsic_global_atomic_or:
1713         case nir_intrinsic_global_atomic_xor:
1714                 assert(nir_src_bit_size(instr->src[1]) == 32);
1715 
1716                 bi_emit_atomic_i32_to(b, dst,
1717                                 bi_src_index(&instr->src[0]),
1718                                 bi_src_index(&instr->src[1]),
1719                                 instr->intrinsic);
1720 
1721                 bi_split_dest(b, instr->dest);
1722                 break;
1723 
1724         case nir_intrinsic_image_load:
1725                 bi_emit_image_load(b, instr);
1726                 break;
1727 
1728         case nir_intrinsic_image_store:
1729                 bi_emit_image_store(b, instr);
1730                 break;
1731 
1732         case nir_intrinsic_global_atomic_exchange:
1733                 bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]),
1734                                 &instr->src[1], BI_SEG_NONE);
1735                 bi_split_dest(b, instr->dest);
1736                 break;
1737 
1738         case nir_intrinsic_image_atomic_exchange:
1739                 bi_emit_axchg_to(b, dst, bi_emit_lea_image(b, instr),
1740                                 &instr->src[3], BI_SEG_NONE);
1741                 bi_split_dest(b, instr->dest);
1742                 break;
1743 
1744         case nir_intrinsic_shared_atomic_exchange:
1745                 bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]),
1746                                 &instr->src[1], BI_SEG_WLS);
1747                 bi_split_dest(b, instr->dest);
1748                 break;
1749 
1750         case nir_intrinsic_global_atomic_comp_swap:
1751                 bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]),
1752                                 &instr->src[1], &instr->src[2], BI_SEG_NONE);
1753                 bi_split_dest(b, instr->dest);
1754                 break;
1755 
1756         case nir_intrinsic_image_atomic_comp_swap:
1757                 bi_emit_acmpxchg_to(b, dst, bi_emit_lea_image(b, instr),
1758                                 &instr->src[3], &instr->src[4], BI_SEG_NONE);
1759                 bi_split_dest(b, instr->dest);
1760                 break;
1761 
1762         case nir_intrinsic_shared_atomic_comp_swap:
1763                 bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]),
1764                                 &instr->src[1], &instr->src[2], BI_SEG_WLS);
1765                 bi_split_dest(b, instr->dest);
1766                 break;
1767 
1768         case nir_intrinsic_load_frag_coord:
1769                 bi_emit_load_frag_coord(b, instr);
1770                 break;
1771 
1772         case nir_intrinsic_load_output:
1773                 bi_emit_ld_tile(b, instr);
1774                 break;
1775 
1776         case nir_intrinsic_discard_if:
1777                 bi_discard_b32(b, bi_src_index(&instr->src[0]));
1778                 break;
1779 
1780         case nir_intrinsic_discard:
1781                 bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ);
1782                 break;
1783 
1784         case nir_intrinsic_load_ssbo_address:
1785         case nir_intrinsic_load_xfb_address:
1786                 bi_load_sysval_nir(b, instr, 2, 0);
1787                 break;
1788 
1789         case nir_intrinsic_load_work_dim:
1790         case nir_intrinsic_load_num_vertices:
1791                 bi_load_sysval_nir(b, instr, 1, 0);
1792                 break;
1793 
1794         case nir_intrinsic_load_first_vertex:
1795                 bi_load_sysval_nir(b, instr, 1, 0);
1796                 break;
1797 
1798         case nir_intrinsic_load_base_vertex:
1799                 bi_load_sysval_nir(b, instr, 1, 4);
1800                 break;
1801 
1802         case nir_intrinsic_load_base_instance:
1803                 bi_load_sysval_nir(b, instr, 1, 8);
1804                 break;
1805 
1806         case nir_intrinsic_load_draw_id:
1807                 bi_load_sysval_nir(b, instr, 1, 0);
1808                 break;
1809 
1810         case nir_intrinsic_get_ssbo_size:
1811                 bi_load_sysval_nir(b, instr, 1, 8);
1812                 break;
1813 
1814         case nir_intrinsic_load_viewport_scale:
1815         case nir_intrinsic_load_viewport_offset:
1816         case nir_intrinsic_load_num_workgroups:
1817         case nir_intrinsic_load_workgroup_size:
1818                 bi_load_sysval_nir(b, instr, 3, 0);
1819                 break;
1820 
1821         case nir_intrinsic_image_size:
1822                 bi_load_sysval_nir(b, instr,
1823                                 nir_dest_num_components(instr->dest), 0);
1824                 break;
1825 
1826         case nir_intrinsic_load_blend_const_color_rgba:
1827                 bi_load_sysval_nir(b, instr,
1828                                    nir_dest_num_components(instr->dest), 0);
1829                 break;
1830 
1831 	case nir_intrinsic_load_sample_positions_pan:
1832                 bi_collect_v2i32_to(b, dst,
1833                                     bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false),
1834                                     bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true));
1835                 break;
1836 
1837 	case nir_intrinsic_load_sample_mask_in:
1838                 /* r61[0:15] contains the coverage bitmap */
1839                 bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false));
1840                 break;
1841 
1842         case nir_intrinsic_load_sample_id:
1843                 bi_load_sample_id_to(b, dst);
1844                 break;
1845 
1846 	case nir_intrinsic_load_front_face:
1847                 /* r58 == 0 means primitive is front facing */
1848                 bi_icmp_i32_to(b, dst, bi_preload(b, 58), bi_zero(), BI_CMPF_EQ,
1849                                 BI_RESULT_TYPE_M1);
1850                 break;
1851 
1852         case nir_intrinsic_load_point_coord:
1853                 bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32,
1854                                 BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER,
1855                                 BI_VARYING_NAME_POINT, BI_VECSIZE_V2);
1856                 bi_emit_cached_split_i32(b, dst, 2);
1857                 break;
1858 
1859         /* It appears vertex_id is zero-based with Bifrost geometry flows, but
1860          * not with Valhall's memory-allocation IDVS geometry flow. Ostensibly
1861          * we support the legacy geometry flow even on Valhall, so
1862          * vertex_id_zero_based isn't a machine property for us. Don't set it,
1863          * and lower here if needed.
1864          */
1865         case nir_intrinsic_load_vertex_id:
1866                 if (b->shader->malloc_idvs) {
1867                         bi_mov_i32_to(b, dst, bi_vertex_id(b));
1868                 } else {
1869                         bi_index first = bi_load_sysval(b,
1870                                                         PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS,
1871                                                         1, 0);
1872 
1873                         bi_iadd_u32_to(b, dst, bi_vertex_id(b), first, false);
1874                 }
1875 
1876                 break;
1877 
1878         /* We only use this in our transform feedback lowering */
1879         case nir_intrinsic_load_vertex_id_zero_base:
1880                 assert(b->shader->nir->info.has_transform_feedback_varyings);
1881                 bi_mov_i32_to(b, dst, bi_vertex_id(b));
1882                 break;
1883 
1884         case nir_intrinsic_load_instance_id:
1885                 bi_mov_i32_to(b, dst, bi_instance_id(b));
1886                 break;
1887 
1888         case nir_intrinsic_load_subgroup_invocation:
1889                 bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false));
1890                 break;
1891 
1892         case nir_intrinsic_load_local_invocation_id:
1893                 bi_collect_v3i32_to(b, dst,
1894                                     bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 0)),
1895                                     bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 1)),
1896                                     bi_u16_to_u32(b, bi_half(bi_preload(b, 56), 0)));
1897                 break;
1898 
1899         case nir_intrinsic_load_workgroup_id:
1900                 bi_collect_v3i32_to(b, dst, bi_preload(b, 57), bi_preload(b, 58),
1901                                     bi_preload(b, 59));
1902                 break;
1903 
1904         case nir_intrinsic_load_global_invocation_id:
1905         case nir_intrinsic_load_global_invocation_id_zero_base:
1906                 bi_collect_v3i32_to(b, dst, bi_preload(b, 60), bi_preload(b, 61),
1907                                     bi_preload(b, 62));
1908                 break;
1909 
1910         case nir_intrinsic_shader_clock:
1911                 bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER);
1912                 bi_split_dest(b, instr->dest);
1913                 break;
1914 
1915         default:
1916                 fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
1917                 assert(0);
1918         }
1919 }
1920 
1921 static void
1922 bi_emit_load_const(bi_builder *b, nir_load_const_instr *instr)
1923 {
1924         /* Make sure we've been lowered */
1925         assert(instr->def.num_components <= (32 / instr->def.bit_size));
1926 
1927         /* Accumulate all the channels of the constant, as if we did an
1928          * implicit SEL over them */
1929         uint32_t acc = 0;
1930 
1931         for (unsigned i = 0; i < instr->def.num_components; ++i) {
1932                 unsigned v = nir_const_value_as_uint(instr->value[i], instr->def.bit_size);
1933                 acc |= (v << (i * instr->def.bit_size));
1934         }
1935 
1936         bi_mov_i32_to(b, bi_get_index(instr->def.index, false, 0), bi_imm_u32(acc));
1937 }
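/* For example, a v2i16 constant {0x1234, 0xABCD} accumulates to
 * acc = 0xABCD1234 and is emitted as a single 32-bit immediate move;
 * the assert above guarantees everything fits in one word. */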
1938 
1939 static bi_index
1940 bi_alu_src_index(bi_builder *b, nir_alu_src src, unsigned comps)
1941 {
1942         /* we don't lower modifiers until the backend */
1943         assert(!(src.negate || src.abs));
1944 
1945         unsigned bitsize = nir_src_bit_size(src.src);
1946 
1947         /* the bi_index carries the 32-bit (word) offset separately from the
1948          * subword swizzle; handle the offset first */
1949 
1950         unsigned offset = 0;
1951 
1952         assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
1953         unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
1954 
1955         for (unsigned i = 0; i < comps; ++i) {
1956                 unsigned new_offset = (src.swizzle[i] >> subword_shift);
1957 
1958                 if (i > 0)
1959                         assert(offset == new_offset && "wrong vectorization");
1960 
1961                 offset = new_offset;
1962         }
1963 
1964         bi_index idx = bi_extract(b, bi_src_index(&src.src), offset);
1965 
1966         /* Compose the subword swizzle with existing (identity) swizzle */
1967         assert(idx.swizzle == BI_SWIZZLE_H01);
1968 
1969         /* Bigger vectors should have been lowered */
1970         assert(comps <= (1 << subword_shift));
1971 
1972         if (bitsize == 16) {
1973                 unsigned c0 = src.swizzle[0] & 1;
1974                 unsigned c1 = (comps > 1) ? src.swizzle[1] & 1 : c0;
1975                 idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1);
1976         } else if (bitsize == 8) {
1977                 /* 8-bit vectors not yet supported */
1978                 assert(comps == 1 && "8-bit vectors not supported");
1979                 assert(src.swizzle[0] < 4 && "8-bit vectors not supported");
1980                 idx.swizzle = BI_SWIZZLE_B0000 + src.swizzle[0];
1981         }
1982 
1983         return idx;
1984 }
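/* Worked example: a 16-bit ALU source reading .zw of a vec4 has
 * swizzle = {2, 3}, so both components live in word 1 (2 >> 1 == 3 >> 1)
 * and the subword swizzle is H01 (c0 = 0, c1 = 1); reading .wz instead
 * would give H10. */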
1985 
1986 static enum bi_round
1987 bi_nir_round(nir_op op)
1988 {
1989         switch (op) {
1990         case nir_op_fround_even: return BI_ROUND_NONE;
1991         case nir_op_ftrunc: return BI_ROUND_RTZ;
1992         case nir_op_fceil: return BI_ROUND_RTP;
1993         case nir_op_ffloor: return BI_ROUND_RTN;
1994         default: unreachable("invalid nir round op");
1995         }
1996 }
1997 
1998 /* Convenience for lowered transcendentals */
1999 
2000 static bi_index
2001 bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1)
2002 {
2003         return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f));
2004 }
2005 
2006 /* Approximate with FRCP_APPROX.f32 and apply a single iteration of
2007  * Newton-Raphson to improve precision */
2008 
2009 static void
2010 bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0)
2011 {
2012         bi_index x1 = bi_frcp_approx_f32(b, s0);
2013         bi_index m  = bi_frexpm_f32(b, s0, false, false);
2014         bi_index e  = bi_frexpe_f32(b, bi_neg(s0), false, false);
2015         bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0),
2016                         bi_zero(), BI_SPECIAL_N);
2017         bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e, BI_SPECIAL_NONE);
2018 }
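/* The refinement above is the usual Newton-Raphson step for a reciprocal,
 * x' = x1 * (2 - a * x1) = x1 + x1 * (1 - a * x1): t1 holds the residual
 * against the extracted mantissa and the final FMA_RSCALE folds the
 * x1 + t1 * x1 update together with the 2^e exponent rescale. This is a
 * rough sketch; the exact scaling performed by the SPECIAL_N mode and the
 * negated frexp source is not spelled out here. */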
2019 
2020 static void
2021 bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0)
2022 {
2023         bi_index x1 = bi_frsq_approx_f32(b, s0);
2024         bi_index m  = bi_frexpm_f32(b, s0, false, true);
2025         bi_index e  = bi_frexpe_f32(b, bi_neg(s0), false, true);
2026         bi_index t1 = bi_fmul_f32(b, x1, x1);
2027         bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0),
2028                         bi_imm_u32(-1), BI_SPECIAL_N);
2029         bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e, BI_SPECIAL_N);
2030 }
2031 
2032 /* More complex transcendentals, see
2033  * https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc
2034  * for documentation */
2035 
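/* A note on the magic numbers in the exp2 lowering below: 0x49400000 is
 * 786432.0f = 1.5 * 2^19, whose ULP is 2^-4, so adding it (with clamping)
 * rounds s0 to four fractional bits and leaves s0 * 16 in the low mantissa
 * bits. FEXP_TABLE.u4 consumes that 4-bit fraction, the integer subtract
 * plus ARSHIFT by 4 recovers the integer part for the final rescale, and
 * the polynomial on the clamped residual (coefficients roughly ln2,
 * ln2^2/2, ln2^3/6) accounts for the remaining error. This reading is
 * reconstructed from the constants rather than from hardware docs. */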
2036 static void
2037 bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0)
2038 {
2039         bi_index t1 = bi_temp(b->shader);
2040         bi_instr *t1_instr = bi_fadd_f32_to(b, t1, s0, bi_imm_u32(0x49400000));
2041         t1_instr->clamp = BI_CLAMP_CLAMP_0_INF;
2042 
2043         bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000));
2044 
2045         bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader), s0, bi_neg(t2));
2046         a2->clamp = BI_CLAMP_CLAMP_M1_1;
2047 
2048         bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE);
2049         bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false);
2050         bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4));
2051         bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635),
2052                         bi_imm_u32(0x3e75fffa));
2053         bi_index p2 = bi_fma_f32(b, p1, a2->dest[0], bi_imm_u32(0x3f317218));
2054         bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2);
2055         bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader),
2056                         p3, a1t, a1t, a1i, BI_SPECIAL_NONE);
2057         x->clamp = BI_CLAMP_CLAMP_0_INF;
2058 
2059         bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0);
2060         max->sem = BI_SEM_NAN_PROPAGATE;
2061 }
2062 
2063 static void
2064 bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base)
2065 {
2066         /* Scale by the base, multiply by 2^24, and convert to integer to get
2067          * an 8:24 fixed-point input */
2068         bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(),
2069                         bi_imm_u32(24), BI_SPECIAL_NONE);
2070         bi_instr *fixed_pt = bi_f32_to_s32_to(b, bi_temp(b->shader), scale);
2071         fixed_pt->round = BI_ROUND_NONE; // XXX
2072 
2073         /* Compute the result for the fixed-point input, but pass along
2074          * the floating-point scale for correct NaN propagation */
2075         bi_fexp_f32_to(b, dst, fixed_pt->dest[0], scale);
2076 }
2077 
2078 static void
2079 bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
2080 {
2081         /* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */
2082         bi_index a1 = bi_frexpm_f32(b, s0, true, false);
2083         bi_index ei = bi_frexpe_f32(b, s0, true, false);
2084         bi_index ef = bi_s32_to_f32(b, ei);
2085 
2086         /* xt estimates -log(r1), a coarse approximation of log(a1) */
2087         bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE);
2088         bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE);
2089 
2090         /* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) -
2091          * log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1),
2092          * and then log(s0) = x1 + x2 */
2093         bi_index x1 = bi_fadd_f32(b, ef, xt);
2094 
2095         /* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by
2096          * polynomial approximation around 1. The series is expressed around
2097          * 1, so set y = (a1 * r1) - 1.0 */
2098         bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0));
2099 
2100         /* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate
2101          * log_e(1 + y) by the Taylor series (lower precision than the blob):
2102          * y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */
2103         bi_index loge = bi_fmul_f32(b, y,
2104                 bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0)));
2105 
2106         bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0)));
2107 
2108         /* log(s0) = x1 + x2 */
2109         bi_fadd_f32_to(b, dst, x1, x2);
2110 }
2111 
2112 static void
2113 bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
2114 {
2115         bi_index frexp = bi_frexpe_f32(b, s0, true, false);
2116         bi_index frexpi = bi_s32_to_f32(b, frexp);
2117         bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0);
2118         bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi);
2119 }
2120 
2121 static void
2122 bi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
2123 {
2124         bi_index log2_base = bi_null();
2125 
2126         if (base.type == BI_INDEX_CONSTANT) {
2127                 log2_base = bi_imm_f32(log2f(uif(base.value)));
2128         } else {
2129                 log2_base = bi_temp(b->shader);
2130                 bi_lower_flog2_32(b, log2_base, base);
2131         }
2132 
2133         return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base));
2134 }
2135 
2136 static void
2137 bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
2138 {
2139         bi_index log2_base = bi_null();
2140 
2141         if (base.type == BI_INDEX_CONSTANT) {
2142                 log2_base = bi_imm_f32(log2f(uif(base.value)));
2143         } else {
2144                 log2_base = bi_temp(b->shader);
2145                 bi_flog2_32(b, log2_base, base);
2146         }
2147 
2148         return bi_fexp_32(b, dst, exp, log2_base);
2149 }
2150 
2151 /* Bifrost has extremely coarse tables for approximating sin/cos, accessible as
2152  * FSIN/COS_TABLE.u6, which multiplies the bottom 6 bits by pi/32 and
2153  * calculates the results. We use them to calculate sin/cos via a Taylor
2154  * approximation:
2155  *
2156  * f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x)
2157  * sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x)
2158  * cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x)
2159  */
2160 
2161 #define TWO_OVER_PI  bi_imm_f32(2.0f / 3.14159f)
2162 #define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0)
2163 #define SINCOS_BIAS  bi_imm_u32(0x49400000)
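/* SINCOS_BIAS is the same 1.5 * 2^19 = 786432.0f rounding trick as above:
 * at that magnitude the float ULP is 1/16, so after FMA(s0, 2/pi, BIAS)
 * the low mantissa bits hold s0 * (2/pi) in sixteenths, and the bottom
 * 6 bits index the table in steps of (1/16) * (pi/2) = pi/32, wrapping
 * every 2*pi. */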
2164 
2165 static void
2166 bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
2167 {
2168         /* the bottom 6 bits of the result, times pi/32, approximate s0 mod 2pi */
2169         bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS);
2170 
2171         /* Approximate domain error (small) */
2172         bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS)),
2173                         MPI_OVER_TWO, s0);
2174 
2175         /* Lookup sin(x), cos(x) */
2176         bi_index sinx = bi_fsin_table_u6(b, x_u6, false);
2177         bi_index cosx = bi_fcos_table_u6(b, x_u6, false);
2178 
2179         /* e^2 / 2 */
2180         bi_index e2_over_2 = bi_fma_rscale_f32(b, e, e, bi_negzero(),
2181                         bi_imm_u32(-1), BI_SPECIAL_NONE);
2182 
2183         /* (-e^2)/2 f''(x) */
2184         bi_index quadratic = bi_fma_f32(b, bi_neg(e2_over_2),
2185                         cos ? cosx : sinx,
2186                         bi_negzero());
2187 
2188         /* e f'(x) - (e^2/2) f''(x) */
2189         bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e,
2190                         cos ? bi_neg(sinx) : cosx,
2191                         quadratic);
2192         I->clamp = BI_CLAMP_CLAMP_M1_1;
2193 
2194         /* f(x) + e f'(x) - (e^2/2) f''(x) */
2195         bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx);
2196 }
2197 
2198 /*
2199  * The XOR lane op is useful for derivative calculations, but not all Bifrost
2200  * implementations have it. Add a safe helper that uses the hardware
2201  * functionality when available and lowers where unavailable.
2202  */
2203 static bi_index
2204 bi_clper_xor(bi_builder *b, bi_index s0, bi_index s1)
2205 {
2206         if (!(b->shader->quirks & BIFROST_LIMITED_CLPER)) {
2207                 return bi_clper_i32(b, s0, s1,
2208                                 BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_XOR,
2209                                 BI_SUBGROUP_SUBGROUP4);
2210         }
2211 
2212         bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false);
2213         bi_index lane = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0));
2214         return bi_clper_old_i32(b, s0, lane);
2215 }
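/* The derivative lowering below passes s1 = 1 (fddx) or s1 = 2 (fddy),
 * so each lane reads its neighbour across the corresponding axis of the
 * 2x2 quad (lane ^ 1 horizontally, lane ^ 2 vertically) and then takes
 * the difference. */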
2216 
2217 static enum bi_cmpf
2218 bi_translate_cmpf(nir_op op)
2219 {
2220         switch (op) {
2221         case nir_op_ieq8:
2222         case nir_op_ieq16:
2223         case nir_op_ieq32:
2224         case nir_op_feq16:
2225         case nir_op_feq32:
2226                 return BI_CMPF_EQ;
2227 
2228         case nir_op_ine8:
2229         case nir_op_ine16:
2230         case nir_op_ine32:
2231         case nir_op_fneu16:
2232         case nir_op_fneu32:
2233                 return BI_CMPF_NE;
2234 
2235         case nir_op_ilt8:
2236         case nir_op_ilt16:
2237         case nir_op_ilt32:
2238         case nir_op_flt16:
2239         case nir_op_flt32:
2240         case nir_op_ult8:
2241         case nir_op_ult16:
2242         case nir_op_ult32:
2243                 return BI_CMPF_LT;
2244 
2245         case nir_op_ige8:
2246         case nir_op_ige16:
2247         case nir_op_ige32:
2248         case nir_op_fge16:
2249         case nir_op_fge32:
2250         case nir_op_uge8:
2251         case nir_op_uge16:
2252         case nir_op_uge32:
2253                 return BI_CMPF_GE;
2254 
2255         default:
2256                 unreachable("invalid comparison");
2257         }
2258 }
2259 
2260 static bool
2261 bi_nir_is_replicated(nir_alu_src *src)
2262 {
2263         for (unsigned i = 1; i < nir_src_num_components(src->src); ++i) {
2264                 if (src->swizzle[0] != src->swizzle[i])
2265                         return false;
2266         }
2267 
2268         return true;
2269 }
2270 
2271 static void
2272 bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
2273 {
2274         bi_index dst = bi_dest_index(&instr->dest.dest);
2275         unsigned srcs = nir_op_infos[instr->op].num_inputs;
2276         unsigned sz = nir_dest_bit_size(instr->dest.dest);
2277         unsigned comps = nir_dest_num_components(instr->dest.dest);
2278         unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0;
2279 
2280         /* Indicate scalarness */
2281         if (sz == 16 && comps == 1)
2282                 dst.swizzle = BI_SWIZZLE_H00;
2283 
2284         if (!instr->dest.dest.is_ssa) {
2285                 for (unsigned i = 0; i < comps; ++i)
2286                         assert(instr->dest.write_mask & (1 << i));
2287         }
2288 
2289         /* First, match against the various moves in NIR. These are
2290          * special-cased because they can operate on vectors even after
2291          * lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the
2292          * instruction is no "bigger" than SIMD-within-a-register. These moves
2293          * are the exceptions that need to handle swizzles specially. */
2294 
2295         switch (instr->op) {
2296         case nir_op_vec2:
2297         case nir_op_vec3:
2298         case nir_op_vec4: {
2299                 bi_index unoffset_srcs[4] = {
2300                         srcs > 0 ? bi_src_index(&instr->src[0].src) : bi_null(),
2301                         srcs > 1 ? bi_src_index(&instr->src[1].src) : bi_null(),
2302                         srcs > 2 ? bi_src_index(&instr->src[2].src) : bi_null(),
2303                         srcs > 3 ? bi_src_index(&instr->src[3].src) : bi_null(),
2304                 };
2305 
2306                 unsigned channels[4] = {
2307                         instr->src[0].swizzle[0],
2308                         instr->src[1].swizzle[0],
2309                         srcs > 2 ? instr->src[2].swizzle[0] : 0,
2310                         srcs > 3 ? instr->src[3].swizzle[0] : 0,
2311                 };
2312 
2313                 bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz);
2314                 return;
2315         }
2316 
2317         case nir_op_vec8:
2318         case nir_op_vec16:
2319                 unreachable("should've been lowered");
2320 
2321         case nir_op_unpack_32_2x16: {
2322                 /* Should have been scalarized */
2323                 assert(comps == 2 && sz == 16);
2324 
2325                 bi_index vec = bi_src_index(&instr->src[0].src);
2326                 unsigned chan = instr->src[0].swizzle[0];
2327 
2328                 bi_mov_i32_to(b, dst, bi_extract(b, vec, chan));
2329                 return;
2330         }
2331 
2332         case nir_op_unpack_64_2x32_split_x:
2333                 bi_mov_i32_to(b, dst, bi_extract(b, bi_src_index(&instr->src[0].src), 0));
2334                 return;
2335 
2336         case nir_op_unpack_64_2x32_split_y:
2337                 bi_mov_i32_to(b, dst, bi_extract(b, bi_src_index(&instr->src[0].src), 1));
2338                 return;
2339 
2340         case nir_op_pack_64_2x32_split:
2341                 bi_collect_v2i32_to(b, dst,
2342                                     bi_extract(b, bi_src_index(&instr->src[0].src), instr->src[0].swizzle[0]),
2343                                     bi_extract(b, bi_src_index(&instr->src[1].src), instr->src[1].swizzle[0]));
2344                 return;
2345 
2346         case nir_op_pack_64_2x32:
2347                 bi_collect_v2i32_to(b, dst,
2348                                     bi_extract(b, bi_src_index(&instr->src[0].src), 0),
2349                                     bi_extract(b, bi_src_index(&instr->src[0].src), 1));
2350                 return;
2351 
2352         case nir_op_pack_uvec2_to_uint: {
2353                 bi_index src = bi_src_index(&instr->src[0].src);
2354 
2355                 assert(sz == 32 && src_sz == 32);
2356                 bi_mkvec_v2i16_to(b, dst, bi_half(bi_extract(b, src, 0), false),
2357                                           bi_half(bi_extract(b, src, 1), false));
2358                 return;
2359         }
2360 
2361         case nir_op_pack_uvec4_to_uint: {
2362                 bi_index src = bi_src_index(&instr->src[0].src);
2363 
2364                 assert(sz == 32 && src_sz == 32);
2365                 bi_mkvec_v4i8_to(b, dst, bi_byte(bi_extract(b, src, 0), 0),
2366                                          bi_byte(bi_extract(b, src, 1), 0),
2367                                          bi_byte(bi_extract(b, src, 2), 0),
2368                                          bi_byte(bi_extract(b, src, 3), 0));
2369                 return;
2370         }
2371 
2372         case nir_op_mov: {
2373                 bi_index idx = bi_src_index(&instr->src[0].src);
2374                 bi_index unoffset_srcs[4] = { idx, idx, idx, idx };
2375 
2376                 unsigned channels[4] = {
2377                         comps > 0 ? instr->src[0].swizzle[0] : 0,
2378                         comps > 1 ? instr->src[0].swizzle[1] : 0,
2379                         comps > 2 ? instr->src[0].swizzle[2] : 0,
2380                         comps > 3 ? instr->src[0].swizzle[3] : 0,
2381                 };
2382 
2383                 bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, src_sz);
2384                 return;
2385         }
2386 
2387         case nir_op_pack_32_2x16: {
2388                 assert(nir_src_num_components(instr->src[0].src) == 2);
2389                 assert(comps == 1);
2390 
2391                 bi_index idx = bi_src_index(&instr->src[0].src);
2392                 bi_index unoffset_srcs[4] = { idx, idx, idx, idx };
2393 
2394                 unsigned channels[2] = {
2395                         instr->src[0].swizzle[0],
2396                         instr->src[0].swizzle[1]
2397                 };
2398 
2399                 bi_make_vec_to(b, dst, unoffset_srcs, channels, 2, 16);
2400                 return;
2401         }
2402 
2403         case nir_op_f2f16:
2404         case nir_op_f2f16_rtz:
2405         case nir_op_f2f16_rtne: {
2406                 assert(src_sz == 32);
2407                 bi_index idx = bi_src_index(&instr->src[0].src);
2408                 bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2409                 bi_index s1 = comps > 1 ?
2410                         bi_extract(b, idx, instr->src[0].swizzle[1]) : s0;
2411 
2412                 bi_instr *I = bi_v2f32_to_v2f16_to(b, dst, s0, s1);
2413 
2414                 /* Override rounding if explicitly requested. Otherwise, the
2415                  * default rounding mode is selected by the builder. Depending
2416                  * on the float controls required by the shader, the default
2417                  * mode may not be nearest-even.
2418                  */
2419                 if (instr->op == nir_op_f2f16_rtz)
2420                         I->round = BI_ROUND_RTZ;
2421                 else if (instr->op == nir_op_f2f16_rtne)
2422                         I->round = BI_ROUND_NONE; /* Nearest even */
2423 
2424                 return;
2425         }
2426 
2427         /* Vectorized downcasts */
2428         case nir_op_u2u16:
2429         case nir_op_i2i16: {
2430                 if (!(src_sz == 32 && comps == 2))
2431                         break;
2432 
2433                 bi_index idx = bi_src_index(&instr->src[0].src);
2434                 bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2435                 bi_index s1 = bi_extract(b, idx, instr->src[0].swizzle[1]);
2436 
2437                 bi_mkvec_v2i16_to(b, dst,
2438                                 bi_half(s0, false), bi_half(s1, false));
2439                 return;
2440         }
2441 
2442         /* While we do not have a direct V2U32_TO_V2F16 instruction, lowering to
2443          * MKVEC.v2i16 + V2U16_TO_V2F16 is more efficient on Bifrost than
2444          * scalarizing due to scheduling (equal cost on Valhall). Additionally
2445          * if the source is replicated the MKVEC.v2i16 can be optimized out.
2446          */
2447         case nir_op_u2f16:
2448         case nir_op_i2f16: {
2449                 if (!(src_sz == 32 && comps == 2))
2450                         break;
2451 
2452                 nir_alu_src *src = &instr->src[0];
2453                 bi_index idx = bi_src_index(&src->src);
2454                 bi_index s0 = bi_extract(b, idx, src->swizzle[0]);
2455                 bi_index s1 = bi_extract(b, idx, src->swizzle[1]);
2456 
2457                 bi_index t = (src->swizzle[0] == src->swizzle[1]) ?
2458                         bi_half(s0, false) :
2459                         bi_mkvec_v2i16(b, bi_half(s0, false),
2460                                           bi_half(s1, false));
2461 
2462                 if (instr->op == nir_op_u2f16)
2463                         bi_v2u16_to_v2f16_to(b, dst, t);
2464                 else
2465                         bi_v2s16_to_v2f16_to(b, dst, t);
2466 
2467                 return;
2468         }
2469 
2470         case nir_op_i2i8:
2471         case nir_op_u2u8:
2472         {
2473                 /* Acts like an 8-bit swizzle */
2474                 bi_index idx = bi_src_index(&instr->src[0].src);
2475                 unsigned factor = src_sz / 8;
2476                 unsigned chan[4] = { 0 };
2477 
2478                 for (unsigned i = 0; i < comps; ++i)
2479                         chan[i] = instr->src[0].swizzle[i] * factor;
2480 
2481                 bi_make_vec_to(b, dst, &idx, chan, comps, 8);
2482                 return;
2483         }
2484 
2485         case nir_op_b32csel:
2486         {
2487                 if (sz != 16)
2488                         break;
2489 
2490                 /* We allow vectorizing b32csel(cond, A, B) which can be
2491                  * translated as MUX.v2i16, even though cond is a 32-bit vector.
2492                  *
2493                  * If the source condition vector is replicated, we can use
2494                  * MUX.v2i16 directly, letting each component use the
2495                  * corresponding half of the 32-bit source. NIR uses 0/~0
2496                  * booleans so that's guaranteed to work (that is, 32-bit NIR
2497                  * booleans are 16-bit replicated).
2498                  *
2499                  * If we're not replicated, we use the same trick but must
2500                  * insert a MKVEC.v2i16 first to convert down to 16-bit.
2501                  */
2502                 bi_index idx = bi_src_index(&instr->src[0].src);
2503                 bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2504                 bi_index s1 = bi_alu_src_index(b, instr->src[1], comps);
2505                 bi_index s2 = bi_alu_src_index(b, instr->src[2], comps);
2506 
2507                 if (!bi_nir_is_replicated(&instr->src[0])) {
2508                         s0 = bi_mkvec_v2i16(b, bi_half(s0, false),
2509                                             bi_half(bi_extract(b, idx, instr->src[0].swizzle[1]), false));
2510                 }
2511 
2512                 bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2513                 return;
2514         }
2515 
2516         default:
2517                 break;
2518         }
2519 
2520         bi_index s0 = srcs > 0 ? bi_alu_src_index(b, instr->src[0], comps) : bi_null();
2521         bi_index s1 = srcs > 1 ? bi_alu_src_index(b, instr->src[1], comps) : bi_null();
2522         bi_index s2 = srcs > 2 ? bi_alu_src_index(b, instr->src[2], comps) : bi_null();
2523 
2524         switch (instr->op) {
2525         case nir_op_ffma:
2526                 bi_fma_to(b, sz, dst, s0, s1, s2);
2527                 break;
2528 
2529         case nir_op_fmul:
2530                 bi_fma_to(b, sz, dst, s0, s1, bi_negzero());
2531                 break;
2532 
2533         case nir_op_fsub:
2534                 s1 = bi_neg(s1);
2535                 FALLTHROUGH;
2536         case nir_op_fadd:
2537                 bi_fadd_to(b, sz, dst, s0, s1);
2538                 break;
2539 
2540         case nir_op_fsat: {
2541                 bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2542                 I->clamp = BI_CLAMP_CLAMP_0_1;
2543                 break;
2544         }
2545 
2546         case nir_op_fsat_signed_mali: {
2547                 bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2548                 I->clamp = BI_CLAMP_CLAMP_M1_1;
2549                 break;
2550         }
2551 
2552         case nir_op_fclamp_pos_mali: {
2553                 bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2554                 I->clamp = BI_CLAMP_CLAMP_0_INF;
2555                 break;
2556         }
2557 
2558         case nir_op_fneg:
2559                 bi_fabsneg_to(b, sz, dst, bi_neg(s0));
2560                 break;
2561 
2562         case nir_op_fabs:
2563                 bi_fabsneg_to(b, sz, dst, bi_abs(s0));
2564                 break;
2565 
2566         case nir_op_fsin:
2567                 bi_lower_fsincos_32(b, dst, s0, false);
2568                 break;
2569 
2570         case nir_op_fcos:
2571                 bi_lower_fsincos_32(b, dst, s0, true);
2572                 break;
2573 
2574         case nir_op_fexp2:
2575                 assert(sz == 32); /* should've been lowered */
2576 
2577                 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2578                         bi_lower_fexp2_32(b, dst, s0);
2579                 else
2580                         bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f));
2581 
2582                 break;
2583 
2584         case nir_op_flog2:
2585                 assert(sz == 32); /* should've been lowered */
2586 
2587                 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2588                         bi_lower_flog2_32(b, dst, s0);
2589                 else
2590                         bi_flog2_32(b, dst, s0);
2591 
2592                 break;
2593 
2594         case nir_op_fpow:
2595                 assert(sz == 32); /* should've been lowered */
2596 
2597                 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2598                         bi_lower_fpow_32(b, dst, s0, s1);
2599                 else
2600                         bi_fpow_32(b, dst, s0, s1);
2601 
2602                 break;
2603 
2604         case nir_op_frexp_exp:
2605                 bi_frexpe_to(b, sz, dst, s0, false, false);
2606                 break;
2607 
2608         case nir_op_frexp_sig:
2609                 bi_frexpm_to(b, sz, dst, s0, false, false);
2610                 break;
2611 
2612         case nir_op_ldexp:
2613                 bi_ldexp_to(b, sz, dst, s0, s1);
2614                 break;
2615 
2616         case nir_op_b8csel:
2617                 bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2618                 break;
2619 
2620         case nir_op_b16csel:
2621                 bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2622                 break;
2623 
2624         case nir_op_b32csel:
2625                 bi_mux_i32_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2626                 break;
2627 
2628         case nir_op_ishl:
2629                 bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
2630                 break;
2631         case nir_op_ushr:
2632                 bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), false);
2633                 break;
2634 
2635         case nir_op_ishr:
2636                 if (b->shader->arch >= 9)
2637                         bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), true);
2638                 else
2639                         bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0));
2640                 break;
2641 
2642         case nir_op_imin:
2643         case nir_op_umin:
2644                 bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst,
2645                                 s0, s1, s0, s1, BI_CMPF_LT);
2646                 break;
2647 
2648         case nir_op_imax:
2649         case nir_op_umax:
2650                 bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst,
2651                                 s0, s1, s0, s1, BI_CMPF_GT);
2652                 break;
2653 
2654         case nir_op_fddx_must_abs_mali:
2655         case nir_op_fddy_must_abs_mali: {
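                     /* Derivatives via quad swizzle: within the 2x2 quad, XOR-ing the
                      * lane id with 1 reaches the horizontal neighbour and XOR-ing with 2
                      * the vertical one (assuming the usual quad lane layout). CLPER.xor
                      * reads the neighbour's value; the derivative is neighbour - self. */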
2656                 bi_index bit = bi_imm_u32(instr->op == nir_op_fddx_must_abs_mali ? 1 : 2);
2657                 bi_index adjacent = bi_clper_xor(b, s0, bit);
2658                 bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0));
2659                 break;
2660         }
2661 
2662         case nir_op_fddx:
2663         case nir_op_fddy:
2664         case nir_op_fddx_coarse:
2665         case nir_op_fddy_coarse:
2666         case nir_op_fddx_fine:
2667         case nir_op_fddy_fine: {
2668                 unsigned axis;
2669                 switch (instr->op) {
2670                 case nir_op_fddx:
2671                 case nir_op_fddx_coarse:
2672                 case nir_op_fddx_fine:
2673                         axis = 1;
2674                         break;
2675                 case nir_op_fddy:
2676                 case nir_op_fddy_coarse:
2677                 case nir_op_fddy_fine:
2678                         axis = 2;
2679                         break;
2680                 default:
2681                         unreachable("Invalid derivative op");
2682                 }
2683 
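                     /* lane1/lane2 choose the pair of quad lanes to difference. Fine
                      * derivatives keep the invoking pixel's own row/column via
                      * lane_id & ~axis and step to its neighbour with lane1 + axis;
                      * coarse derivatives always difference lanes 0 and axis, so every
                      * pixel in the quad sees the same result. */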
2684                 bi_index lane1, lane2;
2685                 switch (instr->op) {
2686                 case nir_op_fddx:
2687                 case nir_op_fddx_fine:
2688                 case nir_op_fddy:
2689                 case nir_op_fddy_fine:
2690                         lane1 = bi_lshift_and_i32(b,
2691                                 bi_fau(BIR_FAU_LANE_ID, false),
2692                                 bi_imm_u32(0x3 & ~axis),
2693                                 bi_imm_u8(0));
2694 
2695                         lane2 = bi_iadd_u32(b, lane1,
2696                                 bi_imm_u32(axis),
2697                                 false);
2698                         break;
2699                 case nir_op_fddx_coarse:
2700                 case nir_op_fddy_coarse:
2701                         lane1 = bi_imm_u32(0);
2702                         lane2 = bi_imm_u32(axis);
2703                         break;
2704                 default:
2705                         unreachable("Invalid derivative op");
2706                 }
2707 
2708                 bi_index left, right;
2709 
2710                 if (b->shader->quirks & BIFROST_LIMITED_CLPER) {
2711                         left = bi_clper_old_i32(b, s0, lane1);
2712                         right = bi_clper_old_i32(b, s0, lane2);
2713                 } else {
2714                         left = bi_clper_i32(b, s0, lane1,
2715                                         BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE,
2716                                         BI_SUBGROUP_SUBGROUP4);
2717 
2718                         right = bi_clper_i32(b, s0, lane2,
2719                                         BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE,
2720                                         BI_SUBGROUP_SUBGROUP4);
2721                 }
2722 
2723                 bi_fadd_to(b, sz, dst, right, bi_neg(left));
2724                 break;
2725         }
2726 
2727         case nir_op_f2f32:
2728                 bi_f16_to_f32_to(b, dst, s0);
2729                 break;
2730 
2731         case nir_op_fquantize2f16:
2732         {
2733                 bi_instr *f16 = bi_v2f32_to_v2f16_to(b, bi_temp(b->shader), s0, s0);
2734                 bi_instr *f32 = bi_f16_to_f32_to(b, dst, bi_half(f16->dest[0], false));
2735 
2736                 f16->ftz = f32->ftz = true;
2737                 break;
2738         }
2739 
2740         case nir_op_f2i32:
2741                 if (src_sz == 32)
2742                         bi_f32_to_s32_to(b, dst, s0);
2743                 else
2744                         bi_f16_to_s32_to(b, dst, s0);
2745                 break;
2746 
2747         /* Note 32-bit sources => no vectorization, so 32-bit works */
2748         case nir_op_f2u16:
2749                 if (src_sz == 32)
2750                         bi_f32_to_u32_to(b, dst, s0);
2751                 else
2752                         bi_v2f16_to_v2u16_to(b, dst, s0);
2753                 break;
2754 
2755         case nir_op_f2i16:
2756                 if (src_sz == 32)
2757                         bi_f32_to_s32_to(b, dst, s0);
2758                 else
2759                         bi_v2f16_to_v2s16_to(b, dst, s0);
2760                 break;
2761 
2762         case nir_op_f2u32:
2763                 if (src_sz == 32)
2764                         bi_f32_to_u32_to(b, dst, s0);
2765                 else
2766                         bi_f16_to_u32_to(b, dst, s0);
2767                 break;
2768 
2769         case nir_op_u2f16:
2770                 if (src_sz == 32)
2771                         bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false));
2772                 else if (src_sz == 16)
2773                         bi_v2u16_to_v2f16_to(b, dst, s0);
2774                 else if (src_sz == 8)
2775                         bi_v2u8_to_v2f16_to(b, dst, s0);
2776                 break;
2777 
2778         case nir_op_u2f32:
2779                 if (src_sz == 32)
2780                         bi_u32_to_f32_to(b, dst, s0);
2781                 else if (src_sz == 16)
2782                         bi_u16_to_f32_to(b, dst, s0);
2783                 else
2784                         bi_u8_to_f32_to(b, dst, s0);
2785                 break;
2786 
2787         case nir_op_i2f16:
2788                 if (src_sz == 32)
2789                         bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false));
2790                 else if (src_sz == 16)
2791                         bi_v2s16_to_v2f16_to(b, dst, s0);
2792                 else if (src_sz == 8)
2793                         bi_v2s8_to_v2f16_to(b, dst, s0);
2794                 break;
2795 
2796         case nir_op_i2f32:
2797                 assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
2798 
2799                 if (src_sz == 32)
2800                         bi_s32_to_f32_to(b, dst, s0);
2801                 else if (src_sz == 16)
2802                         bi_s16_to_f32_to(b, dst, s0);
2803                 else if (src_sz == 8)
2804                         bi_s8_to_f32_to(b, dst, s0);
2805                 break;
2806 
2807         case nir_op_i2i32:
2808                 assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
2809 
2810                 if (src_sz == 32)
2811                         bi_mov_i32_to(b, dst, s0);
2812                 else if (src_sz == 16)
2813                         bi_s16_to_s32_to(b, dst, s0);
2814                 else if (src_sz == 8)
2815                         bi_s8_to_s32_to(b, dst, s0);
2816                 break;
2817 
2818         case nir_op_u2u32:
2819                 assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
2820 
2821                 if (src_sz == 32)
2822                         bi_mov_i32_to(b, dst, s0);
2823                 else if (src_sz == 16)
2824                         bi_u16_to_u32_to(b, dst, s0);
2825                 else if (src_sz == 8)
2826                         bi_u8_to_u32_to(b, dst, s0);
2827 
2828                 break;
2829 
2830         case nir_op_i2i16:
2831                 assert(src_sz == 8 || src_sz == 32);
2832 
2833                 if (src_sz == 8)
2834                         bi_v2s8_to_v2s16_to(b, dst, s0);
2835                 else
2836                         bi_mov_i32_to(b, dst, s0);
2837                 break;
2838 
2839         case nir_op_u2u16:
2840                 assert(src_sz == 8 || src_sz == 32);
2841 
2842                 if (src_sz == 8)
2843                         bi_v2u8_to_v2u16_to(b, dst, s0);
2844                 else
2845                         bi_mov_i32_to(b, dst, s0);
2846                 break;
2847 
2848         case nir_op_b2i8:
2849         case nir_op_b2i16:
2850         case nir_op_b2i32:
2851                 bi_mux_to(b, sz, dst, bi_imm_u8(0), bi_imm_uintN(1, sz), s0, BI_MUX_INT_ZERO);
2852                 break;
2853 
2854         case nir_op_f2b16:
2855                 bi_mux_v2i16_to(b, dst, bi_imm_u16(0), bi_imm_u16(~0), s0, BI_MUX_FP_ZERO);
2856                 break;
2857         case nir_op_f2b32:
2858                 bi_mux_i32_to(b, dst, bi_imm_u32(0), bi_imm_u32(~0), s0, BI_MUX_FP_ZERO);
2859                 break;
2860 
2861         case nir_op_i2b8:
2862                 bi_mux_v4i8_to(b, dst, bi_imm_u8(0), bi_imm_u8(~0), s0, BI_MUX_INT_ZERO);
2863                 break;
2864         case nir_op_i2b16:
2865                 bi_mux_v2i16_to(b, dst, bi_imm_u16(0), bi_imm_u16(~0), s0, BI_MUX_INT_ZERO);
2866                 break;
2867         case nir_op_i2b32:
2868                 bi_mux_i32_to(b, dst, bi_imm_u32(0), bi_imm_u32(~0), s0, BI_MUX_INT_ZERO);
2869                 break;
2870 
2871         case nir_op_ieq8:
2872         case nir_op_ine8:
2873         case nir_op_ilt8:
2874         case nir_op_ige8:
2875         case nir_op_ieq16:
2876         case nir_op_ine16:
2877         case nir_op_ilt16:
2878         case nir_op_ige16:
2879         case nir_op_ieq32:
2880         case nir_op_ine32:
2881         case nir_op_ilt32:
2882         case nir_op_ige32:
2883                 bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1);
2884                 break;
2885 
2886         case nir_op_ult8:
2887         case nir_op_uge8:
2888         case nir_op_ult16:
2889         case nir_op_uge16:
2890         case nir_op_ult32:
2891         case nir_op_uge32:
2892                 bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1);
2893                 break;
2894 
2895         case nir_op_feq32:
2896         case nir_op_feq16:
2897         case nir_op_flt32:
2898         case nir_op_flt16:
2899         case nir_op_fge32:
2900         case nir_op_fge16:
2901         case nir_op_fneu32:
2902         case nir_op_fneu16:
2903                 bi_fcmp_to(b, sz, dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1);
2904                 break;
2905 
2906         case nir_op_fround_even:
2907         case nir_op_fceil:
2908         case nir_op_ffloor:
2909         case nir_op_ftrunc:
2910                 bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op));
2911                 break;
2912 
2913         case nir_op_fmin:
2914                 bi_fmin_to(b, sz, dst, s0, s1);
2915                 break;
2916 
2917         case nir_op_fmax:
2918                 bi_fmax_to(b, sz, dst, s0, s1);
2919                 break;
2920 
2921         case nir_op_iadd:
2922                 bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false);
2923                 break;
2924 
2925         case nir_op_iadd_sat:
2926                 bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true);
2927                 break;
2928 
2929         case nir_op_uadd_sat:
2930                 bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true);
2931                 break;
2932 
2933         case nir_op_ihadd:
2934                 bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN);
2935                 break;
2936 
2937         case nir_op_irhadd:
2938                 bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP);
2939                 break;
2940 
2941         case nir_op_ineg:
2942                 bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false);
2943                 break;
2944 
2945         case nir_op_isub:
2946                 bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false);
2947                 break;
2948 
2949         case nir_op_isub_sat:
2950                 bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true);
2951                 break;
2952 
2953         case nir_op_usub_sat:
2954                 bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true);
2955                 break;
2956 
2957         case nir_op_imul:
2958                 bi_imul_to(b, sz, dst, s0, s1);
2959                 break;
2960 
2961         case nir_op_iabs:
2962                 bi_iabs_to(b, sz, dst, s0);
2963                 break;
2964 
2965         case nir_op_iand:
2966                 bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));
2967                 break;
2968 
2969         case nir_op_ior:
2970                 bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));
2971                 break;
2972 
2973         case nir_op_ixor:
2974                 bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));
2975                 break;
2976 
2977         case nir_op_inot:
2978                 bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));
2979                 break;
2980 
2981         case nir_op_frsq:
2982                 if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2983                         bi_lower_frsq_32(b, dst, s0);
2984                 else
2985                         bi_frsq_to(b, sz, dst, s0);
2986                 break;
2987 
2988         case nir_op_frcp:
2989                 if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2990                         bi_lower_frcp_32(b, dst, s0);
2991                 else
2992                         bi_frcp_to(b, sz, dst, s0);
2993                 break;
2994 
2995         case nir_op_uclz:
2996                 bi_clz_to(b, sz, dst, s0, false);
2997                 break;
2998 
2999         case nir_op_bit_count:
3000                 bi_popcount_i32_to(b, dst, s0);
3001                 break;
3002 
3003         case nir_op_bitfield_reverse:
3004                 bi_bitrev_i32_to(b, dst, s0);
3005                 break;
3006 
3007         case nir_op_ufind_msb: {
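                     /* ufind_msb(x) == (src_sz - 1) - clz(x). For example, a 32-bit input
                      * with only bit 5 set has clz = 26, giving 31 - 26 = 5. */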
3008                 bi_index clz = bi_clz(b, src_sz, s0, false);
3009 
3010                 if (sz == 8)
3011                         clz = bi_byte(clz, 0);
3012                 else if (sz == 16)
3013                         clz = bi_half(clz, false);
3014 
3015                 bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false);
3016                 break;
3017         }
3018 
3019         default:
3020                 fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
3021                 unreachable("Unknown ALU op");
3022         }
3023 }
3024 
3025 /* Returns the coordinate dimension, with 0 special-casing cubemaps. Shamelessly copied from Midgard */
3026 static unsigned
3027 bifrost_tex_format(enum glsl_sampler_dim dim)
3028 {
3029         switch (dim) {
3030         case GLSL_SAMPLER_DIM_1D:
3031         case GLSL_SAMPLER_DIM_BUF:
3032                 return 1;
3033 
3034         case GLSL_SAMPLER_DIM_2D:
3035         case GLSL_SAMPLER_DIM_MS:
3036         case GLSL_SAMPLER_DIM_EXTERNAL:
3037         case GLSL_SAMPLER_DIM_RECT:
3038                 return 2;
3039 
3040         case GLSL_SAMPLER_DIM_3D:
3041                 return 3;
3042 
3043         case GLSL_SAMPLER_DIM_CUBE:
3044                 return 0;
3045 
3046         default:
3047                 DBG("Unknown sampler dim type\n");
3048                 assert(0);
3049                 return 0;
3050         }
3051 }
3052 
3053 static enum bi_dimension
3054 valhall_tex_dimension(enum glsl_sampler_dim dim)
3055 {
3056         switch (dim) {
3057         case GLSL_SAMPLER_DIM_1D:
3058         case GLSL_SAMPLER_DIM_BUF:
3059                 return BI_DIMENSION_1D;
3060 
3061         case GLSL_SAMPLER_DIM_2D:
3062         case GLSL_SAMPLER_DIM_MS:
3063         case GLSL_SAMPLER_DIM_EXTERNAL:
3064         case GLSL_SAMPLER_DIM_RECT:
3065                 return BI_DIMENSION_2D;
3066 
3067         case GLSL_SAMPLER_DIM_3D:
3068                 return BI_DIMENSION_3D;
3069 
3070         case GLSL_SAMPLER_DIM_CUBE:
3071                 return BI_DIMENSION_CUBE;
3072 
3073         default:
3074                 unreachable("Unknown sampler dim type");
3075         }
3076 }
3077 
3078 static enum bifrost_texture_format_full
3079 bi_texture_format(nir_alu_type T, enum bi_clamp clamp)
3080 {
3081         switch (T) {
3082         case nir_type_float16: return BIFROST_TEXTURE_FORMAT_F16 + clamp;
3083         case nir_type_float32: return BIFROST_TEXTURE_FORMAT_F32 + clamp;
3084         case nir_type_uint16:  return BIFROST_TEXTURE_FORMAT_U16;
3085         case nir_type_int16:   return BIFROST_TEXTURE_FORMAT_S16;
3086         case nir_type_uint32:  return BIFROST_TEXTURE_FORMAT_U32;
3087         case nir_type_int32:   return BIFROST_TEXTURE_FORMAT_S32;
3088         default:              unreachable("Invalid type for texturing");
3089         }
3090 }
3091 
3092 /* Array indices are specified as 32-bit uints and need converting; NIR supplies the index in the .z component */
3093 static bi_index
3094 bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T)
3095 {
3096         /* For (u)int we can just passthrough */
3097         nir_alu_type base = nir_alu_type_get_base_type(T);
3098         if (base == nir_type_int || base == nir_type_uint)
3099                 return idx;
3100 
3101         /* Otherwise we convert */
3102         assert(T == nir_type_float32);
3103 
3104         /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and
3105          * Texel Selection") defines the layer to be taken from clamp(RNE(r),
3106          * 0, dt - 1). So we use round RTE; clamping is handled at the data
3107          * structure level. */
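             /* For example (illustrative only), a layer coordinate of 2.4 rounds
              * to layer 2 and 2.6 rounds to layer 3 under round-to-nearest-even. */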
3108 
3109         bi_instr *I = bi_f32_to_u32_to(b, bi_temp(b->shader), idx);
3110         I->round = BI_ROUND_NONE;
3111         return I->dest[0];
3112 }
3113 
3114 /* TEXC's explicit and bias LOD modes require the LOD to be transformed to a
3115  * 16-bit 8:8 fixed-point format. We lower as:
3116  *
3117  * F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF =
3118  * MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0)
3119  */
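     /* Worked example (illustrative only): an LOD of 2.5 is within range, and
      * 2.5 * 256.0 = 640 = 0x0280, i.e. integer part 2 in the high byte and
      * fraction 0x80 in the low byte of the 8:8 value. The second form is the
      * first with the clamp rescaled to [-1, 1], so it can be folded into an FMA
      * clamp modifier as bi_emit_texc_lod_88 does below. */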
3120 
3121 static bi_index
3122 bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16)
3123 {
3124         /* Precompute for constant LODs to avoid general constant folding */
3125         if (lod.type == BI_INDEX_CONSTANT) {
3126                 uint32_t raw = lod.value;
3127                 float x = fp16 ? _mesa_half_to_float(raw) : uif(raw);
3128                 int32_t s32 = CLAMP(x, -16.0f, 16.0f) * 256.0f;
3129                 return bi_imm_u32(s32 & 0xFFFF);
3130         }
3131 
3132         /* Sort of arbitrary. Must be less than 128.0, greater than or equal to
3133          * the max LOD (16 since we cap at 2^16 texture dimensions), and
3134          * preferably small to minimize precision loss */
3135         const float max_lod = 16.0;
3136 
3137         bi_instr *fsat = bi_fma_f32_to(b, bi_temp(b->shader),
3138                         fp16 ? bi_half(lod, false) : lod,
3139                         bi_imm_f32(1.0f / max_lod), bi_negzero());
3140 
3141         fsat->clamp = BI_CLAMP_CLAMP_M1_1;
3142 
3143         bi_index fmul = bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f),
3144                         bi_negzero());
3145 
3146         return bi_mkvec_v2i16(b,
3147                         bi_half(bi_f32_to_s32(b, fmul), false), bi_imm_u16(0));
3148 }
3149 
3150 /* FETCH takes a 32-bit staging register containing the LOD as an integer in
3151  * the bottom 16-bits and (if present) the cube face index in the top 16-bits.
3152  * TODO: Cube face.
3153  */
3154 
3155 static bi_index
3156 bi_emit_texc_lod_cube(bi_builder *b, bi_index lod)
3157 {
3158         return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8));
3159 }
3160 
3161 /* The hardware specifies texel offsets and multisample indices together as a
3162  * u8vec4 <offset, ms index>. By default all are zero, so if we have either a
3163  * nonzero texel offset or a nonzero multisample index, we build a u8vec4 with
3164  * the bits we need and return that to be passed as a staging register. Else we
3165  * return 0 to avoid allocating a data register when everything is zero. */
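     /* The packing built below is offset.x in bits 7:0, offset.y in bits 15:8,
      * offset.z in bits 23:16, and the multisample index in bits 31:24. */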
3166 
3167 static bi_index
3168 bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr)
3169 {
3170         bi_index dest = bi_zero();
3171 
3172         int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
3173         if (offs_idx >= 0 &&
3174             (!nir_src_is_const(instr->src[offs_idx].src) ||
3175              nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
3176                 unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
3177                 bi_index idx = bi_src_index(&instr->src[offs_idx].src);
3178                 dest = bi_mkvec_v4i8(b,
3179                                 (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
3180                                 (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0),
3181                                 (nr > 2) ? bi_byte(bi_extract(b, idx, 2), 0) : bi_imm_u8(0),
3182                                 bi_imm_u8(0));
3183         }
3184 
3185         int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
3186         if (ms_idx >= 0 &&
3187             (!nir_src_is_const(instr->src[ms_idx].src) ||
3188              nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
3189                 dest = bi_lshift_or_i32(b,
3190                                 bi_src_index(&instr->src[ms_idx].src), dest,
3191                                 bi_imm_u8(24));
3192         }
3193 
3194         return dest;
3195 }
3196 
3197 /*
3198  * Valhall specifies texel offsets, multisample indices, and (for
3199  * fetches) LOD together as a u8vec4 <offset.xyz, LOD>, where the third
3200  * component is either offset.z or multisample index depending on context. Build
3201  * this register.
3202  */
3203 static bi_index
3204 bi_emit_valhall_offsets(bi_builder *b, nir_tex_instr *instr)
3205 {
3206         bi_index dest = bi_zero();
3207 
3208         int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
3209         int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
3210         int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
3211 
3212         /* Components 0-2: offsets */
3213         if (offs_idx >= 0 &&
3214             (!nir_src_is_const(instr->src[offs_idx].src) ||
3215              nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
3216                 unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
3217                 bi_index idx = bi_src_index(&instr->src[offs_idx].src);
3218 
3219                 /* No multisample index with 3D */
3220                 assert((nr <= 2) || (ms_idx < 0));
3221 
3222                 /* Zero extend the Z byte so we can use it with MKVEC.v2i8 */
3223                 bi_index z = (nr > 2) ?
3224                              bi_mkvec_v2i8(b, bi_byte(bi_extract(b, idx, 2), 0),
3225                                               bi_imm_u8(0), bi_zero()) :
3226                              bi_zero();
3227 
3228                 dest = bi_mkvec_v2i8(b,
3229                                 (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
3230                                 (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0),
3231                                 z);
3232         }
3233 
3234         /* Component 2: multisample index */
3235         if (ms_idx >= 0 &&
3236             (!nir_src_is_const(instr->src[ms_idx].src) ||
3237              nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
3238                 dest = bi_mkvec_v2i16(b, dest,
3239                                 bi_src_index(&instr->src[ms_idx].src));
3240         }
3241 
3242         /* Component 3: 8-bit LOD */
3243         if (lod_idx >= 0 &&
3244             (!nir_src_is_const(instr->src[lod_idx].src) ||
3245              nir_src_as_uint(instr->src[lod_idx].src) != 0) &&
3246             nir_tex_instr_src_type(instr, lod_idx) != nir_type_float) {
3247                 dest = bi_lshift_or_i32(b,
3248                                 bi_src_index(&instr->src[lod_idx].src), dest,
3249                                 bi_imm_u8(24));
3250         }
3251 
3252         return dest;
3253 }
3254 
3255 static void
3256 bi_emit_cube_coord(bi_builder *b, bi_index coord,
3257                     bi_index *face, bi_index *s, bi_index *t)
3258 {
3259         /* Compute max { |x|, |y|, |z| } */
3260         bi_index maxxyz = bi_temp(b->shader);
3261         *face = bi_temp(b->shader);
3262 
3263         bi_index cx = bi_extract(b, coord, 0),
3264                  cy = bi_extract(b, coord, 1),
3265                  cz = bi_extract(b, coord, 2);
3266 
3267         /* Use a pseudo op on Bifrost due to tuple restrictions */
3268         if (b->shader->arch <= 8) {
3269                 bi_cubeface_to(b, maxxyz, *face, cx, cy, cz);
3270         } else {
3271                 bi_cubeface1_to(b, maxxyz, cx, cy, cz);
3272                 bi_cubeface2_v9_to(b, *face, cx, cy, cz);
3273         }
3274 
3275         /* Select coordinates */
3276         bi_index ssel = bi_cube_ssel(b, bi_extract(b, coord, 2), bi_extract(b, coord, 0), *face);
3277         bi_index tsel = bi_cube_tsel(b, bi_extract(b, coord, 1), bi_extract(b, coord, 2),
3278                         *face);
3279 
3280         /* The OpenGL ES specification requires us to transform an input vector
3281          * (x, y, z) to the coordinate, given the selected S/T:
3282          *
3283          * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1))
3284          *
3285          * We implement (s shown, t similar) in a form friendlier to FMA
3286          * instructions, and clamp coordinates at the end for correct
3287          * NaN/infinity handling:
3288          *
3289          * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5)
3290          *
3291          * Take the reciprocal of max{x, y, z}
3292          */
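             /* The rewrite is plain algebra: 1/2 * (s / m + 1) = s * (0.5 / m) + 0.5,
              * which maps directly onto FMA(s, 0.5 * rcp(m), 0.5). */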
3293         bi_index rcp = bi_frcp_f32(b, maxxyz);
3294 
3295         /* Calculate 0.5 * (1.0 / max{x, y, z}) */
3296         bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero());
3297 
3298         /* Transform the coordinates */
3299         *s = bi_temp(b->shader);
3300         *t = bi_temp(b->shader);
3301 
3302         bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f));
3303         bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f));
3304 
3305         S->clamp = BI_CLAMP_CLAMP_0_1;
3306         T->clamp = BI_CLAMP_CLAMP_0_1;
3307 }
3308 
3309 /* Emits a cube map descriptor, returning the lower 32-bits and putting the upper
3310  * 32-bits in the passed pointer t. The packing of the face with the S coordinate
3311  * exploits the redundancy of floating points with the range restriction of
3312  * CUBEFACE output.
3313  *
3314  *     struct cube_map_descriptor {
3315  *         float s : 29;
3316  *         unsigned face : 3;
3317  *         float t : 32;
3318  *     }
3319  *
3320  * Since the cube face index is preshifted, this is easy to pack with a bitwise
3321  * MUX.i32 and a fixed mask, selecting the lower 29 bits from s and the upper 3
3322  * bits from face.
3323  */
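     /* Net effect (illustrative sketch):
      *
      *     packed = (s & 0x1FFFFFFF) | (face & 0xE0000000);
      *
      * assuming CUBEFACE has already left the face index preshifted into bits
      * 31:29, as noted above. */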
3324 
3325 static bi_index
3326 bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t)
3327 {
3328         bi_index face, s;
3329         bi_emit_cube_coord(b, coord, &face, &s, t);
3330         bi_index mask = bi_imm_u32(BITFIELD_MASK(29));
3331         return bi_mux_i32(b, s, face, mask, BI_MUX_BIT);
3332 }
3333 
3334 /* Map to the main texture op used. Some of these (txd in particular) will
3335  * lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in
3336  * sequence). We assume that lowering is handled elsewhere.
3337  */
3338 
3339 static enum bifrost_tex_op
3340 bi_tex_op(nir_texop op)
3341 {
3342         switch (op) {
3343         case nir_texop_tex:
3344         case nir_texop_txb:
3345         case nir_texop_txl:
3346         case nir_texop_txd:
3347         case nir_texop_tex_prefetch:
3348                 return BIFROST_TEX_OP_TEX;
3349         case nir_texop_txf:
3350         case nir_texop_txf_ms:
3351         case nir_texop_txf_ms_fb:
3352         case nir_texop_tg4:
3353                 return BIFROST_TEX_OP_FETCH;
3354         case nir_texop_txs:
3355         case nir_texop_lod:
3356         case nir_texop_query_levels:
3357         case nir_texop_texture_samples:
3358         case nir_texop_samples_identical:
3359                 unreachable("should've been lowered");
3360         default:
3361                 unreachable("unsupported tex op");
3362         }
3363 }
3364 
3365 /* Data registers required by texturing in the order they appear. All are
3366  * optional; the texture operation descriptor determines which are present.
3367  * Note that since 3D arrays are not permitted at the API level, Z_COORD and
3368  * ARRAY/SHADOW are exclusive, so TEXC in practice reads at most 8 registers */
3369 
3370 enum bifrost_tex_dreg {
3371         BIFROST_TEX_DREG_Z_COORD = 0,
3372         BIFROST_TEX_DREG_Y_DELTAS = 1,
3373         BIFROST_TEX_DREG_LOD = 2,
3374         BIFROST_TEX_DREG_GRDESC_HI = 3,
3375         BIFROST_TEX_DREG_SHADOW = 4,
3376         BIFROST_TEX_DREG_ARRAY = 5,
3377         BIFROST_TEX_DREG_OFFSETMS = 6,
3378         BIFROST_TEX_DREG_SAMPLER = 7,
3379         BIFROST_TEX_DREG_TEXTURE = 8,
3380         BIFROST_TEX_DREG_COUNT,
3381 };
3382 
3383 static void
3384 bi_emit_texc(bi_builder *b, nir_tex_instr *instr)
3385 {
3386         struct bifrost_texture_operation desc = {
3387                 .op = bi_tex_op(instr->op),
3388                 .offset_or_bias_disable = false, /* TODO */
3389                 .shadow_or_clamp_disable = instr->is_shadow,
3390                 .array = instr->is_array,
3391                 .dimension = bifrost_tex_format(instr->sampler_dim),
3392                 .format = bi_texture_format(instr->dest_type | nir_dest_bit_size(instr->dest), BI_CLAMP_NONE), /* TODO */
3393                 .mask = 0xF,
3394         };
3395 
3396         switch (desc.op) {
3397         case BIFROST_TEX_OP_TEX:
3398                 desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE;
3399                 break;
3400         case BIFROST_TEX_OP_FETCH:
3401                 desc.lod_or_fetch = (enum bifrost_lod_mode)
3402                    (instr->op == nir_texop_tg4 ?
3403                         BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component :
3404                         BIFROST_TEXTURE_FETCH_TEXEL);
3405                 break;
3406         default:
3407                 unreachable("texture op unsupported");
3408         }
3409 
3410         /* 32-bit indices to be allocated as consecutive staging registers */
3411         bi_index dregs[BIFROST_TEX_DREG_COUNT] = { };
3412         bi_index cx = bi_null(), cy = bi_null();
3413 
3414         for (unsigned i = 0; i < instr->num_srcs; ++i) {
3415                 bi_index index = bi_src_index(&instr->src[i].src);
3416                 unsigned sz = nir_src_bit_size(instr->src[i].src);
3417                 unsigned components = nir_src_num_components(instr->src[i].src);
3418                 ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i);
3419                 nir_alu_type T = base | sz;
3420 
3421                 switch (instr->src[i].src_type) {
3422                 case nir_tex_src_coord:
3423                         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3424                                 cx = bi_emit_texc_cube_coord(b, index, &cy);
3425                         } else {
3426                                 /* Copy XY (for 2D+) or XX (for 1D) */
3427                                 cx = bi_extract(b, index, 0);
3428                                 cy = bi_extract(b, index, MIN2(1, components - 1));
3429 
3430                                 assert(components >= 1 && components <= 3);
3431 
3432                                 if (components == 3 && !desc.array) {
3433                                         /* 3D */
3434                                         dregs[BIFROST_TEX_DREG_Z_COORD] =
3435                                                 bi_extract(b, index, 2);
3436                                 }
3437                         }
3438 
3439                         if (desc.array) {
3440                                 dregs[BIFROST_TEX_DREG_ARRAY] =
3441                                                 bi_emit_texc_array_index(b,
3442                                                                 bi_extract(b, index, components - 1), T);
3443                         }
3444 
3445                         break;
3446 
3447                 case nir_tex_src_lod:
3448                         if (desc.op == BIFROST_TEX_OP_TEX &&
3449                             nir_src_is_const(instr->src[i].src) &&
3450                             nir_src_as_uint(instr->src[i].src) == 0) {
3451                                 desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO;
3452                         } else if (desc.op == BIFROST_TEX_OP_TEX) {
3453                                 assert(base == nir_type_float);
3454 
3455                                 assert(sz == 16 || sz == 32);
3456                                 dregs[BIFROST_TEX_DREG_LOD] =
3457                                         bi_emit_texc_lod_88(b, index, sz == 16);
3458                                 desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT;
3459                         } else {
3460                                 assert(desc.op == BIFROST_TEX_OP_FETCH);
3461                                 assert(base == nir_type_uint || base == nir_type_int);
3462                                 assert(sz == 16 || sz == 32);
3463 
3464                                 dregs[BIFROST_TEX_DREG_LOD] =
3465                                         bi_emit_texc_lod_cube(b, index);
3466                         }
3467 
3468                         break;
3469 
3470                 case nir_tex_src_bias:
3471                         /* Upper 16-bits interpreted as a clamp, leave zero */
3472                         assert(desc.op == BIFROST_TEX_OP_TEX);
3473                         assert(base == nir_type_float);
3474                         assert(sz == 16 || sz == 32);
3475                         dregs[BIFROST_TEX_DREG_LOD] =
3476                                 bi_emit_texc_lod_88(b, index, sz == 16);
3477                         desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS;
3478                         break;
3479 
3480                 case nir_tex_src_ms_index:
3481                 case nir_tex_src_offset:
3482                         if (desc.offset_or_bias_disable)
3483                                 break;
3484 
3485                         dregs[BIFROST_TEX_DREG_OFFSETMS] =
3486                                 bi_emit_texc_offset_ms_index(b, instr);
3487                         if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero()))
3488                                 desc.offset_or_bias_disable = true;
3489                         break;
3490 
3491                 case nir_tex_src_comparator:
3492                         dregs[BIFROST_TEX_DREG_SHADOW] = index;
3493                         break;
3494 
3495                 case nir_tex_src_texture_offset:
3496                         if (instr->texture_index)
3497                                 index = bi_iadd_u32(b, index, bi_imm_u32(instr->texture_index), false);
3498 
3499                         dregs[BIFROST_TEX_DREG_TEXTURE] = index;
3500 
3501                         break;
3502 
3503                 case nir_tex_src_sampler_offset:
3504                         if (instr->sampler_index)
3505                                 index = bi_iadd_u32(b, index, bi_imm_u32(instr->sampler_index), false);
3506 
3507                         dregs[BIFROST_TEX_DREG_SAMPLER] = index;
3508                         break;
3509 
3510                 default:
3511                         unreachable("Unhandled src type in texc emit");
3512                 }
3513         }
3514 
3515         if (desc.op == BIFROST_TEX_OP_FETCH && bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) {
3516                 dregs[BIFROST_TEX_DREG_LOD] =
3517                         bi_emit_texc_lod_cube(b, bi_zero());
3518         }
3519 
3520         /* Choose an index mode */
3521 
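             /* If both the texture and sampler indices are compile-time immediates
              * and the sampler index fits the descriptor field (< 16 here), encode
              * both directly. Otherwise fall back to one of the BIFROST_INDEX_*
              * modes packed into sampler_index_or_mode below. */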
3522         bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]);
3523         bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]);
3524         bool direct = direct_tex && direct_samp;
3525 
3526         desc.immediate_indices = direct && (instr->sampler_index < 16);
3527 
3528         if (desc.immediate_indices) {
3529                 desc.sampler_index_or_mode = instr->sampler_index;
3530                 desc.index = instr->texture_index;
3531         } else {
3532                 unsigned mode = 0;
3533 
3534                 if (direct && instr->sampler_index == instr->texture_index) {
3535                         mode = BIFROST_INDEX_IMMEDIATE_SHARED;
3536                         desc.index = instr->texture_index;
3537                 } else if (direct) {
3538                         mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
3539                         desc.index = instr->sampler_index;
3540                         dregs[BIFROST_TEX_DREG_TEXTURE] = bi_mov_i32(b,
3541                                         bi_imm_u32(instr->texture_index));
3542                 } else if (direct_tex) {
3543                         assert(!direct_samp);
3544                         mode = BIFROST_INDEX_IMMEDIATE_TEXTURE;
3545                         desc.index = instr->texture_index;
3546                 } else if (direct_samp) {
3547                         assert(!direct_tex);
3548                         mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
3549                         desc.index = instr->sampler_index;
3550                 } else {
3551                         mode = BIFROST_INDEX_REGISTER;
3552                 }
3553 
3554                 mode |= (BIFROST_TEXTURE_OPERATION_SINGLE << 2);
3555                 desc.sampler_index_or_mode = mode;
3556         }
3557 
3558         /* Allocate staging registers contiguously by compacting the array. */
3559         unsigned sr_count = 0;
3560 
3561         for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) {
3562                 if (!bi_is_null(dregs[i]))
3563                         dregs[sr_count++] = dregs[i];
3564         }
3565 
3566         unsigned res_size = nir_dest_bit_size(instr->dest) == 16 ? 2 : 4;
3567 
3568         bi_index sr = sr_count ? bi_temp(b->shader) : bi_null();
3569         bi_index dst = bi_temp(b->shader);
3570 
3571         if (sr_count)
3572                 bi_emit_collect_to(b, sr, dregs, sr_count);
3573 
3574         uint32_t desc_u = 0;
3575         memcpy(&desc_u, &desc, sizeof(desc_u));
3576         bi_instr *I =
3577                 bi_texc_to(b, dst, bi_null(), sr, cx, cy,
3578                    bi_imm_u32(desc_u),
3579                    !nir_tex_instr_has_implicit_derivative(instr), sr_count, 0);
3580         I->register_format = bi_reg_fmt_for_nir(instr->dest_type);
3581 
3582         bi_index w[4] = { bi_null(), bi_null(), bi_null(), bi_null() };
3583         bi_emit_split_i32(b, w, dst, res_size);
3584         bi_emit_collect_to(b, bi_dest_index(&instr->dest), w,
3585                         DIV_ROUND_UP(nir_dest_num_components(instr->dest) * res_size, 4));
3586 }
3587 
3588 /* Staging registers required by texturing in the order they appear (Valhall) */
3589 
3590 enum valhall_tex_sreg {
3591         VALHALL_TEX_SREG_X_COORD = 0,
3592         VALHALL_TEX_SREG_Y_COORD = 1,
3593         VALHALL_TEX_SREG_Z_COORD = 2,
3594         VALHALL_TEX_SREG_Y_DELTAS = 3,
3595         VALHALL_TEX_SREG_ARRAY = 4,
3596         VALHALL_TEX_SREG_SHADOW = 5,
3597         VALHALL_TEX_SREG_OFFSETMS = 6,
3598         VALHALL_TEX_SREG_LOD = 7,
3599         VALHALL_TEX_SREG_GRDESC = 8,
3600         VALHALL_TEX_SREG_COUNT,
3601 };
3602 
3603 static void
3604 bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr)
3605 {
3606         bool explicit_offset = false;
3607         enum bi_va_lod_mode lod_mode = BI_VA_LOD_MODE_COMPUTED_LOD;
3608 
3609         bool has_lod_mode =
3610                 (instr->op == nir_texop_tex) ||
3611                 (instr->op == nir_texop_txl) ||
3612                 (instr->op == nir_texop_txb);
3613 
3614         /* 32-bit indices to be allocated as consecutive staging registers */
3615         bi_index sregs[VALHALL_TEX_SREG_COUNT] = { };
3616 
3617         bi_index sampler = bi_imm_u32(instr->sampler_index);
3618         bi_index texture = bi_imm_u32(instr->texture_index);
3619         uint32_t tables = (PAN_TABLE_SAMPLER << 11) | (PAN_TABLE_TEXTURE << 27);
3620 
3621         for (unsigned i = 0; i < instr->num_srcs; ++i) {
3622                 bi_index index = bi_src_index(&instr->src[i].src);
3623                 unsigned sz = nir_src_bit_size(instr->src[i].src);
3624                 unsigned components = nir_src_num_components(instr->src[i].src);
3625 
3626                 switch (instr->src[i].src_type) {
3627                 case nir_tex_src_coord:
3628                         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3629                                 sregs[VALHALL_TEX_SREG_X_COORD] =
3630                                         bi_emit_texc_cube_coord(b, index,
3631                                                 &sregs[VALHALL_TEX_SREG_Y_COORD]);
3632                         } else {
3633                                 assert(components >= 1 && components <= 3);
3634 
3635                                 /* Copy XY (for 2D+) or XX (for 1D) */
3636                                 sregs[VALHALL_TEX_SREG_X_COORD] = index;
3637 
3638                                 if (components >= 2)
3639                                         sregs[VALHALL_TEX_SREG_Y_COORD] = bi_extract(b, index, 1);
3640 
3641                                 if (components == 3 && !instr->is_array) {
3642                                         sregs[VALHALL_TEX_SREG_Z_COORD] =
3643                                                 bi_extract(b, index, 2);
3644                                 }
3645                         }
3646 
3647                         if (instr->is_array) {
3648                                 sregs[VALHALL_TEX_SREG_ARRAY] =
3649                                         bi_extract(b, index, components - 1);
3650                         }
3651 
3652                         break;
3653 
3654                 case nir_tex_src_lod:
3655                         if (nir_src_is_const(instr->src[i].src) &&
3656                             nir_src_as_uint(instr->src[i].src) == 0) {
3657                                 lod_mode = BI_VA_LOD_MODE_ZERO_LOD;
3658                         } else if (has_lod_mode) {
3659                                 lod_mode = BI_VA_LOD_MODE_EXPLICIT;
3660 
3661                                 assert(sz == 16 || sz == 32);
3662                                 sregs[VALHALL_TEX_SREG_LOD] =
3663                                         bi_emit_texc_lod_88(b, index, sz == 16);
3664                         }
3665                         break;
3666 
3667                 case nir_tex_src_bias:
3668                         /* Upper 16-bits interpreted as a clamp, leave zero */
3669                         assert(sz == 16 || sz == 32);
3670                         sregs[VALHALL_TEX_SREG_LOD] =
3671                                 bi_emit_texc_lod_88(b, index, sz == 16);
3672 
3673                         lod_mode = BI_VA_LOD_MODE_COMPUTED_BIAS;
3674                         break;
3675                 case nir_tex_src_ms_index:
3676                 case nir_tex_src_offset:
3677                         /* Handled below */
3678                         break;
3679 
3680                 case nir_tex_src_comparator:
3681                         sregs[VALHALL_TEX_SREG_SHADOW] = index;
3682                         break;
3683 
3684                 case nir_tex_src_texture_offset:
3685                         assert(instr->texture_index == 0);
3686                         texture = index;
3687                         break;
3688 
3689                 case nir_tex_src_sampler_offset:
3690                         assert(instr->sampler_index == 0);
3691                         sampler = index;
3692                         break;
3693 
3694                 default:
3695                         unreachable("Unhandled src type in tex emit");
3696                 }
3697         }
3698 
3699         /* Generate packed offset + ms index + LOD register. These default to
3700          * zero so we only need to encode if these features are actually in use.
3701          */
3702         bi_index offsets = bi_emit_valhall_offsets(b, instr);
3703 
3704         if (!bi_is_equiv(offsets, bi_zero())) {
3705                 sregs[VALHALL_TEX_SREG_OFFSETMS] = offsets;
3706                 explicit_offset = true;
3707         }
3708 
3709         /* Allocate staging registers contiguously by compacting the array. */
3710         unsigned sr_count = 0;
3711 
3712         for (unsigned i = 0; i < ARRAY_SIZE(sregs); ++i) {
3713                 if (!bi_is_null(sregs[i]))
3714                         sregs[sr_count++] = sregs[i];
3715         }
3716 
3717         bi_index idx = sr_count ? bi_temp(b->shader) : bi_null();
3718 
3719         if (sr_count)
3720                 bi_make_vec_to(b, idx, sregs, NULL, sr_count, 32);
3721 
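             /* Pack the resource word: sampler index in the low bits with its table
              * ID at bit 11, texture index at bit 16 with its table ID at bit 27
              * (bit positions taken from the shifts used here; treat the exact
              * field widths as an assumption). */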
3722         bi_index image_src = bi_imm_u32(tables);
3723         image_src = bi_lshift_or_i32(b, sampler, image_src, bi_imm_u8(0));
3724         image_src = bi_lshift_or_i32(b, texture, image_src, bi_imm_u8(16));
3725 
3726         unsigned mask = BI_WRITE_MASK_RGBA;
3727         unsigned res_size = nir_dest_bit_size(instr->dest) == 16 ? 2 : 4;
3728         enum bi_register_format regfmt = bi_reg_fmt_for_nir(instr->dest_type);
3729         enum bi_dimension dim = valhall_tex_dimension(instr->sampler_dim);
3730         bi_index dest = bi_temp(b->shader);
3731 
3732         switch (instr->op) {
3733         case nir_texop_tex:
3734         case nir_texop_txl:
3735         case nir_texop_txb:
3736                 bi_tex_single_to(b, dest, idx, image_src, bi_zero(),
3737                                  instr->is_array, dim, regfmt, instr->is_shadow,
3738                                  explicit_offset, lod_mode, mask, sr_count);
3739                 break;
3740         case nir_texop_txf:
3741         case nir_texop_txf_ms:
3742                 bi_tex_fetch_to(b, dest, idx, image_src, bi_zero(),
3743                                 instr->is_array, dim, regfmt, explicit_offset,
3744                                 mask, sr_count);
3745                 break;
3746         case nir_texop_tg4:
3747                 bi_tex_gather_to(b, dest, idx, image_src, bi_zero(),
3748                                  instr->is_array, dim, instr->component, false,
3749                                  regfmt, instr->is_shadow, explicit_offset,
3750                                  mask, sr_count);
3751                 break;
3752         default:
3753                 unreachable("Unhandled Valhall texture op");
3754         }
3755 
3756         bi_index w[4] = { bi_null(), bi_null(), bi_null(), bi_null() };
3757         bi_emit_split_i32(b, w, dest, res_size);
3758         bi_emit_collect_to(b, bi_dest_index(&instr->dest), w,
3759                         DIV_ROUND_UP(nir_dest_num_components(instr->dest) * res_size, 4));
3760 }
3761 
3762 /* Simple texture ops correspond to NIR tex or txl with LOD = 0 on 2D/cube
3763  * textures with sufficiently small immediate indices. Anything else
3764  * needs a complete texture op. */
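     /* For instance, a plain texture(sampler2D, uv) lookup with small immediate
      * indices can take the compact TEXS path below, while shadow, array, offset
      * or gradient variants need the full TEXC path. */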
3765 
3766 static void
3767 bi_emit_texs(bi_builder *b, nir_tex_instr *instr)
3768 {
3769         int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
3770         assert(coord_idx >= 0);
3771         bi_index coords = bi_src_index(&instr->src[coord_idx].src);
3772 
3773         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3774                 bi_index face, s, t;
3775                 bi_emit_cube_coord(b, coords, &face, &s, &t);
3776 
3777                 bi_texs_cube_to(b, nir_dest_bit_size(instr->dest),
3778                                 bi_dest_index(&instr->dest),
3779                                 s, t, face,
3780                                 instr->sampler_index, instr->texture_index);
3781         } else {
3782                 bi_texs_2d_to(b, nir_dest_bit_size(instr->dest),
3783                                 bi_dest_index(&instr->dest),
3784                                 bi_extract(b, coords, 0),
3785                                 bi_extract(b, coords, 1),
3786                                 instr->op != nir_texop_tex, /* zero LOD */
3787                                 instr->sampler_index, instr->texture_index);
3788         }
3789 
3790         bi_split_dest(b, instr->dest);
3791 }
3792 
3793 static bool
3794 bi_is_simple_tex(nir_tex_instr *instr)
3795 {
3796         if (instr->op != nir_texop_tex && instr->op != nir_texop_txl)
3797                 return false;
3798 
3799         if (instr->dest_type != nir_type_float32 &&
3800             instr->dest_type != nir_type_float16)
3801                 return false;
3802 
3803         if (instr->is_shadow || instr->is_array)
3804                 return false;
3805 
3806         switch (instr->sampler_dim) {
3807         case GLSL_SAMPLER_DIM_2D:
3808         case GLSL_SAMPLER_DIM_EXTERNAL:
3809         case GLSL_SAMPLER_DIM_RECT:
3810                 break;
3811 
3812         case GLSL_SAMPLER_DIM_CUBE:
3813                 /* LOD can't be specified with TEXS_CUBE */
3814                 if (instr->op == nir_texop_txl)
3815                         return false;
3816                 break;
3817 
3818         default:
3819                 return false;
3820         }
3821 
3822         for (unsigned i = 0; i < instr->num_srcs; ++i) {
3823                 if (instr->src[i].src_type != nir_tex_src_lod &&
3824                     instr->src[i].src_type != nir_tex_src_coord)
3825                         return false;
3826         }
3827 
3828         /* Indices need to fit in provided bits */
3829         unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 2 : 3;
3830         if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits))
3831                 return false;
3832 
3833         int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
3834         if (lod_idx < 0)
3835                 return true;
3836 
3837         nir_src lod = instr->src[lod_idx].src;
3838         return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0;
3839 }
3840 
3841 static void
3842 bi_emit_tex(bi_builder *b, nir_tex_instr *instr)
3843 {
3844         switch (instr->op) {
3845         case nir_texop_txs:
3846                 bi_load_sysval_to(b, bi_dest_index(&instr->dest),
3847                                 panfrost_sysval_for_instr(&instr->instr, NULL),
3848                                 nir_dest_num_components(instr->dest), 0);
3849                 return;
3850         case nir_texop_tex:
3851         case nir_texop_txl:
3852         case nir_texop_txb:
3853         case nir_texop_txf:
3854         case nir_texop_txf_ms:
3855         case nir_texop_tg4:
3856                 break;
3857         default:
3858                 unreachable("Invalid texture operation");
3859         }
3860 
3861         if (b->shader->arch >= 9)
3862                 bi_emit_tex_valhall(b, instr);
3863         else if (bi_is_simple_tex(instr))
3864                 bi_emit_texs(b, instr);
3865         else
3866                 bi_emit_texc(b, instr);
3867 }
3868 
3869 static void
3870 bi_emit_instr(bi_builder *b, struct nir_instr *instr)
3871 {
3872         switch (instr->type) {
3873         case nir_instr_type_load_const:
3874                 bi_emit_load_const(b, nir_instr_as_load_const(instr));
3875                 break;
3876 
3877         case nir_instr_type_intrinsic:
3878                 bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
3879                 break;
3880 
3881         case nir_instr_type_alu:
3882                 bi_emit_alu(b, nir_instr_as_alu(instr));
3883                 break;
3884 
3885         case nir_instr_type_tex:
3886                 bi_emit_tex(b, nir_instr_as_tex(instr));
3887                 break;
3888 
3889         case nir_instr_type_jump:
3890                 bi_emit_jump(b, nir_instr_as_jump(instr));
3891                 break;
3892 
3893         default:
3894                 unreachable("should've been lowered");
3895         }
3896 }
3897 
3898 static bi_block *
3899 create_empty_block(bi_context *ctx)
3900 {
3901         bi_block *blk = rzalloc(ctx, bi_block);
3902 
3903         util_dynarray_init(&blk->predecessors, blk);
3904 
3905         return blk;
3906 }
3907 
3908 static bi_block *
3909 emit_block(bi_context *ctx, nir_block *block)
3910 {
3911         if (ctx->after_block) {
3912                 ctx->current_block = ctx->after_block;
3913                 ctx->after_block = NULL;
3914         } else {
3915                 ctx->current_block = create_empty_block(ctx);
3916         }
3917 
3918         list_addtail(&ctx->current_block->link, &ctx->blocks);
3919         list_inithead(&ctx->current_block->instructions);
3920 
3921         bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
3922 
3923         nir_foreach_instr(instr, block) {
3924                 bi_emit_instr(&_b, instr);
3925                 ++ctx->instruction_count;
3926         }
3927 
3928         return ctx->current_block;
3929 }
3930 
3931 static void
3932 emit_if(bi_context *ctx, nir_if *nif)
3933 {
3934         bi_block *before_block = ctx->current_block;
3935 
3936         /* Speculatively emit the branch, but we can't fill it in until later */
3937         bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
3938         bi_instr *then_branch = bi_branchz_i16(&_b,
3939                         bi_half(bi_src_index(&nif->condition), false),
3940                         bi_zero(), BI_CMPF_EQ);
3941 
3942         /* Emit the two subblocks. */
3943         bi_block *then_block = emit_cf_list(ctx, &nif->then_list);
3944         bi_block *end_then_block = ctx->current_block;
3945 
3946         /* Emit second block, and check if it's empty */
3947 
3948         int count_in = ctx->instruction_count;
3949         bi_block *else_block = emit_cf_list(ctx, &nif->else_list);
3950         bi_block *end_else_block = ctx->current_block;
3951         ctx->after_block = create_empty_block(ctx);
3952 
3953         /* Now that we have the subblocks emitted, fix up the branches */
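        /* A sketch of the CFG built here when the else list is non-empty:
         *
         *   before --BRANCHZ--> else ---------------> after
         *      |                                        ^
         *      +--> then (fallthrough) --JUMP-----------+
         *
         * With an empty else list, the branch targets after_block directly and
         * the then block falls through to it. */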
3954 
3955         assert(then_block);
3956         assert(else_block);
3957 
3958         if (ctx->instruction_count == count_in) {
3959                 then_branch->branch_target = ctx->after_block;
3960                 bi_block_add_successor(end_then_block, ctx->after_block); /* fallthrough */
3961         } else {
3962                 then_branch->branch_target = else_block;
3963 
3964                 /* Emit a jump from the end of the then block to the block after the else */
3965                 _b.cursor = bi_after_block(end_then_block);
3966                 bi_instr *then_exit = bi_jump(&_b, bi_zero());
3967                 then_exit->branch_target = ctx->after_block;
3968 
3969                 bi_block_add_successor(end_then_block, then_exit->branch_target);
3970                 bi_block_add_successor(end_else_block, ctx->after_block); /* fallthrough */
3971         }
3972 
3973         bi_block_add_successor(before_block, then_branch->branch_target); /* then_branch */
3974         bi_block_add_successor(before_block, then_block); /* fallthrough */
3975 }
3976 
3977 static void
3978 emit_loop(bi_context *ctx, nir_loop *nloop)
3979 {
3980         /* Remember where we are */
3981         bi_block *start_block = ctx->current_block;
3982 
3983         bi_block *saved_break = ctx->break_block;
3984         bi_block *saved_continue = ctx->continue_block;
3985 
3986         ctx->continue_block = create_empty_block(ctx);
3987         ctx->break_block = create_empty_block(ctx);
3988         ctx->after_block = ctx->continue_block;
3989 
3990         /* Emit the body itself */
3991         emit_cf_list(ctx, &nloop->body);
3992 
3993         /* Branch back to the top of the loop */
3994         bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
3995         bi_instr *I = bi_jump(&_b, bi_zero());
3996         I->branch_target = ctx->continue_block;
3997         bi_block_add_successor(start_block, ctx->continue_block);
3998         bi_block_add_successor(ctx->current_block, ctx->continue_block);
3999 
4000         ctx->after_block = ctx->break_block;
4001 
4002         /* Pop off */
4003         ctx->break_block = saved_break;
4004         ctx->continue_block = saved_continue;
4005         ++ctx->loop_count;
4006 }
4007 
4008 static bi_block *
4009 emit_cf_list(bi_context *ctx, struct exec_list *list)
4010 {
4011         bi_block *start_block = NULL;
4012 
4013         foreach_list_typed(nir_cf_node, node, node, list) {
4014                 switch (node->type) {
4015                 case nir_cf_node_block: {
4016                         bi_block *block = emit_block(ctx, nir_cf_node_as_block(node));
4017 
4018                         if (!start_block)
4019                                 start_block = block;
4020 
4021                         break;
4022                 }
4023 
4024                 case nir_cf_node_if:
4025                         emit_if(ctx, nir_cf_node_as_if(node));
4026                         break;
4027 
4028                 case nir_cf_node_loop:
4029                         emit_loop(ctx, nir_cf_node_as_loop(node));
4030                         break;
4031 
4032                 default:
4033                         unreachable("Unknown control flow");
4034                 }
4035         }
4036 
4037         return start_block;
4038 }
4039 
4040 /* shader-db stuff */
4041 
4042 struct bi_stats {
4043         unsigned nr_clauses, nr_tuples, nr_ins;
4044         unsigned nr_arith, nr_texture, nr_varying, nr_ldst;
4045 };
4046 
4047 static void
4048 bi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats)
4049 {
4050         /* Count instructions */
4051         stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 1 : 0);
4052 
4053         /* Non-message passing tuples are always arithmetic */
4054         if (tuple->add != clause->message) {
4055                 stats->nr_arith++;
4056                 return;
4057         }
4058 
4059         /* Message + FMA we'll count as arithmetic _and_ message */
4060         if (tuple->fma)
4061                 stats->nr_arith++;
4062 
4063         switch (clause->message_type) {
4064         case BIFROST_MESSAGE_VARYING:
4065                 /* Count components interpolated */
4066                 stats->nr_varying += (clause->message->vecsize + 1) *
4067                         (bi_is_regfmt_16(clause->message->register_format) ? 1 : 2);
4068                 break;
4069 
4070         case BIFROST_MESSAGE_VARTEX:
4071                 /* 2 coordinates, fp32 each */
4072                 stats->nr_varying += (2 * 2);
4073                 FALLTHROUGH;
4074         case BIFROST_MESSAGE_TEX:
4075                 stats->nr_texture++;
4076                 break;
4077 
4078         case BIFROST_MESSAGE_ATTRIBUTE:
4079         case BIFROST_MESSAGE_LOAD:
4080         case BIFROST_MESSAGE_STORE:
4081         case BIFROST_MESSAGE_ATOMIC:
4082                 stats->nr_ldst++;
4083                 break;
4084 
4085         case BIFROST_MESSAGE_NONE:
4086         case BIFROST_MESSAGE_BARRIER:
4087         case BIFROST_MESSAGE_BLEND:
4088         case BIFROST_MESSAGE_TILE:
4089         case BIFROST_MESSAGE_Z_STENCIL:
4090         case BIFROST_MESSAGE_ATEST:
4091         case BIFROST_MESSAGE_JOB:
4092         case BIFROST_MESSAGE_64BIT:
4093                 /* Nothing to do */
4094                 break;
4095         };
4096 
4097 }
4098 
4099 /*
4100  * v7 allows preloading LD_VAR or VAR_TEX messages that must complete before the
4101  * shader completes. These costs are not accounted for in the general cycle
4102  * counts, so this function calculates the effective cost of these messages, as
4103  * if they were executed by shader code.
4104  */
4105 static unsigned
4106 bi_count_preload_cost(bi_context *ctx)
4107 {
4108         /* Units: 1/16 of a normalized cycle, assuming that we may interpolate
4109          * 16 fp16 varying components per cycle or fetch two texels per cycle.
4110          */
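        /* A worked example under this model: a preloaded VAR_TEX message is
         * counted at 8 (one texel) + 4 (two fp16 coordinate pairs) = 12, and a
         * preloaded fp32 vec4 LD_VAR at 4 * 2 = 8, i.e. 12/16 and 8/16 of a
         * cycle respectively, matching the costs added below. */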
4111         unsigned cost = 0;
4112 
4113         for (unsigned i = 0; i < ARRAY_SIZE(ctx->info.bifrost->messages); ++i) {
4114                 struct bifrost_message_preload msg = ctx->info.bifrost->messages[i];
4115 
4116                 if (msg.enabled && msg.texture) {
4117                         /* 2 coordinates, 2 half-words each, plus texture */
4118                         cost += 12;
4119                 } else if (msg.enabled) {
4120                         cost += (msg.num_components * (msg.fp16 ? 1 : 2));
4121                 }
4122         }
4123 
4124         return cost;
4125 }
4126 
4127 static const char *
4128 bi_shader_stage_name(bi_context *ctx)
4129 {
4130         if (ctx->idvs == BI_IDVS_VARYING)
4131                 return "MESA_SHADER_VARYING";
4132         else if (ctx->idvs == BI_IDVS_POSITION)
4133                 return "MESA_SHADER_POSITION";
4134         else if (ctx->inputs->is_blend)
4135                 return "MESA_SHADER_BLEND";
4136         else
4137                 return gl_shader_stage_name(ctx->stage);
4138 }
4139 
4140 static void
4141 bi_print_stats(bi_context *ctx, unsigned size, FILE *fp)
4142 {
4143         struct bi_stats stats = { 0 };
4144 
4145         /* Count instructions, clauses, and tuples. Also attempt to construct
4146          * normalized execution engine cycle counts, using the following ratio:
4147          *
4148          * 24 arith tuples/cycle
4149          * 2 texture messages/cycle
4150          * 16 x 16-bit varying channels interpolated/cycle
4151          * 1 load store message/cycle
4152          *
4153          * These numbers seem to match Arm Mobile Studio's heuristic. The real
4154          * cycle counts are surely more complicated.
4155          */
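        /* For example, under these ratios a shader with 96 arithmetic tuples,
         * 2 texture messages, 32 16-bit varying channels and 1 load/store
         * message normalizes to max(96/24, 2/2, 32/16, 1/1) = 4.0 cycles and
         * is therefore bound by the arithmetic pipe. */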
4156 
4157         bi_foreach_block(ctx, block) {
4158                 bi_foreach_clause_in_block(block, clause) {
4159                         stats.nr_clauses++;
4160                         stats.nr_tuples += clause->tuple_count;
4161 
4162                         for (unsigned i = 0; i < clause->tuple_count; ++i)
4163                                 bi_count_tuple_stats(clause, &clause->tuples[i], &stats);
4164                 }
4165         }
4166 
4167         float cycles_arith = ((float) stats.nr_arith) / 24.0;
4168         float cycles_texture = ((float) stats.nr_texture) / 2.0;
4169         float cycles_varying = ((float) stats.nr_varying) / 16.0;
4170         float cycles_ldst = ((float) stats.nr_ldst) / 1.0;
4171 
4172         float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst);
4173         float cycles_bound = MAX2(cycles_arith, cycles_message);
4174 
4175         /* Thread count and register pressure are traded off only on v7 */
4176         bool full_threads = (ctx->arch == 7 && ctx->info.work_reg_count <= 32);
4177         unsigned nr_threads = full_threads ? 2 : 1;
4178 
4179         /* Dump stats */
4180         char *str = ralloc_asprintf(NULL, "%s - %s shader: "
4181                         "%u inst, %u tuples, %u clauses, "
4182                         "%f cycles, %f arith, %f texture, %f vary, %f ldst, "
4183                         "%u quadwords, %u threads",
4184                         ctx->nir->info.label ?: "",
4185                         bi_shader_stage_name(ctx),
4186                         stats.nr_ins, stats.nr_tuples, stats.nr_clauses,
4187                         cycles_bound, cycles_arith, cycles_texture,
4188                         cycles_varying, cycles_ldst,
4189                         size / 16, nr_threads);
4190 
4191         if (ctx->arch == 7) {
4192                 ralloc_asprintf_append(&str, ", %u preloads", bi_count_preload_cost(ctx));
4193         }
4194 
4195         ralloc_asprintf_append(&str, ", %u loops, %u:%u spills:fills\n",
4196                         ctx->loop_count, ctx->spills, ctx->fills);
4197 
4198         fputs(str, stderr);
4199         ralloc_free(str);
4200 }
4201 
4202 static void
4203 va_print_stats(bi_context *ctx, unsigned size, FILE *fp)
4204 {
4205         unsigned nr_ins = 0;
4206         struct va_stats stats = { 0 };
4207 
4208         /* Count instructions */
4209         bi_foreach_instr_global(ctx, I) {
4210                 nr_ins++;
4211                 va_count_instr_stats(I, &stats);
4212         }
4213 
4214         /* Mali G78 peak performance:
4215          *
4216          * 64 FMA instructions per cycle
4217          * 64 CVT instructions per cycle
4218          * 16 SFU instructions per cycle
4219          * 8 x 32-bit varying channels interpolated per cycle
4220          * 4 texture instructions per cycle
4221          * 1 load/store operation per cycle
4222          */
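        /* For example, a shader with 128 FMA, 64 CVT, 8 SFU, 8 texture and
         * 2 load/store instructions (and no varying interpolation) normalizes
         * to max(128/64, 64/64, 8/16, 8/4, 2/1) = 2.0 cycles, bound equally by
         * the FMA, texture and load/store units. */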
4223 
4224         float cycles_fma = ((float) stats.fma) / 64.0;
4225         float cycles_cvt = ((float) stats.cvt) / 64.0;
4226         float cycles_sfu = ((float) stats.sfu) / 16.0;
4227         float cycles_v = ((float) stats.v) / 16.0;
4228         float cycles_t = ((float) stats.t) / 4.0;
4229         float cycles_ls = ((float) stats.ls) / 1.0;
4230 
4231         /* Calculate the bound */
4232         float cycles = MAX2(
4233                         MAX3(cycles_fma, cycles_cvt, cycles_sfu),
4234                         MAX3(cycles_v,   cycles_t,   cycles_ls));
4235 
4236 
4237         /* Thread count and register pressure are traded off */
4238         unsigned nr_threads = (ctx->info.work_reg_count <= 32) ? 2 : 1;
4239 
4240         /* Dump stats */
4241         fprintf(stderr, "%s - %s shader: "
4242                         "%u inst, %f cycles, %f fma, %f cvt, %f sfu, %f v, "
4243                         "%f t, %f ls, %u quadwords, %u threads, %u loops, "
4244                         "%u:%u spills:fills\n",
4245                         ctx->nir->info.label ?: "",
4246                         bi_shader_stage_name(ctx),
4247                         nr_ins, cycles, cycles_fma, cycles_cvt, cycles_sfu,
4248                         cycles_v, cycles_t, cycles_ls, size / 16, nr_threads,
4249                         ctx->loop_count, ctx->spills, ctx->fills);
4250 }
4251 
4252 static int
4253 glsl_type_size(const struct glsl_type *type, bool bindless)
4254 {
4255         return glsl_count_attribute_slots(type, false);
4256 }
4257 
4258 /* Split stores to memory. We don't split stores to vertex outputs, since
4259  * nir_lower_io_to_temporaries will ensure there's only a single write.
4260  */
4261 
4262 static bool
4263 should_split_wrmask(const nir_instr *instr, UNUSED const void *data)
4264 {
4265         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
4266 
4267         switch (intr->intrinsic) {
4268         case nir_intrinsic_store_ssbo:
4269         case nir_intrinsic_store_shared:
4270         case nir_intrinsic_store_global:
4271         case nir_intrinsic_store_scratch:
4272                 return true;
4273         default:
4274                 return false;
4275         }
4276 }
4277 
4278 /* Bifrost wants transcendentals as FP32 */
4279 
4280 static unsigned
4281 bi_lower_bit_size(const nir_instr *instr, UNUSED void *data)
4282 {
4283         if (instr->type != nir_instr_type_alu)
4284                 return 0;
4285 
4286         nir_alu_instr *alu = nir_instr_as_alu(instr);
4287 
4288         switch (alu->op) {
4289         case nir_op_fexp2:
4290         case nir_op_flog2:
4291         case nir_op_fpow:
4292         case nir_op_fsin:
4293         case nir_op_fcos:
4294                 return (nir_dest_bit_size(alu->dest.dest) == 32) ? 0 : 32;
4295         default:
4296                 return 0;
4297         }
4298 }
4299 
4300 /* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4,
4301  * transcendentals are an exception. Shifts are another exception, due to the
4302  * lane size mismatch (8-bit in Bifrost, 32-bit in NIR; TODO: work around this).
4303  * Some conversions need to be scalarized due to type size. */
4304 
4305 static uint8_t
4306 bi_vectorize_filter(const nir_instr *instr, const void *data)
4307 {
4308         /* Defaults work for everything else */
4309         if (instr->type != nir_instr_type_alu)
4310                 return 0;
4311 
4312         const nir_alu_instr *alu = nir_instr_as_alu(instr);
4313 
4314         switch (alu->op) {
4315         case nir_op_frcp:
4316         case nir_op_frsq:
4317         case nir_op_ishl:
4318         case nir_op_ishr:
4319         case nir_op_ushr:
4320         case nir_op_f2i16:
4321         case nir_op_f2u16:
4322                 return 1;
4323         default:
4324                 break;
4325         }
4326 
4327         /* Vectorized instructions cannot write more than 32 bits */
4328         int dst_bit_size = nir_dest_bit_size(alu->dest.dest);
4329         if (dst_bit_size == 16)
4330                 return 2;
4331         else
4332                 return 1;
4333 }
4334 
4335 static bool
4336 bi_scalarize_filter(const nir_instr *instr, const void *data)
4337 {
4338         if (instr->type != nir_instr_type_alu)
4339                 return false;
4340 
4341         const nir_alu_instr *alu = nir_instr_as_alu(instr);
4342 
4343         switch (alu->op) {
4344         case nir_op_pack_uvec2_to_uint:
4345         case nir_op_pack_uvec4_to_uint:
4346                 return false;
4347         default:
4348                 return true;
4349         }
4350 }
4351 
4352 /* XXX: This is a kludge to work around NIR's lack of divergence metadata. If we
4353  * keep divergence info around after we consume it for indirect lowering,
4354  * nir_convert_from_ssa will regress code quality since it will avoid
4355  * coalescing divergent with non-divergent nodes. */
4356 
4357 static bool
4358 nir_invalidate_divergence_ssa(nir_ssa_def *ssa, UNUSED void *data)
4359 {
4360         ssa->divergent = false;
4361         return true;
4362 }
4363 
4364 static bool
4365 nir_invalidate_divergence(struct nir_builder *b, nir_instr *instr,
4366                 UNUSED void *data)
4367 {
4368         return nir_foreach_ssa_def(instr, nir_invalidate_divergence_ssa, NULL);
4369 }
4370 
4371 /* Ensure we write exactly 4 components */
4372 static nir_ssa_def *
4373 bifrost_nir_valid_channel(nir_builder *b, nir_ssa_def *in,
4374                           unsigned channel, unsigned first, unsigned mask)
4375 {
4376         if (!(mask & BITFIELD_BIT(channel)))
4377                 channel = first;
4378 
4379         return nir_channel(b, in, channel);
4380 }
4381 
4382 /* Lower fragment store_output instructions to always write 4 components,
4383  * matching the hardware semantic. This may require additional moves. Skipping
4384  * these moves is possible in theory, but invokes undefined behaviour in the
4385  * compiler. The DDK inserts these moves, so we will as well. */
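/* For example, a store_output of a vec3 with write mask 0x7 is rewritten by
 * the pass below into a vec4 store of (x, y, z, x) with write mask 0xF: any
 * channel that was not written is filled by replicating the first written
 * component. */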
4386 
4387 static bool
4388 bifrost_nir_lower_blend_components(struct nir_builder *b,
4389                                    nir_instr *instr, void *data)
4390 {
4391         if (instr->type != nir_instr_type_intrinsic)
4392                 return false;
4393 
4394         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
4395 
4396         if (intr->intrinsic != nir_intrinsic_store_output)
4397                 return false;
4398 
4399         nir_ssa_def *in = intr->src[0].ssa;
4400         unsigned first = nir_intrinsic_component(intr);
4401         unsigned mask = nir_intrinsic_write_mask(intr);
4402 
4403         assert(first == 0 && "shouldn't get nonzero components");
4404 
4405         /* Nothing to do */
4406         if (mask == BITFIELD_MASK(4))
4407                 return false;
4408 
4409         b->cursor = nir_before_instr(&intr->instr);
4410 
4411         /* Replicate the first valid component instead */
4412         nir_ssa_def *replicated =
4413                 nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask),
4414                             bifrost_nir_valid_channel(b, in, 1, first, mask),
4415                             bifrost_nir_valid_channel(b, in, 2, first, mask),
4416                             bifrost_nir_valid_channel(b, in, 3, first, mask));
4417 
4418         /* Rewrite to use our replicated version */
4419         nir_instr_rewrite_src_ssa(instr, &intr->src[0], replicated);
4420         nir_intrinsic_set_component(intr, 0);
4421         nir_intrinsic_set_write_mask(intr, 0xF);
4422         intr->num_components = 4;
4423 
4424         return true;
4425 }
4426 
4427 static void
4428 bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
4429 {
4430         bool progress;
4431         unsigned lower_flrp = 16 | 32 | 64;
4432 
4433         NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
4434 
4435         nir_lower_tex_options lower_tex_options = {
4436                 .lower_txs_lod = true,
4437                 .lower_txp = ~0,
4438                 .lower_tg4_broadcom_swizzle = true,
4439                 .lower_txd = true,
4440                 .lower_invalid_implicit_lod = true,
4441         };
4442 
4443         NIR_PASS(progress, nir, pan_nir_lower_64bit_intrin);
4444         NIR_PASS(progress, nir, pan_lower_helper_invocation);
4445 
4446         NIR_PASS(progress, nir, nir_lower_int64);
4447 
4448         nir_lower_idiv_options idiv_options = {
4449                 .imprecise_32bit_lowering = true,
4450                 .allow_fp16 = true,
4451         };
4452         NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options);
4453 
4454         NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options);
4455         NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
4456         NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
4457 
4458         do {
4459                 progress = false;
4460 
4461                 NIR_PASS(progress, nir, nir_lower_var_copies);
4462                 NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
4463                 NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL);
4464 
4465                 NIR_PASS(progress, nir, nir_copy_prop);
4466                 NIR_PASS(progress, nir, nir_opt_remove_phis);
4467                 NIR_PASS(progress, nir, nir_opt_dce);
4468                 NIR_PASS(progress, nir, nir_opt_dead_cf);
4469                 NIR_PASS(progress, nir, nir_opt_cse);
4470                 NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
4471                 NIR_PASS(progress, nir, nir_opt_algebraic);
4472                 NIR_PASS(progress, nir, nir_opt_constant_folding);
4473 
4474                 NIR_PASS(progress, nir, nir_lower_alu);
4475 
4476                 if (lower_flrp != 0) {
4477                         bool lower_flrp_progress = false;
4478                         NIR_PASS(lower_flrp_progress,
4479                                  nir,
4480                                  nir_lower_flrp,
4481                                  lower_flrp,
4482                                  false /* always_precise */);
4483                         if (lower_flrp_progress) {
4484                                 NIR_PASS(progress, nir,
4485                                          nir_opt_constant_folding);
4486                                 progress = true;
4487                         }
4488 
4489                         /* Nothing should rematerialize any flrps, so we only
4490                          * need to do this lowering once.
4491                          */
4492                         lower_flrp = 0;
4493                 }
4494 
4495                 NIR_PASS(progress, nir, nir_opt_undef);
4496                 NIR_PASS(progress, nir, nir_lower_undef_to_zero);
4497 
4498                 NIR_PASS(progress, nir, nir_opt_shrink_vectors);
4499                 NIR_PASS(progress, nir, nir_opt_loop_unroll);
4500         } while (progress);
4501 
4502         /* TODO: Why is 64-bit getting rematerialized?
4503          * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */
4504         NIR_PASS(progress, nir, nir_lower_int64);
4505 
4506         /* We need to clean up after each iteration of late algebraic
4507          * optimizations, since otherwise NIR can produce weird edge cases
4508          * (like fneg of a constant) which we don't handle */
4509         bool late_algebraic = true;
4510         while (late_algebraic) {
4511                 late_algebraic = false;
4512                 NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
4513                 NIR_PASS(progress, nir, nir_opt_constant_folding);
4514                 NIR_PASS(progress, nir, nir_copy_prop);
4515                 NIR_PASS(progress, nir, nir_opt_dce);
4516                 NIR_PASS(progress, nir, nir_opt_cse);
4517         }
4518 
4519         NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
4520         NIR_PASS(progress, nir, nir_lower_phis_to_scalar, true);
4521         NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL);
4522         NIR_PASS(progress, nir, nir_lower_bool_to_bitsize);
4523 
4524         /* Prepass to simplify instruction selection */
4525         late_algebraic = false;
4526         NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late);
4527 
4528         while (late_algebraic) {
4529                 late_algebraic = false;
4530                 NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
4531                 NIR_PASS(progress, nir, nir_opt_constant_folding);
4532                 NIR_PASS(progress, nir, nir_copy_prop);
4533                 NIR_PASS(progress, nir, nir_opt_dce);
4534                 NIR_PASS(progress, nir, nir_opt_cse);
4535         }
4536 
4537         NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
4538         NIR_PASS(progress, nir, nir_opt_dce);
4539 
4540         if (nir->info.stage == MESA_SHADER_FRAGMENT) {
4541                 NIR_PASS_V(nir, nir_shader_instructions_pass,
4542                            bifrost_nir_lower_blend_components,
4543                            nir_metadata_block_index | nir_metadata_dominance,
4544                            NULL);
4545         }
4546 
4547         /* Backend scheduler is purely local, so do some global optimizations
4548          * to reduce register pressure. */
4549         nir_move_options move_all =
4550                 nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
4551                 nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
4552 
4553         NIR_PASS_V(nir, nir_opt_sink, move_all);
4554         NIR_PASS_V(nir, nir_opt_move, move_all);
4555 
4556         /* We might lower attribute, varying, and image indirects. Use the
4557          * gathered info to skip the extra analysis in the happy path. */
4558         bool any_indirects =
4559                 nir->info.inputs_read_indirectly ||
4560                 nir->info.outputs_accessed_indirectly ||
4561                 nir->info.patch_inputs_read_indirectly ||
4562                 nir->info.patch_outputs_accessed_indirectly ||
4563                 nir->info.images_used[0];
4564 
4565         if (any_indirects) {
4566                 nir_convert_to_lcssa(nir, true, true);
4567                 NIR_PASS_V(nir, nir_divergence_analysis);
4568                 NIR_PASS_V(nir, bi_lower_divergent_indirects,
4569                                 pan_subgroup_size(gpu_id >> 12));
4570                 NIR_PASS_V(nir, nir_shader_instructions_pass,
4571                         nir_invalidate_divergence, nir_metadata_all, NULL);
4572         }
4573 }
4574 
4575 /* The cmdstream lowers 8-bit fragment output as 16-bit, so we need to do the
4576  * same lowering here to zero-extend correctly */
4577 
4578 static bool
4579 bifrost_nir_lower_i8_fragout_impl(struct nir_builder *b,
4580                 nir_intrinsic_instr *intr, UNUSED void *data)
4581 {
4582         if (nir_src_bit_size(intr->src[0]) != 8)
4583                 return false;
4584 
4585         nir_alu_type type =
4586                 nir_alu_type_get_base_type(nir_intrinsic_src_type(intr));
4587 
4588         assert(type == nir_type_int || type == nir_type_uint);
4589 
4590         b->cursor = nir_before_instr(&intr->instr);
4591         nir_ssa_def *cast = nir_convert_to_bit_size(b, intr->src[0].ssa, type, 16);
4592 
4593         nir_intrinsic_set_src_type(intr, type | 16);
4594         nir_instr_rewrite_src_ssa(&intr->instr, &intr->src[0], cast);
4595         return true;
4596 }
4597 
4598 static bool
4599 bifrost_nir_lower_i8_fragin_impl(struct nir_builder *b,
4600                 nir_intrinsic_instr *intr, UNUSED void *data)
4601 {
4602         if (nir_dest_bit_size(intr->dest) != 8)
4603                 return false;
4604 
4605         nir_alu_type type =
4606                 nir_alu_type_get_base_type(nir_intrinsic_dest_type(intr));
4607 
4608         assert(type == nir_type_int || type == nir_type_uint);
4609 
4610         b->cursor = nir_before_instr(&intr->instr);
4611         nir_ssa_def *out =
4612                 nir_load_output(b, intr->num_components, 16, intr->src[0].ssa,
4613                         .base = nir_intrinsic_base(intr),
4614                         .component = nir_intrinsic_component(intr),
4615                         .dest_type = type | 16,
4616                         .io_semantics = nir_intrinsic_io_semantics(intr));
4617 
4618         nir_ssa_def *cast = nir_convert_to_bit_size(b, out, type, 8);
4619         nir_ssa_def_rewrite_uses(&intr->dest.ssa, cast);
4620         return true;
4621 }
4622 
4623 static bool
4624 bifrost_nir_lower_i8_frag(struct nir_builder *b,
4625                 nir_instr *instr, UNUSED void *data)
4626 {
4627         if (instr->type != nir_instr_type_intrinsic)
4628                 return false;
4629 
4630         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
4631         if (intr->intrinsic == nir_intrinsic_load_output)
4632                 return bifrost_nir_lower_i8_fragin_impl(b, intr, data);
4633         else if (intr->intrinsic == nir_intrinsic_store_output)
4634                 return bifrost_nir_lower_i8_fragout_impl(b, intr, data);
4635         else
4636                 return false;
4637 }
4638 
4639 static void
4640 bi_opt_post_ra(bi_context *ctx)
4641 {
4642         bi_foreach_instr_global_safe(ctx, ins) {
4643                 if (ins->op == BI_OPCODE_MOV_I32 && bi_is_equiv(ins->dest[0], ins->src[0]))
4644                         bi_remove_instruction(ins);
4645         }
4646 }
4647 
4648 /* If the shader packs multiple varyings into the same location with different
4649  * location_frac, we'll need to lower to a single varying store that collects
4650  * all of the channels together.
4651  */
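/* As a sketch of what the pass below produces: if one store_output writes
 * components 0-1 of a location and a second writes components 2-3 of the same
 * location, the two are merged into a single vec4 store with write mask 0xF;
 * channels written by neither store remain undef. */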
4652 static bool
4653 bifrost_nir_lower_store_component(struct nir_builder *b,
4654                 nir_instr *instr, void *data)
4655 {
4656         if (instr->type != nir_instr_type_intrinsic)
4657                 return false;
4658 
4659         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
4660 
4661         if (intr->intrinsic != nir_intrinsic_store_output)
4662                 return false;
4663 
4664         struct hash_table_u64 *slots = data;
4665         unsigned component = nir_intrinsic_component(intr);
4666         nir_src *slot_src = nir_get_io_offset_src(intr);
4667         uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr);
4668 
4669         nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot);
4670         unsigned mask = (prev ? nir_intrinsic_write_mask(prev) : 0);
4671 
4672         nir_ssa_def *value = intr->src[0].ssa;
4673         b->cursor = nir_before_instr(&intr->instr);
4674 
4675         nir_ssa_def *undef = nir_ssa_undef(b, 1, value->bit_size);
4676         nir_ssa_def *channels[4] = { undef, undef, undef, undef };
4677 
4678         /* Copy old */
4679         u_foreach_bit(i, mask) {
4680                 assert(prev != NULL);
4681                 nir_ssa_def *prev_ssa = prev->src[0].ssa;
4682                 channels[i] = nir_channel(b, prev_ssa, i);
4683         }
4684 
4685         /* Copy new */
4686         unsigned new_mask = nir_intrinsic_write_mask(intr);
4687         mask |= (new_mask << component);
4688 
4689         u_foreach_bit(i, new_mask) {
4690                 assert(component + i < 4);
4691                 channels[component + i] = nir_channel(b, value, i);
4692         }
4693 
4694         intr->num_components = util_last_bit(mask);
4695         nir_instr_rewrite_src_ssa(instr, &intr->src[0],
4696                         nir_vec(b, channels, intr->num_components));
4697 
4698         nir_intrinsic_set_component(intr, 0);
4699         nir_intrinsic_set_write_mask(intr, mask);
4700 
4701         if (prev) {
4702                 _mesa_hash_table_u64_remove(slots, slot);
4703                 nir_instr_remove(&prev->instr);
4704         }
4705 
4706         _mesa_hash_table_u64_insert(slots, slot, intr);
4707         return false;
4708 }
4709 
4710 /* Dead code elimination for branches at the end of a block - only one branch
4711  * per block is legal semantically, but unreachable jumps can be generated.
4712  * Likewise on Bifrost we can generate jumps to the terminal block which need
4713  * to be lowered away to a jump to #0x0, which induces successful termination.
4714  * That trick doesn't work on Valhall, which needs a NOP inserted in the
4715  * terminal block instead.
4716  */
4717 static void
4718 bi_lower_branch(bi_context *ctx, bi_block *block)
4719 {
4720         bool cull_terminal = (ctx->arch <= 8);
4721         bool branched = false;
4722         ASSERTED bool was_jump = false;
4723 
4724         bi_foreach_instr_in_block_safe(block, ins) {
4725                 if (!ins->branch_target) continue;
4726 
4727                 if (branched) {
4728                         assert(was_jump && (ins->op == BI_OPCODE_JUMP));
4729                         bi_remove_instruction(ins);
4730                         continue;
4731                 }
4732 
4733                 branched = true;
4734                 was_jump = ins->op == BI_OPCODE_JUMP;
4735 
4736                 if (!bi_is_terminal_block(ins->branch_target))
4737                         continue;
4738 
4739                 if (cull_terminal)
4740                         ins->branch_target = NULL;
4741                 else if (ins->branch_target)
4742                         ins->branch_target->needs_nop = true;
4743         }
4744 }
4745 
4746 static void
4747 bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary, unsigned offset)
4748 {
4749         unsigned final_clause = bi_pack(ctx, binary);
4750 
4751         /* If we need to wait for ATEST or BLEND in the first clause, pass the
4752          * corresponding bits through to the renderer state descriptor */
4753         bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
4754         bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL);
4755 
4756         unsigned first_deps = first_clause ? first_clause->dependencies : 0;
4757         ctx->info.bifrost->wait_6 = (first_deps & (1 << 6));
4758         ctx->info.bifrost->wait_7 = (first_deps & (1 << 7));
4759 
4760         /* Pad the shader with enough zero bytes to trick the prefetcher,
4761          * unless we're compiling an empty shader (in which case we don't pad
4762          * so the size remains 0) */
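        /* For example, if the final clause packs to 48 bytes, 128 - 48 = 80
         * zero bytes are appended so the 128-byte prefetch window after the
         * final clause contains only valid instructions or zeroes. */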
4763         unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause;
4764 
4765         if (binary->size - offset) {
4766                 memset(util_dynarray_grow(binary, uint8_t, prefetch_size),
4767                        0, prefetch_size);
4768         }
4769 }
4770 
4771 /*
4772  * Build a bit mask of varyings (by location) that are flatshaded. This
4773  * information is needed by lower_mediump_io, as we don't yet support 16-bit
4774  * flat varyings.
4775  *
4776  * Also varyings that are used as texture coordinates should be kept at fp32 so
4777  * the texture instruction may be promoted to VAR_TEX. In general this is a good
4778  * idea, as fp16 texture coordinates are not supported by the hardware and are
4779  * usually inappropriate. (There are even relevant CTS bugs for both of these points.)
4780  *
4781  * TODO: If we compacted the varyings with some fixup code in the vertex shader,
4782  * we could implement 16-bit flat varyings. Consider if this case matters.
4783  *
4784  * TODO: The texture coordinate handling could be less heavyhanded.
4785  */
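/* For example, both a flat-shaded integer varying and a smooth vec2 feeding a
 * texture coordinate get their location bit set in this mask, so the
 * nir_lower_mediump_io call in bi_finalize_nir leaves them at fp32. */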
4786 static bool
4787 bi_gather_texcoords(nir_builder *b, nir_instr *instr, void *data)
4788 {
4789         uint64_t *mask = data;
4790 
4791         if (instr->type != nir_instr_type_tex)
4792                 return false;
4793 
4794         nir_tex_instr *tex = nir_instr_as_tex(instr);
4795 
4796         int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
4797         if (coord_idx < 0)
4798                 return false;
4799 
4800         nir_src src = tex->src[coord_idx].src;
4801         assert(src.is_ssa);
4802 
4803         nir_ssa_scalar x = nir_ssa_scalar_resolved(src.ssa, 0);
4804         nir_ssa_scalar y = nir_ssa_scalar_resolved(src.ssa, 1);
4805 
4806         if (x.def != y.def)
4807                 return false;
4808 
4809         nir_instr *parent = x.def->parent_instr;
4810 
4811         if (parent->type != nir_instr_type_intrinsic)
4812                 return false;
4813 
4814         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
4815 
4816         if (intr->intrinsic != nir_intrinsic_load_interpolated_input)
4817                 return false;
4818 
4819         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
4820         *mask |= BITFIELD64_BIT(sem.location);
4821         return false;
4822 }
4823 
4824 static uint64_t
4825 bi_fp32_varying_mask(nir_shader *nir)
4826 {
4827         uint64_t mask = 0;
4828 
4829         assert(nir->info.stage == MESA_SHADER_FRAGMENT);
4830 
4831         nir_foreach_shader_in_variable(var, nir) {
4832                 if (var->data.interpolation == INTERP_MODE_FLAT)
4833                         mask |= BITFIELD64_BIT(var->data.location);
4834         }
4835 
4836         nir_shader_instructions_pass(nir, bi_gather_texcoords, nir_metadata_all, &mask);
4837 
4838         return mask;
4839 }
4840 
4841 static void
4842 bi_finalize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
4843 {
4844         /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
4845          * (so we don't accidentally duplicate the epilogue since mesa/st has
4846          * messed with our I/O quite a bit already) */
4847 
4848         NIR_PASS_V(nir, nir_lower_vars_to_ssa);
4849 
4850         if (nir->info.stage == MESA_SHADER_VERTEX) {
4851                 NIR_PASS_V(nir, nir_lower_viewport_transform);
4852                 NIR_PASS_V(nir, nir_lower_point_size, 1.0, 0.0);
4853 
4854                 nir_variable *psiz = nir_find_variable_with_location(nir,
4855                                                                      nir_var_shader_out,
4856                                                                      VARYING_SLOT_PSIZ);
4857                 if (psiz != NULL)
4858                         psiz->data.precision = GLSL_PRECISION_MEDIUM;
4859         }
4860 
4861         /* Get rid of any global vars before we lower to scratch. */
4862         NIR_PASS_V(nir, nir_lower_global_vars_to_local);
4863 
4864         /* Valhall introduces packed thread local storage, which improves cache
4865          * locality of TLS access. However, access to packed TLS cannot
4866          * straddle 16-byte boundaries. As such, when packed TLS is in use
4867          * (currently unconditional for Valhall), we force vec4 alignment for
4868          * scratch access.
4869          */
4870         bool packed_tls = (gpu_id >= 0x9000);
4871 
4872         /* Lower large arrays to scratch and small arrays to bcsel */
4873         NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
4874                         packed_tls ?
4875                         glsl_get_vec4_size_align_bytes :
4876                         glsl_get_natural_size_align_bytes);
4877         NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
4878 
4879         NIR_PASS_V(nir, nir_split_var_copies);
4880         NIR_PASS_V(nir, nir_lower_var_copies);
4881         NIR_PASS_V(nir, nir_lower_vars_to_ssa);
4882         NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
4883                         glsl_type_size, 0);
4884 
4885         /* nir_lower[_explicit]_io is lazy and emits mul+add chains even for
4886          * offsets it could figure out are constant.  Do some constant folding
4887          * before bifrost_nir_lower_store_component below.
4888          */
4889         NIR_PASS_V(nir, nir_opt_constant_folding);
4890 
4891         if (nir->info.stage == MESA_SHADER_FRAGMENT) {
4892                 NIR_PASS_V(nir, nir_lower_mediump_io,
4893                            nir_var_shader_in | nir_var_shader_out,
4894                            ~bi_fp32_varying_mask(nir), false);
4895         } else {
4896                 if (gpu_id >= 0x9000) {
4897                         NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out,
4898                                         BITFIELD64_BIT(VARYING_SLOT_PSIZ), false);
4899                 }
4900 
4901                 struct hash_table_u64 *stores = _mesa_hash_table_u64_create(NULL);
4902                 NIR_PASS_V(nir, nir_shader_instructions_pass,
4903                                 bifrost_nir_lower_store_component,
4904                                 nir_metadata_block_index |
4905                                 nir_metadata_dominance, stores);
4906                 _mesa_hash_table_u64_destroy(stores);
4907         }
4908 
4909         NIR_PASS_V(nir, nir_lower_ssbo);
4910         NIR_PASS_V(nir, pan_nir_lower_zs_store);
4911         NIR_PASS_V(nir, pan_lower_sample_pos);
4912         NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL);
4913 
4914         if (nir->info.stage == MESA_SHADER_FRAGMENT) {
4915                 NIR_PASS_V(nir, nir_shader_instructions_pass,
4916                                 bifrost_nir_lower_i8_frag,
4917                                 nir_metadata_block_index | nir_metadata_dominance,
4918                                 NULL);
4919         }
4920 
4921         if (nir->xfb_info != NULL && nir->info.has_transform_feedback_varyings) {
4922                 NIR_PASS_V(nir, nir_io_add_const_offset_to_base,
4923                            nir_var_shader_in | nir_var_shader_out);
4924                 NIR_PASS_V(nir, nir_io_add_intrinsic_xfb_info);
4925                 NIR_PASS_V(nir, pan_lower_xfb);
4926         }
4927 
4928         bi_optimize_nir(nir, gpu_id, is_blend);
4929 }
4930 
4931 static bi_context *
4932 bi_compile_variant_nir(nir_shader *nir,
4933                        const struct panfrost_compile_inputs *inputs,
4934                        struct util_dynarray *binary,
4935                        struct hash_table_u64 *sysval_to_id,
4936                        struct bi_shader_info info,
4937                        enum bi_idvs_mode idvs)
4938 {
4939         bi_context *ctx = rzalloc(NULL, bi_context);
4940 
4941         /* There may be another program in the dynarray, start at the end */
4942         unsigned offset = binary->size;
4943 
4944         ctx->sysval_to_id = sysval_to_id;
4945         ctx->inputs = inputs;
4946         ctx->nir = nir;
4947         ctx->stage = nir->info.stage;
4948         ctx->quirks = bifrost_get_quirks(inputs->gpu_id);
4949         ctx->arch = inputs->gpu_id >> 12;
4950         ctx->info = info;
4951         ctx->idvs = idvs;
4952         ctx->malloc_idvs = (ctx->arch >= 9) && !inputs->no_idvs;
4953 
4954         if (idvs != BI_IDVS_NONE) {
4955                 /* Specializing shaders for IDVS is destructive, so we need to
4956                  * clone. However, the last (second) IDVS shader does not need
4957                  * to be preserved so we can skip cloning that one.
4958                  */
4959                 if (offset == 0)
4960                         ctx->nir = nir = nir_shader_clone(ctx, nir);
4961 
4962                 NIR_PASS_V(nir, nir_shader_instructions_pass,
4963                            bifrost_nir_specialize_idvs,
4964                            nir_metadata_block_index | nir_metadata_dominance,
4965                            &idvs);
4966 
4967                 /* After specializing, clean up the mess */
4968                 bool progress = true;
4969 
4970                 while (progress) {
4971                         progress = false;
4972 
4973                         NIR_PASS(progress, nir, nir_opt_dce);
4974                         NIR_PASS(progress, nir, nir_opt_dead_cf);
4975                 }
4976         }
4977 
4978         /* We can only go out-of-SSA after specializing IDVS, as opt_dead_cf
4979          * doesn't know how to deal with nir_register.
4980          */
4981         NIR_PASS_V(nir, nir_convert_from_ssa, true);
4982 
4983         /* If nothing is pushed, all UBOs need to be uploaded */
4984         ctx->ubo_mask = ~0;
4985 
4986         list_inithead(&ctx->blocks);
4987 
4988         bool skip_internal = nir->info.internal;
4989         skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL);
4990 
4991         if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
4992                 nir_print_shader(nir, stdout);
4993         }
4994 
4995         ctx->allocated_vec = _mesa_hash_table_u64_create(ctx);
4996 
4997         nir_foreach_function(func, nir) {
4998                 if (!func->impl)
4999                         continue;
5000 
5001                 ctx->ssa_alloc += func->impl->ssa_alloc;
5002                 ctx->reg_alloc += func->impl->reg_alloc;
5003 
5004                 emit_cf_list(ctx, &func->impl->body);
5005                 break; /* TODO: Multi-function shaders */
5006         }
5007 
5008         /* Index blocks now that we're done emitting */
5009         bi_foreach_block(ctx, block) {
5010                 block->index = ctx->num_blocks++;
5011         }
5012 
5013         bi_validate(ctx, "NIR -> BIR");
5014 
5015         /* If the shader doesn't write any colour or depth outputs, it may
5016          * still need an ATEST at the very end! */
5017         bool need_dummy_atest =
5018                 (ctx->stage == MESA_SHADER_FRAGMENT) &&
5019                 !ctx->emitted_atest &&
5020                 !bi_skip_atest(ctx, false);
5021 
5022         if (need_dummy_atest) {
5023                 bi_block *end = list_last_entry(&ctx->blocks, bi_block, link);
5024                 bi_builder b = bi_init_builder(ctx, bi_after_block(end));
5025                 bi_emit_atest(&b, bi_zero());
5026         }
5027 
5028         bool optimize = !(bifrost_debug & BIFROST_DBG_NOOPT);
5029 
5030         /* Runs before constant folding */
5031         bi_lower_swizzle(ctx);
5032         bi_validate(ctx, "Early lowering");
5033 
5034         /* Runs before copy prop */
5035         if (optimize && !ctx->inputs->no_ubo_to_push) {
5036                 bi_opt_push_ubo(ctx);
5037         }
5038 
5039         if (likely(optimize)) {
5040                 bi_opt_copy_prop(ctx);
5041 
5042                 while (bi_opt_constant_fold(ctx))
5043                         bi_opt_copy_prop(ctx);
5044 
5045                 bi_opt_mod_prop_forward(ctx);
5046                 bi_opt_mod_prop_backward(ctx);
5047 
5048                 /* Push LD_VAR_IMM/VAR_TEX instructions. Must run after
5049                  * mod_prop_backward to fuse VAR_TEX */
5050                 if (ctx->arch == 7 && ctx->stage == MESA_SHADER_FRAGMENT &&
5051                     !(bifrost_debug & BIFROST_DBG_NOPRELOAD)) {
5052                         bi_opt_dead_code_eliminate(ctx);
5053                         bi_opt_message_preload(ctx);
5054                         bi_opt_copy_prop(ctx);
5055                 }
5056 
5057                 bi_opt_dead_code_eliminate(ctx);
5058                 bi_opt_cse(ctx);
5059                 bi_opt_dead_code_eliminate(ctx);
5060                 if (!ctx->inputs->no_ubo_to_push)
5061                         bi_opt_reorder_push(ctx);
5062                 bi_validate(ctx, "Optimization passes");
5063         }
5064 
5065         bi_foreach_instr_global(ctx, I) {
5066                 bi_lower_opt_instruction(I);
5067         }
5068 
5069         if (ctx->arch >= 9) {
5070                 va_optimize(ctx);
5071 
5072                 bi_foreach_instr_global_safe(ctx, I) {
5073                         va_lower_isel(I);
5074                         va_lower_constants(ctx, I);
5075 
5076                         bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
5077                         va_repair_fau(&b, I);
5078                 }
5079 
5080                 /* We need to clean up after constant lowering */
5081                 if (likely(optimize)) {
5082                         bi_opt_cse(ctx);
5083                         bi_opt_dead_code_eliminate(ctx);
5084                 }
5085 
5086                 bi_validate(ctx, "Valhall passes");
5087         }
5088 
5089         bi_foreach_block(ctx, block) {
5090                 bi_lower_branch(ctx, block);
5091         }
5092 
5093         if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
5094                 bi_print_shader(ctx, stdout);
5095 
5096         /* Analyze before register allocation to avoid false dependencies. The
5097          * skip bit is a function of only the data flow graph and is invariant
5098          * under valid scheduling. Helpers are only defined for fragment
5099          * shaders, so this analysis is only required in fragment shaders.
5100          */
5101         if (ctx->stage == MESA_SHADER_FRAGMENT)
5102                 bi_analyze_helper_requirements(ctx);
5103 
5104         /* Fuse TEXC after analyzing helper requirements so the analysis
5105          * doesn't have to know about dual textures */
5106         if (likely(optimize)) {
5107                 bi_opt_fuse_dual_texture(ctx);
5108         }
5109 
5110         /* Lower FAU after fusing dual texture, because fusing dual texture
5111          * creates new immediates that themselves may need lowering.
5112          */
5113         if (ctx->arch <= 8) {
5114                 bi_lower_fau(ctx);
5115         }
5116 
5117         /* Lowering FAU can create redundant moves. Run CSE+DCE to clean up. */
5118         if (likely(optimize)) {
5119                 bi_opt_cse(ctx);
5120                 bi_opt_dead_code_eliminate(ctx);
5121         }
5122 
5123         if (likely(!(bifrost_debug & BIFROST_DBG_NOPSCHED)))
5124                 bi_pressure_schedule(ctx);
5125 
5126         bi_validate(ctx, "Late lowering");
5127 
5128         bi_register_allocate(ctx);
5129 
5130         if (likely(optimize))
5131                 bi_opt_post_ra(ctx);
5132 
5133         if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
5134                 bi_print_shader(ctx, stdout);
5135 
5136         if (ctx->arch >= 9) {
5137                 va_assign_slots(ctx);
5138                 va_insert_flow_control_nops(ctx);
5139                 va_merge_flow(ctx);
5140                 va_mark_last(ctx);
5141         } else {
5142                 bi_schedule(ctx);
5143                 bi_assign_scoreboard(ctx);
5144 
5145                 /* Analyze after scheduling since we depend on instruction
5146                  * order. Valhall does this as part of va_insert_flow_control_nops,
5147                  * as the handling of clauses differs from that of instructions.
5148                  */
5149                 bi_analyze_helper_terminate(ctx);
5150                 bi_mark_clauses_td(ctx);
5151         }
5152 
5153         if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
5154                 bi_print_shader(ctx, stdout);
5155 
5156         if (ctx->arch <= 8) {
5157                 bi_pack_clauses(ctx, binary, offset);
5158         } else {
5159                 bi_pack_valhall(ctx, binary);
5160         }
5161 
5162         if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
5163                 if (ctx->arch <= 8) {
5164                         disassemble_bifrost(stdout, binary->data + offset,
5165                                             binary->size - offset,
5166                                             bifrost_debug & BIFROST_DBG_VERBOSE);
5167                 } else {
5168                         disassemble_valhall(stdout, binary->data + offset,
5169                                             binary->size - offset,
5170                                             bifrost_debug & BIFROST_DBG_VERBOSE);
5171                 }
5172 
5173                 fflush(stdout);
5174         }
5175 
5176         if ((bifrost_debug & BIFROST_DBG_SHADERDB || inputs->shaderdb) &&
5177             !skip_internal) {
5178                 if (ctx->arch >= 9) {
5179                         va_print_stats(ctx, binary->size - offset, stderr);
5180                 } else {
5181                         bi_print_stats(ctx, binary->size - offset, stderr);
5182                 }
5183         }
5184 
5185         return ctx;
5186 }
5187 
5188 static void
5189 bi_compile_variant(nir_shader *nir,
5190                    const struct panfrost_compile_inputs *inputs,
5191                    struct util_dynarray *binary,
5192                    struct hash_table_u64 *sysval_to_id,
5193                    struct pan_shader_info *info,
5194                    enum bi_idvs_mode idvs)
5195 {
5196         struct bi_shader_info local_info = {
5197                 .push = &info->push,
5198                 .bifrost = &info->bifrost,
5199                 .tls_size = info->tls_size,
5200                 .sysvals = &info->sysvals,
5201                 .push_offset = info->push.count
5202         };
5203 
5204         unsigned offset = binary->size;
5205 
5206         /* If there is no position shader (gl_Position is not written), then
5207          * there is no need to build a varying shader either. This case is hit
5208          * for transform feedback only vertex shaders which only make sense with
5209          * rasterizer discard.
5210          */
5211         if ((offset == 0) && (idvs == BI_IDVS_VARYING))
5212                 return;
5213 
5214         /* Software invariant: Only a secondary shader can appear at a nonzero
5215          * offset, to keep the ABI simple. */
5216         assert((offset == 0) ^ (idvs == BI_IDVS_VARYING));
5217 
5218         bi_context *ctx = bi_compile_variant_nir(nir, inputs, binary, sysval_to_id, local_info, idvs);
5219 
5220         /* A register is preloaded <==> it is live before the first block */
5221         bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
5222         uint64_t preload = first_block->reg_live_in;
5223 
5224         /* If multisampling is used with a blend shader, the blend shader needs
5225          * to access the sample coverage mask in r60 and the sample ID in r61.
5226          * Blend shaders run in the same context as fragment shaders, so if a
5227          * blend shader could run, we need to preload these registers
5228          * conservatively. There is believed to be little cost to doing so, so
5229          * we do it unconditionally to avoid variants of the preload descriptor.
5230          *
5231          * We only do this on Valhall, as Bifrost has to update the RSD for
5232          * multisampling w/ blend shader anyway, so this is handled in the
5233          * driver. We could unify the paths if the cost is acceptable.
5234          */
5235         if (nir->info.stage == MESA_SHADER_FRAGMENT && ctx->arch >= 9)
5236                 preload |= BITFIELD64_BIT(60) | BITFIELD64_BIT(61);
5237 
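        /* Fold this variant's results back into the combined shader info */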
5238         info->ubo_mask |= ctx->ubo_mask;
5239         info->tls_size = MAX2(info->tls_size, ctx->info.tls_size);
5240 
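        /* The secondary (varying) half of an IDVS pair records its own offset,
         * preload set and work register count; otherwise these fields describe
         * the primary shader. */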
5241         if (idvs == BI_IDVS_VARYING) {
5242                 info->vs.secondary_enable = (binary->size > offset);
5243                 info->vs.secondary_offset = offset;
5244                 info->vs.secondary_preload = preload;
5245                 info->vs.secondary_work_reg_count = ctx->info.work_reg_count;
5246         } else {
5247                 info->preload = preload;
5248                 info->work_reg_count = ctx->info.work_reg_count;
5249         }
5250 
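        /* For non-internal IDVS position shaders that write gl_PointSize, also
         * pack a copy with the point size store removed. The intent, inferred
         * from the code rather than stated here, is to let the driver pick the
         * no-psiz variant when point size is not consumed. */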
5251         if (idvs == BI_IDVS_POSITION &&
5252             !nir->info.internal &&
5253             nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ)) {
5254                 /* Find the psiz write */
5255                 bi_instr *write = NULL;
5256 
5257                 bi_foreach_instr_global(ctx, I) {
5258                         if (I->op == BI_OPCODE_STORE_I16 && I->seg == BI_SEG_POS) {
5259                                 write = I;
5260                                 break;
5261                         }
5262                 }
5263 
5264                 assert(write != NULL);
5265 
5266                 /* NOP it out, preserving its flow control. TODO: maybe DCE */
5267                 if (write->flow) {
5268                         bi_builder b = bi_init_builder(ctx, bi_before_instr(write));
5269                         bi_instr *nop = bi_nop(&b);
5270                         nop->flow = write->flow;
5271                 }
5272 
5273                 bi_remove_instruction(write);
5274 
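                /* Pack the no-psiz copy right after the binaries emitted so
                 * far; the driver locates it via no_psiz_offset. */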
5275                 info->vs.no_psiz_offset = binary->size;
5276                 bi_pack_valhall(ctx, binary);
5277         }
5278 
5279         ralloc_free(ctx);
5280 }
5281 
5282 /* Decide if Index-Driven Vertex Shading should be used for a given shader */
5283 static bool
5284 bi_should_idvs(nir_shader *nir, const struct panfrost_compile_inputs *inputs)
5285 {
5286         /* Opt-out */
5287         if (inputs->no_idvs || bifrost_debug & BIFROST_DBG_NOIDVS)
5288                 return false;
5289 
5290         /* IDVS splits up vertex shaders; it is not defined for other shader stages */
5291         if (nir->info.stage != MESA_SHADER_VERTEX)
5292                 return false;
5293 
5294         /* Bifrost cannot write gl_PointSize during IDVS */
5295         if ((inputs->gpu_id < 0x9000) &&
5296             nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ))
5297                 return false;
5298 
5299         /* Otherwise, IDVS is usually better */
5300         return true;
5301 }
5302 
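/* Compiler entry point: finalize the NIR, compile one or more variants into
 * the caller-provided binary, and fill out the pan_shader_info the driver
 * consumes.
 *
 * Minimal usage sketch (illustrative; the surrounding driver code is an
 * assumption, not taken from this file):
 *
 *    struct util_dynarray binary;
 *    util_dynarray_init(&binary, NULL);
 *
 *    struct pan_shader_info info = {0};
 *    bifrost_compile_shader_nir(nir, &inputs, &binary, &info);
 *
 *    upload(binary.data, binary.size);   // hypothetical driver upload
 *    util_dynarray_fini(&binary);
 */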
5303 void
5304 bifrost_compile_shader_nir(nir_shader *nir,
5305                            const struct panfrost_compile_inputs *inputs,
5306                            struct util_dynarray *binary,
5307                            struct pan_shader_info *info)
5308 {
5309         bifrost_debug = debug_get_option_bifrost_debug();
5310 
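        /* Run backend-specific NIR finalization, then build the sysval table
         * shared by every variant of this shader. */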
5311         bi_finalize_nir(nir, inputs->gpu_id, inputs->is_blend);
5312         struct hash_table_u64 *sysval_to_id =
5313                 panfrost_init_sysvals(&info->sysvals,
5314                                       inputs->fixed_sysval_layout,
5315                                       NULL);
5316 
5317         info->tls_size = nir->scratch_size;
5318         info->vs.idvs = bi_should_idvs(nir, inputs);
5319 
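        /* IDVS compiles the vertex shader twice: a position shader followed by
         * a varying shader, packed back to back in the same binary. */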
5320         if (info->vs.idvs) {
5321                 bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_POSITION);
5322                 bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_VARYING);
5323         } else {
5324                 bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_NONE);
5325         }
5326 
5327         if (gl_shader_stage_is_compute(nir->info.stage)) {
5328                 /* Workgroups may be merged if the structure of the workgroup is
5329                  * not software visible. This is true if neither shared memory
5330                  * nor barriers are used. The hardware may be able to optimize
5331                  * compute shaders that set this flag.
5332                  */
5333                 info->cs.allow_merging_workgroups =
5334                         (nir->info.shared_size == 0) &&
5335                         !nir->info.uses_control_barrier &&
5336                         !nir->info.uses_memory_barrier;
5337         }
5338 
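        /* Drop UBO mask bits beyond the shader's declared UBO count */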
5339         info->ubo_mask &= (1 << nir->info.num_ubos) - 1;
5340 
5341         _mesa_hash_table_u64_destroy(sysval_to_id);
5342 }
5343