1 /*
2 * Copyright (C) 2020 Collabora Ltd.
3 * Copyright (C) 2022 Alyssa Rosenzweig <alyssa@rosenzweig.io>
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 *
24 * Authors (Collabora):
25 * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
26 */
27
28 #include "compiler/glsl_types.h"
29 #include "compiler/glsl/glsl_to_nir.h"
30 #include "compiler/nir/nir_builder.h"
31 #include "util/u_debug.h"
32
33 #include "bifrost/disassemble.h"
34 #include "valhall/disassemble.h"
35 #include "valhall/va_compiler.h"
36 #include "bi_builder.h"
37 #include "bi_quirks.h"
38 #include "bifrost_compile.h"
39 #include "bifrost_nir.h"
40 #include "compiler.h"
41
42 /* clang-format off */
43 static const struct debug_named_value bifrost_debug_options[] = {
44 {"msgs", BIFROST_DBG_MSGS, "Print debug messages"},
45 {"shaders", BIFROST_DBG_SHADERS, "Dump shaders in NIR and MIR"},
46 {"shaderdb", BIFROST_DBG_SHADERDB, "Print statistics"},
47 {"verbose", BIFROST_DBG_VERBOSE, "Disassemble verbosely"},
48 {"internal", BIFROST_DBG_INTERNAL, "Dump even internal shaders"},
49 {"nosched", BIFROST_DBG_NOSCHED, "Force trivial bundling"},
50 {"nopsched", BIFROST_DBG_NOPSCHED, "Disable scheduling for pressure"},
51 {"inorder", BIFROST_DBG_INORDER, "Force in-order bundling"},
52 {"novalidate", BIFROST_DBG_NOVALIDATE, "Skip IR validation"},
53 {"noopt", BIFROST_DBG_NOOPT, "Skip optimization passes"},
54 {"noidvs", BIFROST_DBG_NOIDVS, "Disable IDVS"},
55 {"nosb", BIFROST_DBG_NOSB, "Disable scoreboarding"},
56 {"nopreload", BIFROST_DBG_NOPRELOAD, "Disable message preloading"},
57 {"spill", BIFROST_DBG_SPILL, "Test register spilling"},
58 DEBUG_NAMED_VALUE_END
59 };
60 /* clang-format on */
61
62 DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG",
63 bifrost_debug_options, 0)
64
65 /* How many bytes are prefetched by the Bifrost shader core. From the final
66 * clause of the shader, this range must be valid instructions or zero. */
67 #define BIFROST_SHADER_PREFETCH 128
68
69 int bifrost_debug = 0;
70
71 #define DBG(fmt, ...) \
72 do { \
73 if (bifrost_debug & BIFROST_DBG_MSGS) \
74 fprintf(stderr, "%s:%d: " fmt, __func__, __LINE__, ##__VA_ARGS__); \
75 } while (0)
76
77 static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);
78
79 static bi_index
bi_preload(bi_builder * b,unsigned reg)80 bi_preload(bi_builder *b, unsigned reg)
81 {
82 if (bi_is_null(b->shader->preloaded[reg])) {
83 /* Insert at the beginning of the shader */
84 bi_builder b_ = *b;
85 b_.cursor = bi_before_block(bi_start_block(&b->shader->blocks));
86
87 /* Cache the result */
88 b->shader->preloaded[reg] = bi_mov_i32(&b_, bi_register(reg));
89 }
90
91 return b->shader->preloaded[reg];
92 }
93
94 static bi_index
bi_coverage(bi_builder * b)95 bi_coverage(bi_builder *b)
96 {
97 if (bi_is_null(b->shader->coverage))
98 b->shader->coverage = bi_preload(b, 60);
99
100 return b->shader->coverage;
101 }
102
103 /*
104 * Vertex ID and Instance ID are preloaded registers. Where they are preloaded
105 * changed from Bifrost to Valhall. Provide helpers that smooth over the
106 * architectural difference.
107 */
108 static inline bi_index
bi_vertex_id(bi_builder * b)109 bi_vertex_id(bi_builder *b)
110 {
111 return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61);
112 }
113
114 static inline bi_index
bi_instance_id(bi_builder * b)115 bi_instance_id(bi_builder *b)
116 {
117 return bi_preload(b, (b->shader->arch >= 9) ? 61 : 62);
118 }
119
120 static void
bi_emit_jump(bi_builder * b,nir_jump_instr * instr)121 bi_emit_jump(bi_builder *b, nir_jump_instr *instr)
122 {
123 bi_instr *branch = bi_jump(b, bi_zero());
124
125 switch (instr->type) {
126 case nir_jump_break:
127 branch->branch_target = b->shader->break_block;
128 break;
129 case nir_jump_continue:
130 branch->branch_target = b->shader->continue_block;
131 break;
132 default:
133 unreachable("Unhandled jump type");
134 }
135
136 bi_block_add_successor(b->shader->current_block, branch->branch_target);
137 b->shader->current_block->unconditional_jumps = true;
138 }
139
140 /* Builds a 64-bit hash table key for an index */
141 static uint64_t
bi_index_to_key(bi_index idx)142 bi_index_to_key(bi_index idx)
143 {
144 static_assert(sizeof(idx) <= sizeof(uint64_t), "too much padding");
145
146 uint64_t key = 0;
147 memcpy(&key, &idx, sizeof(idx));
148 return key;
149 }
150
151 /*
152 * Extract a single channel out of a vector source. We split vectors with SPLIT
153 * so we can use the split components directly, without emitting an extract.
154 * This has advantages of RA, as the split can usually be optimized away.
155 */
156 static bi_index
bi_extract(bi_builder * b,bi_index vec,unsigned channel)157 bi_extract(bi_builder *b, bi_index vec, unsigned channel)
158 {
159 bi_index *components = _mesa_hash_table_u64_search(b->shader->allocated_vec,
160 bi_index_to_key(vec));
161
162 /* No extract needed for scalars.
163 *
164 * This is a bit imprecise, but actual bugs (missing splits for vectors)
165 * should be caught by the following assertion. It is too difficult to
166 * ensure bi_extract is only called for real vectors.
167 */
168 if (components == NULL && channel == 0)
169 return vec;
170
171 assert(components != NULL && "missing bi_cache_collect()");
172 return components[channel];
173 }
174
175 static void
bi_cache_collect(bi_builder * b,bi_index dst,bi_index * s,unsigned n)176 bi_cache_collect(bi_builder *b, bi_index dst, bi_index *s, unsigned n)
177 {
178 /* Lifetime of a hash table entry has to be at least as long as the table */
179 bi_index *channels = ralloc_array(b->shader, bi_index, n);
180 memcpy(channels, s, sizeof(bi_index) * n);
181
182 _mesa_hash_table_u64_insert(b->shader->allocated_vec, bi_index_to_key(dst),
183 channels);
184 }
185
186 /*
187 * Splits an n-component vector (vec) into n scalar destinations (dests) using a
188 * split pseudo-instruction.
189 *
190 * Pre-condition: dests is filled with bi_null().
191 */
192 static void
bi_emit_split_i32(bi_builder * b,bi_index dests[4],bi_index vec,unsigned n)193 bi_emit_split_i32(bi_builder *b, bi_index dests[4], bi_index vec, unsigned n)
194 {
195 /* Setup the destinations */
196 for (unsigned i = 0; i < n; ++i) {
197 dests[i] = bi_temp(b->shader);
198 }
199
200 /* Emit the split */
201 if (n == 1) {
202 bi_mov_i32_to(b, dests[0], vec);
203 } else {
204 bi_instr *I = bi_split_i32_to(b, n, vec);
205
206 bi_foreach_dest(I, j)
207 I->dest[j] = dests[j];
208 }
209 }
210
211 static void
bi_emit_cached_split_i32(bi_builder * b,bi_index vec,unsigned n)212 bi_emit_cached_split_i32(bi_builder *b, bi_index vec, unsigned n)
213 {
214 bi_index dests[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
215 bi_emit_split_i32(b, dests, vec, n);
216 bi_cache_collect(b, vec, dests, n);
217 }
218
219 /*
220 * Emit and cache a split for a vector of a given bitsize. The vector may not be
221 * composed of 32-bit words, but it will be split at 32-bit word boundaries.
222 */
223 static void
bi_emit_cached_split(bi_builder * b,bi_index vec,unsigned bits)224 bi_emit_cached_split(bi_builder *b, bi_index vec, unsigned bits)
225 {
226 bi_emit_cached_split_i32(b, vec, DIV_ROUND_UP(bits, 32));
227 }
228
229 static void
bi_split_def(bi_builder * b,nir_def * def)230 bi_split_def(bi_builder *b, nir_def *def)
231 {
232 bi_emit_cached_split(b, bi_def_index(def),
233 def->bit_size * def->num_components);
234 }
235
236 static bi_instr *
bi_emit_collect_to(bi_builder * b,bi_index dst,bi_index * chan,unsigned n)237 bi_emit_collect_to(bi_builder *b, bi_index dst, bi_index *chan, unsigned n)
238 {
239 /* Special case: COLLECT of a single value is a scalar move */
240 if (n == 1)
241 return bi_mov_i32_to(b, dst, chan[0]);
242
243 bi_instr *I = bi_collect_i32_to(b, dst, n);
244
245 bi_foreach_src(I, i)
246 I->src[i] = chan[i];
247
248 bi_cache_collect(b, dst, chan, n);
249 return I;
250 }
251
252 static bi_instr *
bi_collect_v2i32_to(bi_builder * b,bi_index dst,bi_index s0,bi_index s1)253 bi_collect_v2i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1)
254 {
255 return bi_emit_collect_to(b, dst, (bi_index[]){s0, s1}, 2);
256 }
257
258 static bi_instr *
bi_collect_v3i32_to(bi_builder * b,bi_index dst,bi_index s0,bi_index s1,bi_index s2)259 bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1,
260 bi_index s2)
261 {
262 return bi_emit_collect_to(b, dst, (bi_index[]){s0, s1, s2}, 3);
263 }
264
265 static bi_index
bi_collect_v2i32(bi_builder * b,bi_index s0,bi_index s1)266 bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1)
267 {
268 bi_index dst = bi_temp(b->shader);
269 bi_collect_v2i32_to(b, dst, s0, s1);
270 return dst;
271 }
272
273 static bi_index
bi_varying_src0_for_barycentric(bi_builder * b,nir_intrinsic_instr * intr)274 bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
275 {
276 switch (intr->intrinsic) {
277 case nir_intrinsic_load_barycentric_centroid:
278 case nir_intrinsic_load_barycentric_sample:
279 return bi_preload(b, 61);
280
281 /* Need to put the sample ID in the top 16-bits */
282 case nir_intrinsic_load_barycentric_at_sample:
283 return bi_mkvec_v2i16(b, bi_half(bi_dontcare(b), false),
284 bi_half(bi_src_index(&intr->src[0]), false));
285
286 /* Interpret as 8:8 signed fixed point positions in pixels along X and
287 * Y axes respectively, relative to top-left of pixel. In NIR, (0, 0)
288 * is the center of the pixel so we first fixup and then convert. For
289 * fp16 input:
290 *
291 * f2i16(((x, y) + (0.5, 0.5)) * 2**8) =
292 * f2i16((256 * (x, y)) + (128, 128)) =
293 * V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128))
294 *
295 * For fp32 input, that lacks enough precision for MSAA 16x, but the
296 * idea is the same. FIXME: still doesn't pass
297 */
298 case nir_intrinsic_load_barycentric_at_offset: {
299 bi_index offset = bi_src_index(&intr->src[0]);
300 bi_index f16 = bi_null();
301 unsigned sz = nir_src_bit_size(intr->src[0]);
302
303 if (sz == 16) {
304 f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0), bi_imm_f16(128.0));
305 } else {
306 assert(sz == 32);
307 bi_index f[2];
308 for (unsigned i = 0; i < 2; ++i) {
309 f[i] =
310 bi_fadd_rscale_f32(b, bi_extract(b, offset, i), bi_imm_f32(0.5),
311 bi_imm_u32(8), BI_SPECIAL_NONE);
312 }
313
314 f16 = bi_v2f32_to_v2f16(b, f[0], f[1]);
315 }
316
317 return bi_v2f16_to_v2s16(b, f16);
318 }
319
320 case nir_intrinsic_load_barycentric_pixel:
321 default:
322 return b->shader->arch >= 9 ? bi_preload(b, 61) : bi_dontcare(b);
323 }
324 }
325
326 static enum bi_sample
bi_interp_for_intrinsic(nir_intrinsic_op op)327 bi_interp_for_intrinsic(nir_intrinsic_op op)
328 {
329 switch (op) {
330 case nir_intrinsic_load_barycentric_centroid:
331 return BI_SAMPLE_CENTROID;
332 case nir_intrinsic_load_barycentric_sample:
333 case nir_intrinsic_load_barycentric_at_sample:
334 return BI_SAMPLE_SAMPLE;
335 case nir_intrinsic_load_barycentric_at_offset:
336 return BI_SAMPLE_EXPLICIT;
337 case nir_intrinsic_load_barycentric_pixel:
338 default:
339 return BI_SAMPLE_CENTER;
340 }
341 }
342
343 /* auto, 64-bit omitted */
344 static enum bi_register_format
bi_reg_fmt_for_nir(nir_alu_type T)345 bi_reg_fmt_for_nir(nir_alu_type T)
346 {
347 switch (T) {
348 case nir_type_float16:
349 return BI_REGISTER_FORMAT_F16;
350 case nir_type_float32:
351 return BI_REGISTER_FORMAT_F32;
352 case nir_type_int16:
353 return BI_REGISTER_FORMAT_S16;
354 case nir_type_uint16:
355 return BI_REGISTER_FORMAT_U16;
356 case nir_type_int32:
357 return BI_REGISTER_FORMAT_S32;
358 case nir_type_uint32:
359 return BI_REGISTER_FORMAT_U32;
360 default:
361 unreachable("Invalid type for register format");
362 }
363 }
364
365 /* Checks if the _IMM variant of an intrinsic can be used, returning in imm the
366 * immediate to be used (which applies even if _IMM can't be used) */
367
368 static bool
bi_is_intr_immediate(nir_intrinsic_instr * instr,unsigned * immediate,unsigned max)369 bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate,
370 unsigned max)
371 {
372 nir_src *offset = nir_get_io_offset_src(instr);
373
374 if (!nir_src_is_const(*offset))
375 return false;
376
377 *immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
378 return (*immediate) < max;
379 }
380
381 static void bi_make_vec_to(bi_builder *b, bi_index final_dst, bi_index *src,
382 unsigned *channel, unsigned count, unsigned bitsize);
383
384 /* Bifrost's load instructions lack a component offset despite operating in
385 * terms of vec4 slots. Usually I/O vectorization avoids nonzero components,
386 * but they may be unavoidable with separate shaders in use. To solve this, we
387 * lower to a larger load and an explicit copy of the desired components. */
388
389 static void
bi_copy_component(bi_builder * b,nir_intrinsic_instr * instr,bi_index tmp)390 bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp)
391 {
392 unsigned component = nir_intrinsic_component(instr);
393 unsigned nr = instr->num_components;
394 unsigned total = nr + component;
395 unsigned bitsize = instr->def.bit_size;
396
397 assert(total <= 4 && "should be vec4");
398 bi_emit_cached_split(b, tmp, total * bitsize);
399
400 if (component == 0)
401 return;
402
403 bi_index srcs[] = {tmp, tmp, tmp};
404 unsigned channels[] = {component, component + 1, component + 2};
405
406 bi_make_vec_to(b, bi_def_index(&instr->def), srcs, channels, nr,
407 instr->def.bit_size);
408 }
409
410 static void
bi_emit_load_attr(bi_builder * b,nir_intrinsic_instr * instr)411 bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
412 {
413 /* Disregard the signedness of an integer, since loading 32-bits into a
414 * 32-bit register should be bit exact so should not incur any clamping.
415 *
416 * If we are reading as a u32, then it must be paired with an integer (u32 or
417 * s32) source, so use .auto32 to disregard.
418 */
419 nir_alu_type T = nir_intrinsic_dest_type(instr);
420 assert(T == nir_type_uint32 || T == nir_type_int32 || T == nir_type_float32);
421 enum bi_register_format regfmt =
422 T == nir_type_float32 ? BI_REGISTER_FORMAT_F32 : BI_REGISTER_FORMAT_AUTO;
423
424 nir_src *offset = nir_get_io_offset_src(instr);
425 unsigned component = nir_intrinsic_component(instr);
426 enum bi_vecsize vecsize = (instr->num_components + component - 1);
427 unsigned imm_index = 0;
428 unsigned base = nir_intrinsic_base(instr);
429 bool constant = nir_src_is_const(*offset);
430 bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
431 bi_index dest =
432 (component == 0) ? bi_def_index(&instr->def) : bi_temp(b->shader);
433 bi_instr *I;
434
435 if (immediate) {
436 I = bi_ld_attr_imm_to(b, dest, bi_vertex_id(b), bi_instance_id(b), regfmt,
437 vecsize, imm_index);
438 } else {
439 bi_index idx = bi_src_index(&instr->src[0]);
440
441 if (constant)
442 idx = bi_imm_u32(imm_index);
443 else if (base != 0)
444 idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
445
446 I = bi_ld_attr_to(b, dest, bi_vertex_id(b), bi_instance_id(b), idx,
447 regfmt, vecsize);
448 }
449
450 if (b->shader->arch >= 9)
451 I->table = PAN_TABLE_ATTRIBUTE;
452
453 bi_copy_component(b, instr, dest);
454 }
455
456 /*
457 * ABI: Special (desktop GL) slots come first, tightly packed. General varyings
458 * come later, sparsely packed. This handles both linked and separable shaders
459 * with a common code path, with minimal keying only for desktop GL. Each slot
460 * consumes 16 bytes (TODO: fp16, partial vectors).
461 */
462 static unsigned
bi_varying_base_bytes(bi_context * ctx,nir_intrinsic_instr * intr)463 bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr)
464 {
465 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
466 uint32_t mask = ctx->inputs->fixed_varying_mask;
467
468 if (sem.location >= VARYING_SLOT_VAR0) {
469 unsigned nr_special = util_bitcount(mask);
470 unsigned general_index = (sem.location - VARYING_SLOT_VAR0);
471
472 return 16 * (nr_special + general_index);
473 } else {
474 return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location)));
475 }
476 }
477
478 /*
479 * Compute the offset in bytes of a varying with an immediate offset, adding the
480 * offset to the base computed above. Convenience method.
481 */
482 static unsigned
bi_varying_offset(bi_context * ctx,nir_intrinsic_instr * intr)483 bi_varying_offset(bi_context *ctx, nir_intrinsic_instr *intr)
484 {
485 nir_src *src = nir_get_io_offset_src(intr);
486 assert(nir_src_is_const(*src) && "assumes immediate offset");
487
488 return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16);
489 }
490
491 static void
bi_emit_load_vary(bi_builder * b,nir_intrinsic_instr * instr)492 bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
493 {
494 enum bi_sample sample = BI_SAMPLE_CENTER;
495 enum bi_update update = BI_UPDATE_STORE;
496 enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
497 bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input;
498 bi_index src0 = bi_null();
499
500 unsigned component = nir_intrinsic_component(instr);
501 enum bi_vecsize vecsize = (instr->num_components + component - 1);
502 bi_index dest =
503 (component == 0) ? bi_def_index(&instr->def) : bi_temp(b->shader);
504
505 unsigned sz = instr->def.bit_size;
506
507 if (smooth) {
508 nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
509 assert(parent);
510
511 sample = bi_interp_for_intrinsic(parent->intrinsic);
512 src0 = bi_varying_src0_for_barycentric(b, parent);
513
514 assert(sz == 16 || sz == 32);
515 regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16 : BI_REGISTER_FORMAT_F32;
516 } else {
517 assert(sz == 32);
518 regfmt = BI_REGISTER_FORMAT_U32;
519
520 /* Valhall can't have bi_null() here, although the source is
521 * logically unused for flat varyings
522 */
523 if (b->shader->arch >= 9)
524 src0 = bi_preload(b, 61);
525
526 /* Gather info as we go */
527 b->shader->info.bifrost->uses_flat_shading = true;
528 }
529
530 enum bi_source_format source_format =
531 smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32;
532
533 nir_src *offset = nir_get_io_offset_src(instr);
534 unsigned imm_index = 0;
535 bool immediate = bi_is_intr_immediate(instr, &imm_index, 20);
536 bi_instr *I = NULL;
537
538 if (b->shader->malloc_idvs && immediate) {
539 /* Immediate index given in bytes. */
540 bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
541 update, vecsize,
542 bi_varying_offset(b->shader, instr));
543 } else if (immediate && smooth) {
544 I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, vecsize,
545 imm_index);
546 } else if (immediate && !smooth) {
547 I = bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, vecsize,
548 imm_index);
549 } else {
550 bi_index idx = bi_src_index(offset);
551 unsigned base = nir_intrinsic_base(instr);
552
553 if (b->shader->malloc_idvs) {
554 /* Index needs to be in bytes, but NIR gives the index
555 * in slots. For now assume 16 bytes per element.
556 */
557 bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
558 unsigned vbase = bi_varying_base_bytes(b->shader, instr);
559
560 if (vbase != 0)
561 idx_bytes = bi_iadd_u32(b, idx, bi_imm_u32(vbase), false);
562
563 bi_ld_var_buf_to(b, sz, dest, src0, idx_bytes, regfmt, sample,
564 source_format, update, vecsize);
565 } else if (smooth) {
566 if (base != 0)
567 idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
568
569 I = bi_ld_var_to(b, dest, src0, idx, regfmt, sample, update, vecsize);
570 } else {
571 if (base != 0)
572 idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
573
574 I = bi_ld_var_flat_to(b, dest, idx, BI_FUNCTION_NONE, regfmt, vecsize);
575 }
576 }
577
578 /* Valhall usually uses machine-allocated IDVS. If this is disabled, use
579 * a simple Midgard-style ABI.
580 */
581 if (b->shader->arch >= 9 && I != NULL)
582 I->table = PAN_TABLE_ATTRIBUTE;
583
584 bi_copy_component(b, instr, dest);
585 }
586
587 static bi_index
bi_make_vec8_helper(bi_builder * b,bi_index * src,unsigned * channel,unsigned count)588 bi_make_vec8_helper(bi_builder *b, bi_index *src, unsigned *channel,
589 unsigned count)
590 {
591 assert(1 <= count && count <= 4);
592
593 bi_index bytes[4] = {bi_imm_u8(0), bi_imm_u8(0), bi_imm_u8(0), bi_imm_u8(0)};
594
595 for (unsigned i = 0; i < count; ++i) {
596 unsigned chan = channel ? channel[i] : 0;
597
598 bytes[i] = bi_byte(bi_extract(b, src[i], chan >> 2), chan & 3);
599 }
600
601 if (b->shader->arch >= 9) {
602 bi_index vec = bi_zero();
603
604 if (count >= 3)
605 vec = bi_mkvec_v2i8(b, bytes[2], bytes[3], vec);
606
607 return bi_mkvec_v2i8(b, bytes[0], bytes[1], vec);
608 } else {
609 return bi_mkvec_v4i8(b, bytes[0], bytes[1], bytes[2], bytes[3]);
610 }
611 }
612
613 static bi_index
bi_make_vec16_helper(bi_builder * b,bi_index * src,unsigned * channel,unsigned count)614 bi_make_vec16_helper(bi_builder *b, bi_index *src, unsigned *channel,
615 unsigned count)
616 {
617 unsigned chan0 = channel ? channel[0] : 0;
618 bi_index w0 = bi_extract(b, src[0], chan0 >> 1);
619 bi_index h0 = bi_half(w0, chan0 & 1);
620
621 /* Zero extend */
622 if (count == 1)
623 return bi_mkvec_v2i16(b, h0, bi_imm_u16(0));
624
625 /* Else, create a vector */
626 assert(count == 2);
627
628 unsigned chan1 = channel ? channel[1] : 0;
629 bi_index w1 = bi_extract(b, src[1], chan1 >> 1);
630 bi_index h1 = bi_half(w1, chan1 & 1);
631
632 if (bi_is_word_equiv(w0, w1) && (chan0 & 1) == 0 && ((chan1 & 1) == 1))
633 return bi_mov_i32(b, w0);
634 else if (bi_is_word_equiv(w0, w1))
635 return bi_swz_v2i16(b, bi_swz_16(w0, chan0 & 1, chan1 & 1));
636 else
637 return bi_mkvec_v2i16(b, h0, h1);
638 }
639
640 static void
bi_make_vec_to(bi_builder * b,bi_index dst,bi_index * src,unsigned * channel,unsigned count,unsigned bitsize)641 bi_make_vec_to(bi_builder *b, bi_index dst, bi_index *src, unsigned *channel,
642 unsigned count, unsigned bitsize)
643 {
644 assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
645 unsigned shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
646 unsigned chan_per_word = 1 << shift;
647
648 assert(DIV_ROUND_UP(count * bitsize, 32) <= BI_MAX_SRCS &&
649 "unnecessarily large vector should have been lowered");
650
651 bi_index srcs[BI_MAX_VEC];
652
653 for (unsigned i = 0; i < count; i += chan_per_word) {
654 unsigned rem = MIN2(count - i, chan_per_word);
655 unsigned *channel_offset = channel ? (channel + i) : NULL;
656
657 if (bitsize == 32)
658 srcs[i] = bi_extract(b, src[i], channel_offset ? *channel_offset : 0);
659 else if (bitsize == 16)
660 srcs[i >> 1] = bi_make_vec16_helper(b, src + i, channel_offset, rem);
661 else
662 srcs[i >> 2] = bi_make_vec8_helper(b, src + i, channel_offset, rem);
663 }
664
665 bi_emit_collect_to(b, dst, srcs, DIV_ROUND_UP(count, chan_per_word));
666 }
667
668 static inline bi_instr *
bi_load_ubo_to(bi_builder * b,unsigned bitsize,bi_index dest0,bi_index src0,bi_index src1)669 bi_load_ubo_to(bi_builder *b, unsigned bitsize, bi_index dest0, bi_index src0,
670 bi_index src1)
671 {
672 bi_instr *I;
673
674 if (b->shader->arch >= 9) {
675 I = bi_ld_buffer_to(b, bitsize, dest0, src0, src1);
676 I->seg = BI_SEG_UBO;
677 } else {
678 I = bi_load_to(b, bitsize, dest0, src0, src1, BI_SEG_UBO, 0);
679 }
680
681 bi_emit_cached_split(b, dest0, bitsize);
682 return I;
683 }
684
685 static void
bi_load_sample_id_to(bi_builder * b,bi_index dst)686 bi_load_sample_id_to(bi_builder *b, bi_index dst)
687 {
688 /* r61[16:23] contains the sampleID, mask it out. Upper bits
689 * seem to read garbage (despite being architecturally defined
690 * as zero), so use a 5-bit mask instead of 8-bits */
691
692 bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f),
693 bi_imm_u8(16), false);
694 }
695
696 static bi_index
bi_load_sample_id(bi_builder * b)697 bi_load_sample_id(bi_builder *b)
698 {
699 bi_index sample_id = bi_temp(b->shader);
700 bi_load_sample_id_to(b, sample_id);
701 return sample_id;
702 }
703
704 static bi_index
bi_pixel_indices(bi_builder * b,unsigned rt)705 bi_pixel_indices(bi_builder *b, unsigned rt)
706 {
707 /* We want to load the current pixel. */
708 struct bifrost_pixel_indices pix = {.y = BIFROST_CURRENT_PIXEL, .rt = rt};
709
710 uint32_t indices_u32 = 0;
711 memcpy(&indices_u32, &pix, sizeof(indices_u32));
712 bi_index indices = bi_imm_u32(indices_u32);
713
714 /* Sample index above is left as zero. For multisampling, we need to
715 * fill in the actual sample ID in the lower byte */
716
717 if (b->shader->inputs->blend.nr_samples > 1)
718 indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false);
719
720 return indices;
721 }
722
723 /* Source color is passed through r0-r3, or r4-r7 for the second source when
724 * dual-source blending. Preload the corresponding vector.
725 */
726 static void
bi_emit_load_blend_input(bi_builder * b,nir_intrinsic_instr * instr)727 bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr)
728 {
729 nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
730 unsigned base = (sem.location == VARYING_SLOT_VAR0) ? 4 : 0;
731 unsigned size = nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr));
732 assert(size == 16 || size == 32);
733
734 bi_index srcs[] = {bi_preload(b, base + 0), bi_preload(b, base + 1),
735 bi_preload(b, base + 2), bi_preload(b, base + 3)};
736
737 bi_emit_collect_to(b, bi_def_index(&instr->def), srcs, size == 32 ? 4 : 2);
738 }
739
740 static void
bi_emit_blend_op(bi_builder * b,bi_index rgba,nir_alu_type T,bi_index rgba2,nir_alu_type T2,unsigned rt)741 bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, bi_index rgba2,
742 nir_alu_type T2, unsigned rt)
743 {
744 /* Reads 2 or 4 staging registers to cover the input */
745 unsigned size = nir_alu_type_get_type_size(T);
746 unsigned size_2 = nir_alu_type_get_type_size(T2);
747 unsigned sr_count = (size <= 16) ? 2 : 4;
748 unsigned sr_count_2 = (size_2 <= 16) ? 2 : 4;
749 const struct panfrost_compile_inputs *inputs = b->shader->inputs;
750 uint64_t blend_desc = inputs->blend.bifrost_blend_desc;
751 enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
752
753 /* Workaround for NIR-to-TGSI */
754 if (b->shader->nir->info.fs.untyped_color_outputs)
755 regfmt = BI_REGISTER_FORMAT_AUTO;
756
757 if (inputs->is_blend && inputs->blend.nr_samples > 1) {
758 /* Conversion descriptor comes from the compile inputs, pixel
759 * indices derived at run time based on sample ID */
760 bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_coverage(b),
761 bi_imm_u32(blend_desc >> 32), regfmt, BI_VECSIZE_V4);
762 } else if (b->shader->inputs->is_blend) {
763 uint64_t blend_desc = b->shader->inputs->blend.bifrost_blend_desc;
764
765 /* Blend descriptor comes from the compile inputs */
766 /* Put the result in r0 */
767
768 bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b),
769 bi_imm_u32(blend_desc), bi_imm_u32(blend_desc >> 32),
770 bi_null(), regfmt, sr_count, 0);
771 } else {
772 /* Blend descriptor comes from the FAU RAM. By convention, the
773 * return address on Bifrost is stored in r48 and will be used
774 * by the blend shader to jump back to the fragment shader */
775
776 bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b),
777 bi_fau(BIR_FAU_BLEND_0 + rt, false),
778 bi_fau(BIR_FAU_BLEND_0 + rt, true), rgba2, regfmt, sr_count,
779 sr_count_2);
780 }
781
782 assert(rt < 8);
783 b->shader->info.bifrost->blend[rt].type = T;
784
785 if (T2)
786 b->shader->info.bifrost->blend_src1_type = T2;
787 }
788
789 /* Blend shaders do not need to run ATEST since they are dependent on a
790 * fragment shader that runs it. Blit shaders may not need to run ATEST, since
791 * ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and
792 * there are no writes to the coverage mask. The latter two are satisfied for
793 * all blit shaders, so we just care about early-z, which blit shaders force
794 * iff they do not write depth or stencil */
795
796 static bool
bi_skip_atest(bi_context * ctx,bool emit_zs)797 bi_skip_atest(bi_context *ctx, bool emit_zs)
798 {
799 return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend;
800 }
801
802 static void
bi_emit_atest(bi_builder * b,bi_index alpha)803 bi_emit_atest(bi_builder *b, bi_index alpha)
804 {
805 b->shader->coverage =
806 bi_atest(b, bi_coverage(b), alpha, bi_fau(BIR_FAU_ATEST_PARAM, false));
807 b->shader->emitted_atest = true;
808 }
809
810 static void
bi_emit_fragment_out(bi_builder * b,nir_intrinsic_instr * instr)811 bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)
812 {
813 bool combined = instr->intrinsic == nir_intrinsic_store_combined_output_pan;
814
815 unsigned writeout =
816 combined ? nir_intrinsic_component(instr) : PAN_WRITEOUT_C;
817
818 bool emit_blend = writeout & (PAN_WRITEOUT_C);
819 bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S);
820
821 unsigned loc = nir_intrinsic_io_semantics(instr).location;
822 bi_index src0 = bi_src_index(&instr->src[0]);
823
824 /* By ISA convention, the coverage mask is stored in R60. The store
825 * itself will be handled by a subsequent ATEST instruction */
826 if (loc == FRAG_RESULT_SAMPLE_MASK) {
827 b->shader->coverage = bi_extract(b, src0, 0);
828 return;
829 }
830
831 /* Emit ATEST if we have to, note ATEST requires a floating-point alpha
832 * value, but render target #0 might not be floating point. However the
833 * alpha value is only used for alpha-to-coverage, a stage which is
834 * skipped for pure integer framebuffers, so the issue is moot. */
835
836 if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) {
837 nir_alu_type T = nir_intrinsic_src_type(instr);
838
839 bi_index rgba = bi_src_index(&instr->src[0]);
840 bi_index alpha = (T == nir_type_float16)
841 ? bi_half(bi_extract(b, rgba, 1), true)
842 : (T == nir_type_float32) ? bi_extract(b, rgba, 3)
843 : bi_dontcare(b);
844
845 /* Don't read out-of-bounds */
846 if (nir_src_num_components(instr->src[0]) < 4)
847 alpha = bi_imm_f32(1.0);
848
849 bi_emit_atest(b, alpha);
850 }
851
852 if (emit_zs) {
853 bi_index z = bi_dontcare(b), s = bi_dontcare(b);
854
855 if (writeout & PAN_WRITEOUT_Z)
856 z = bi_src_index(&instr->src[2]);
857
858 if (writeout & PAN_WRITEOUT_S)
859 s = bi_src_index(&instr->src[3]);
860
861 b->shader->coverage =
862 bi_zs_emit(b, z, s, bi_coverage(b), writeout & PAN_WRITEOUT_S,
863 writeout & PAN_WRITEOUT_Z);
864 }
865
866 if (emit_blend) {
867 unsigned rt = loc ? (loc - FRAG_RESULT_DATA0) : 0;
868 bool dual = (writeout & PAN_WRITEOUT_2);
869 bi_index color = bi_src_index(&instr->src[0]);
870 bi_index color2 = dual ? bi_src_index(&instr->src[4]) : bi_null();
871 nir_alu_type T2 = dual ? nir_intrinsic_dest_type(instr) : 0;
872
873 /* Explicit copy since BLEND inputs are precoloured to R0-R3,
874 * TODO: maybe schedule around this or implement in RA as a
875 * spill */
876 bool has_mrt =
877 (b->shader->nir->info.outputs_written >> FRAG_RESULT_DATA1);
878
879 if (has_mrt) {
880 bi_index srcs[4] = {color, color, color, color};
881 unsigned channels[4] = {0, 1, 2, 3};
882 color = bi_temp(b->shader);
883 bi_make_vec_to(
884 b, color, srcs, channels, nir_src_num_components(instr->src[0]),
885 nir_alu_type_get_type_size(nir_intrinsic_src_type(instr)));
886 }
887
888 bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr), color2, T2, rt);
889 }
890
891 if (b->shader->inputs->is_blend) {
892 /* Jump back to the fragment shader, return address is stored
893 * in r48 (see above). On Valhall, only jump if the address is
894 * nonzero. The check is free there and it implements the "jump
895 * to 0 terminates the blend shader" that's automatic on
896 * Bifrost.
897 */
898 if (b->shader->arch >= 8)
899 bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE);
900 else
901 bi_jump(b, bi_preload(b, 48));
902 }
903 }
904
905 /**
906 * In a vertex shader, is the specified variable a position output? These kinds
907 * of outputs are written from position shaders when IDVS is enabled. All other
908 * outputs are written from the varying shader.
909 */
910 static bool
bi_should_remove_store(nir_intrinsic_instr * intr,enum bi_idvs_mode idvs)911 bi_should_remove_store(nir_intrinsic_instr *intr, enum bi_idvs_mode idvs)
912 {
913 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
914
915 switch (sem.location) {
916 case VARYING_SLOT_POS:
917 case VARYING_SLOT_PSIZ:
918 return idvs == BI_IDVS_VARYING;
919 default:
920 return idvs == BI_IDVS_POSITION;
921 }
922 }
923
924 static bool
bifrost_nir_specialize_idvs(nir_builder * b,nir_instr * instr,void * data)925 bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data)
926 {
927 enum bi_idvs_mode *idvs = data;
928
929 if (instr->type != nir_instr_type_intrinsic)
930 return false;
931
932 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
933
934 if (intr->intrinsic != nir_intrinsic_store_output)
935 return false;
936
937 if (bi_should_remove_store(intr, *idvs)) {
938 nir_instr_remove(instr);
939 return true;
940 }
941
942 return false;
943 }
944
945 static void
bi_emit_store_vary(bi_builder * b,nir_intrinsic_instr * instr)946 bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
947 {
948 /* In principle we can do better for 16-bit. At the moment we require
949 * 32-bit to permit the use of .auto, in order to force .u32 for flat
950 * varyings, to handle internal TGSI shaders that set flat in the VS
951 * but smooth in the FS */
952
953 ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr);
954 ASSERTED unsigned T_size = nir_alu_type_get_type_size(T);
955 assert(T_size == 32 || (b->shader->arch >= 9 && T_size == 16));
956 enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
957
958 unsigned imm_index = 0;
959 bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
960
961 /* Only look at the total components needed. In effect, we fill in all
962 * the intermediate "holes" in the write mask, since we can't mask off
963 * stores. Since nir_lower_io_to_temporaries ensures each varying is
964 * written at most once, anything that's masked out is undefined, so it
965 * doesn't matter what we write there. So we may as well do the
966 * simplest thing possible. */
967 unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr));
968 assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0));
969
970 bi_index data = bi_src_index(&instr->src[0]);
971
972 /* To keep the vector dimensions consistent, we need to drop some
973 * components. This should be coalesced.
974 *
975 * TODO: This is ugly and maybe inefficient. Would we rather
976 * introduce a TRIM.i32 pseudoinstruction?
977 */
978 if (nr < nir_intrinsic_src_components(instr, 0)) {
979 assert(T_size == 32 && "todo: 16-bit trim");
980
981 bi_index chans[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
982 unsigned src_comps = nir_intrinsic_src_components(instr, 0);
983
984 bi_emit_split_i32(b, chans, data, src_comps);
985
986 bi_index tmp = bi_temp(b->shader);
987 bi_instr *collect = bi_collect_i32_to(b, tmp, nr);
988
989 bi_foreach_src(collect, w)
990 collect->src[w] = chans[w];
991
992 data = tmp;
993 }
994
995 bool psiz =
996 (nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PSIZ);
997
998 bi_index a[4] = {bi_null()};
999
1000 if (b->shader->arch <= 8 && b->shader->idvs == BI_IDVS_POSITION) {
1001 /* Bifrost position shaders have a fast path */
1002 assert(T == nir_type_float16 || T == nir_type_float32);
1003 unsigned regfmt = (T == nir_type_float16) ? 0 : 1;
1004 unsigned identity = (b->shader->arch == 6) ? 0x688 : 0;
1005 unsigned snap4 = 0x5E;
1006 uint32_t format = identity | (snap4 << 12) | (regfmt << 24);
1007
1008 bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59),
1009 bi_imm_u32(format), regfmt, nr - 1);
1010 } else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) {
1011 bi_index index = bi_preload(b, 59);
1012
1013 if (psiz) {
1014 assert(T_size == 16 && "should've been lowered");
1015 index = bi_iadd_imm_i32(b, index, 4);
1016 }
1017
1018 bi_index address = bi_lea_buf_imm(b, index);
1019 bi_emit_split_i32(b, a, address, 2);
1020
1021 bool varying = (b->shader->idvs == BI_IDVS_VARYING);
1022
1023 bi_store(b, nr * nir_src_bit_size(instr->src[0]), data, a[0], a[1],
1024 varying ? BI_SEG_VARY : BI_SEG_POS,
1025 varying ? bi_varying_offset(b->shader, instr) : 0);
1026 } else if (immediate) {
1027 bi_index address = bi_lea_attr_imm(b, bi_vertex_id(b), bi_instance_id(b),
1028 regfmt, imm_index);
1029 bi_emit_split_i32(b, a, address, 3);
1030
1031 bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
1032 } else {
1033 bi_index idx = bi_iadd_u32(b, bi_src_index(nir_get_io_offset_src(instr)),
1034 bi_imm_u32(nir_intrinsic_base(instr)), false);
1035 bi_index address =
1036 bi_lea_attr(b, bi_vertex_id(b), bi_instance_id(b), idx, regfmt);
1037 bi_emit_split_i32(b, a, address, 3);
1038
1039 bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
1040 }
1041 }
1042
1043 static void
bi_emit_load_ubo(bi_builder * b,nir_intrinsic_instr * instr)1044 bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
1045 {
1046 nir_src *offset = nir_get_io_offset_src(instr);
1047
1048 bool offset_is_const = nir_src_is_const(*offset);
1049 bi_index dyn_offset = bi_src_index(offset);
1050 uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0;
1051
1052 bi_load_ubo_to(b, instr->num_components * instr->def.bit_size,
1053 bi_def_index(&instr->def),
1054 offset_is_const ? bi_imm_u32(const_offset) : dyn_offset,
1055 bi_src_index(&instr->src[0]));
1056 }
1057
1058 static void
bi_emit_load_push_constant(bi_builder * b,nir_intrinsic_instr * instr)1059 bi_emit_load_push_constant(bi_builder *b, nir_intrinsic_instr *instr)
1060 {
1061 assert(b->shader->inputs->no_ubo_to_push && "can't mix push constant forms");
1062
1063 nir_src *offset = &instr->src[0];
1064 assert(nir_src_is_const(*offset) && "no indirect push constants");
1065 uint32_t base = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
1066 assert((base & 3) == 0 && "unaligned push constants");
1067
1068 unsigned bits = instr->def.bit_size * instr->def.num_components;
1069
1070 unsigned n = DIV_ROUND_UP(bits, 32);
1071 assert(n <= 4);
1072 bi_index channels[4] = {bi_null()};
1073
1074 for (unsigned i = 0; i < n; ++i) {
1075 unsigned word = (base >> 2) + i;
1076
1077 channels[i] = bi_fau(BIR_FAU_UNIFORM | (word >> 1), word & 1);
1078 }
1079
1080 bi_emit_collect_to(b, bi_def_index(&instr->def), channels, n);
1081 }
1082
1083 static bi_index
bi_addr_high(bi_builder * b,nir_src * src)1084 bi_addr_high(bi_builder *b, nir_src *src)
1085 {
1086 return (nir_src_bit_size(*src) == 64) ? bi_extract(b, bi_src_index(src), 1)
1087 : bi_zero();
1088 }
1089
1090 static void
bi_handle_segment(bi_builder * b,bi_index * addr_lo,bi_index * addr_hi,enum bi_seg seg,int16_t * offset)1091 bi_handle_segment(bi_builder *b, bi_index *addr_lo, bi_index *addr_hi,
1092 enum bi_seg seg, int16_t *offset)
1093 {
1094 /* Not needed on Bifrost or for global accesses */
1095 if (b->shader->arch < 9 || seg == BI_SEG_NONE)
1096 return;
1097
1098 /* There is no segment modifier on Valhall. Instead, we need to
1099 * emit the arithmetic ourselves. We do have an offset
1100 * available, which saves an instruction for constant offsets.
1101 */
1102 bool wls = (seg == BI_SEG_WLS);
1103 assert(wls || (seg == BI_SEG_TL));
1104
1105 enum bir_fau fau = wls ? BIR_FAU_WLS_PTR : BIR_FAU_TLS_PTR;
1106
1107 bi_index base_lo = bi_fau(fau, false);
1108
1109 if (offset && addr_lo->type == BI_INDEX_CONSTANT &&
1110 addr_lo->value == (int16_t)addr_lo->value) {
1111 *offset = addr_lo->value;
1112 *addr_lo = base_lo;
1113 } else {
1114 *addr_lo = bi_iadd_u32(b, base_lo, *addr_lo, false);
1115 }
1116
1117 /* Do not allow overflow for WLS or TLS */
1118 *addr_hi = bi_fau(fau, true);
1119 }
1120
1121 static void
bi_emit_load(bi_builder * b,nir_intrinsic_instr * instr,enum bi_seg seg)1122 bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
1123 {
1124 int16_t offset = 0;
1125 unsigned bits = instr->num_components * instr->def.bit_size;
1126 bi_index dest = bi_def_index(&instr->def);
1127 bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[0]), 0);
1128 bi_index addr_hi = bi_addr_high(b, &instr->src[0]);
1129
1130 bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
1131
1132 bi_load_to(b, bits, dest, addr_lo, addr_hi, seg, offset);
1133 bi_emit_cached_split(b, dest, bits);
1134 }
1135
1136 static void
bi_emit_store(bi_builder * b,nir_intrinsic_instr * instr,enum bi_seg seg)1137 bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
1138 {
1139 /* Require contiguous masks, gauranteed by nir_lower_wrmasks */
1140 assert(nir_intrinsic_write_mask(instr) ==
1141 BITFIELD_MASK(instr->num_components));
1142
1143 int16_t offset = 0;
1144 bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[1]), 0);
1145 bi_index addr_hi = bi_addr_high(b, &instr->src[1]);
1146
1147 bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
1148
1149 bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]),
1150 bi_src_index(&instr->src[0]), addr_lo, addr_hi, seg, offset);
1151 }
1152
1153 /* Exchanges the staging register with memory */
1154
1155 static void
bi_emit_axchg_to(bi_builder * b,bi_index dst,bi_index addr,nir_src * arg,enum bi_seg seg)1156 bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg,
1157 enum bi_seg seg)
1158 {
1159 assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
1160
1161 unsigned sz = nir_src_bit_size(*arg);
1162 assert(sz == 32 || sz == 64);
1163
1164 bi_index data = bi_src_index(arg);
1165
1166 bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
1167
1168 if (b->shader->arch >= 9)
1169 bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
1170 else if (seg == BI_SEG_WLS)
1171 addr_hi = bi_zero();
1172
1173 bi_axchg_to(b, sz, dst, data, bi_extract(b, addr, 0), addr_hi, seg);
1174 }
1175
1176 /* Exchanges the second staging register with memory if comparison with first
1177 * staging register passes */
1178
1179 static void
bi_emit_acmpxchg_to(bi_builder * b,bi_index dst,bi_index addr,nir_src * arg_1,nir_src * arg_2,enum bi_seg seg)1180 bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1,
1181 nir_src *arg_2, enum bi_seg seg)
1182 {
1183 assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
1184
1185 /* hardware is swapped from NIR */
1186 bi_index src0 = bi_src_index(arg_2);
1187 bi_index src1 = bi_src_index(arg_1);
1188
1189 unsigned sz = nir_src_bit_size(*arg_1);
1190 assert(sz == 32 || sz == 64);
1191
1192 bi_index data_words[] = {
1193 bi_extract(b, src0, 0),
1194 sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src0, 1),
1195
1196 /* 64-bit */
1197 bi_extract(b, src1, 0),
1198 sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src1, 1),
1199 };
1200
1201 bi_index in = bi_temp(b->shader);
1202 bi_emit_collect_to(b, in, data_words, 2 * (sz / 32));
1203 bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
1204
1205 if (b->shader->arch >= 9)
1206 bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
1207 else if (seg == BI_SEG_WLS)
1208 addr_hi = bi_zero();
1209
1210 bi_index out = bi_acmpxchg(b, sz, in, bi_extract(b, addr, 0), addr_hi, seg);
1211 bi_emit_cached_split(b, out, sz);
1212
1213 bi_index inout_words[] = {bi_extract(b, out, 0),
1214 sz == 64 ? bi_extract(b, out, 1) : bi_null()};
1215
1216 bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
1217 }
1218
1219 static enum bi_atom_opc
bi_atom_opc_for_nir(nir_atomic_op op)1220 bi_atom_opc_for_nir(nir_atomic_op op)
1221 {
1222 /* clang-format off */
1223 switch (op) {
1224 case nir_atomic_op_iadd: return BI_ATOM_OPC_AADD;
1225 case nir_atomic_op_imin: return BI_ATOM_OPC_ASMIN;
1226 case nir_atomic_op_umin: return BI_ATOM_OPC_AUMIN;
1227 case nir_atomic_op_imax: return BI_ATOM_OPC_ASMAX;
1228 case nir_atomic_op_umax: return BI_ATOM_OPC_AUMAX;
1229 case nir_atomic_op_iand: return BI_ATOM_OPC_AAND;
1230 case nir_atomic_op_ior: return BI_ATOM_OPC_AOR;
1231 case nir_atomic_op_ixor: return BI_ATOM_OPC_AXOR;
1232 default: unreachable("Unexpected computational atomic");
1233 }
1234 /* clang-format on */
1235 }
1236
1237 /* Optimized unary atomics are available with an implied #1 argument */
1238
1239 static bool
bi_promote_atom_c1(enum bi_atom_opc op,bi_index arg,enum bi_atom_opc * out)1240 bi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out)
1241 {
1242 /* Check we have a compatible constant */
1243 if (arg.type != BI_INDEX_CONSTANT)
1244 return false;
1245
1246 if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD)))
1247 return false;
1248
1249 /* Check for a compatible operation */
1250 switch (op) {
1251 case BI_ATOM_OPC_AADD:
1252 *out = (arg.value == 1) ? BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC;
1253 return true;
1254 case BI_ATOM_OPC_ASMAX:
1255 *out = BI_ATOM_OPC_ASMAX1;
1256 return true;
1257 case BI_ATOM_OPC_AUMAX:
1258 *out = BI_ATOM_OPC_AUMAX1;
1259 return true;
1260 case BI_ATOM_OPC_AOR:
1261 *out = BI_ATOM_OPC_AOR1;
1262 return true;
1263 default:
1264 return false;
1265 }
1266 }
1267
1268 /*
1269 * Coordinates are 16-bit integers in Bifrost but 32-bit in NIR. We need to
1270 * translate between these forms (with MKVEC.v2i16).
1271 *
1272 * Aditionally on Valhall, cube maps in the attribute pipe are treated as 2D
1273 * arrays. For uniform handling, we also treat 3D textures like 2D arrays.
1274 *
1275 * Our indexing needs to reflects this.
1276 */
1277 static bi_index
bi_emit_image_coord(bi_builder * b,bi_index coord,unsigned src_idx,unsigned coord_comps,bool is_array)1278 bi_emit_image_coord(bi_builder *b, bi_index coord, unsigned src_idx,
1279 unsigned coord_comps, bool is_array)
1280 {
1281 assert(coord_comps > 0 && coord_comps <= 3);
1282
1283 if (src_idx == 0) {
1284 if (coord_comps == 1 || (coord_comps == 2 && is_array))
1285 return bi_extract(b, coord, 0);
1286 else
1287 return bi_mkvec_v2i16(b, bi_half(bi_extract(b, coord, 0), false),
1288 bi_half(bi_extract(b, coord, 1), false));
1289 } else {
1290 if (coord_comps == 3 && b->shader->arch >= 9)
1291 return bi_mkvec_v2i16(b, bi_imm_u16(0),
1292 bi_half(bi_extract(b, coord, 2), false));
1293 else if (coord_comps == 2 && is_array && b->shader->arch >= 9)
1294 return bi_mkvec_v2i16(b, bi_imm_u16(0),
1295 bi_half(bi_extract(b, coord, 1), false));
1296 else if (coord_comps == 3)
1297 return bi_extract(b, coord, 2);
1298 else if (coord_comps == 2 && is_array)
1299 return bi_extract(b, coord, 1);
1300 else
1301 return bi_zero();
1302 }
1303 }
1304
1305 static bi_index
bi_emit_image_index(bi_builder * b,nir_intrinsic_instr * instr)1306 bi_emit_image_index(bi_builder *b, nir_intrinsic_instr *instr)
1307 {
1308 nir_src src = instr->src[0];
1309 bi_index index = bi_src_index(&src);
1310 bi_context *ctx = b->shader;
1311
1312 /* Images come after vertex attributes, so handle an explicit offset */
1313 unsigned offset = (ctx->stage == MESA_SHADER_VERTEX)
1314 ? util_bitcount64(ctx->nir->info.inputs_read)
1315 : 0;
1316
1317 if (offset == 0)
1318 return index;
1319 else if (nir_src_is_const(src))
1320 return bi_imm_u32(nir_src_as_uint(src) + offset);
1321 else
1322 return bi_iadd_u32(b, index, bi_imm_u32(offset), false);
1323 }
1324
1325 static void
bi_emit_image_load(bi_builder * b,nir_intrinsic_instr * instr)1326 bi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr)
1327 {
1328 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
1329 unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
1330 bool array = nir_intrinsic_image_array(instr);
1331
1332 bi_index coords = bi_src_index(&instr->src[1]);
1333 bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array);
1334 bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array);
1335 bi_index dest = bi_def_index(&instr->def);
1336 enum bi_register_format regfmt =
1337 bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr));
1338 enum bi_vecsize vecsize = instr->num_components - 1;
1339
1340 assert(dim != GLSL_SAMPLER_DIM_MS && "MSAA'd image not lowered");
1341
1342 if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
1343 bi_instr *I = bi_ld_tex_imm_to(b, dest, xy, zw, regfmt, vecsize,
1344 nir_src_as_uint(instr->src[0]));
1345
1346 I->table = PAN_TABLE_IMAGE;
1347 } else if (b->shader->arch >= 9) {
1348 unreachable("Indirect images on Valhall not yet supported");
1349 } else {
1350 bi_ld_attr_tex_to(b, dest, xy, zw, bi_emit_image_index(b, instr), regfmt,
1351 vecsize);
1352 }
1353
1354 bi_split_def(b, &instr->def);
1355 }
1356
1357 static void
bi_emit_lea_image_to(bi_builder * b,bi_index dest,nir_intrinsic_instr * instr)1358 bi_emit_lea_image_to(bi_builder *b, bi_index dest, nir_intrinsic_instr *instr)
1359 {
1360 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
1361 bool array = nir_intrinsic_image_array(instr);
1362 unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
1363
1364 assert(dim != GLSL_SAMPLER_DIM_MS && "MSAA'd image not lowered");
1365
1366 enum bi_register_format type =
1367 (instr->intrinsic == nir_intrinsic_image_store)
1368 ? bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr))
1369 : BI_REGISTER_FORMAT_AUTO;
1370
1371 bi_index coords = bi_src_index(&instr->src[1]);
1372 bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array);
1373 bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array);
1374
1375 if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
1376 bi_instr *I = bi_lea_tex_imm_to(b, dest, xy, zw, false,
1377 nir_src_as_uint(instr->src[0]));
1378
1379 I->table = PAN_TABLE_IMAGE;
1380 } else if (b->shader->arch >= 9) {
1381 unreachable("Indirect images on Valhall not yet supported");
1382 } else {
1383 bi_instr *I = bi_lea_attr_tex_to(b, dest, xy, zw,
1384 bi_emit_image_index(b, instr), type);
1385
1386 /* LEA_ATTR_TEX defaults to the secondary attribute table, but
1387 * our ABI has all images in the primary attribute table
1388 */
1389 I->table = BI_TABLE_ATTRIBUTE_1;
1390 }
1391
1392 bi_emit_cached_split(b, dest, 3 * 32);
1393 }
1394
1395 static bi_index
bi_emit_lea_image(bi_builder * b,nir_intrinsic_instr * instr)1396 bi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr)
1397 {
1398 bi_index dest = bi_temp(b->shader);
1399 bi_emit_lea_image_to(b, dest, instr);
1400 return dest;
1401 }
1402
1403 static void
bi_emit_image_store(bi_builder * b,nir_intrinsic_instr * instr)1404 bi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr)
1405 {
1406 bi_index a[4] = {bi_null()};
1407 bi_emit_split_i32(b, a, bi_emit_lea_image(b, instr), 3);
1408
1409 /* Due to SPIR-V limitations, the source type is not fully reliable: it
1410 * reports uint32 even for write_imagei. This causes an incorrect
1411 * u32->s32->u32 roundtrip which incurs an unwanted clamping. Use auto32
1412 * instead, which will match per the OpenCL spec. Of course this does
1413 * not work for 16-bit stores, but those are not available in OpenCL.
1414 */
1415 nir_alu_type T = nir_intrinsic_src_type(instr);
1416 assert(nir_alu_type_get_type_size(T) == 32);
1417
1418 bi_st_cvt(b, bi_src_index(&instr->src[3]), a[0], a[1], a[2],
1419 BI_REGISTER_FORMAT_AUTO, instr->num_components - 1);
1420 }
1421
1422 static void
bi_emit_atomic_i32_to(bi_builder * b,bi_index dst,bi_index addr,bi_index arg,nir_atomic_op op)1423 bi_emit_atomic_i32_to(bi_builder *b, bi_index dst, bi_index addr, bi_index arg,
1424 nir_atomic_op op)
1425 {
1426 enum bi_atom_opc opc = bi_atom_opc_for_nir(op);
1427 enum bi_atom_opc post_opc = opc;
1428 bool bifrost = b->shader->arch <= 8;
1429
1430 /* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't
1431 * take any vector but can still output in RETURN mode */
1432 bi_index tmp_dest = bifrost ? bi_temp(b->shader) : dst;
1433 unsigned sr_count = bifrost ? 2 : 1;
1434
1435 /* Generate either ATOM or ATOM1 as required */
1436 if (bi_promote_atom_c1(opc, arg, &opc)) {
1437 bi_atom1_return_i32_to(b, tmp_dest, bi_extract(b, addr, 0),
1438 bi_extract(b, addr, 1), opc, sr_count);
1439 } else {
1440 bi_atom_return_i32_to(b, tmp_dest, arg, bi_extract(b, addr, 0),
1441 bi_extract(b, addr, 1), opc, sr_count);
1442 }
1443
1444 if (bifrost) {
1445 /* Post-process it */
1446 bi_emit_cached_split_i32(b, tmp_dest, 2);
1447 bi_atom_post_i32_to(b, dst, bi_extract(b, tmp_dest, 0),
1448 bi_extract(b, tmp_dest, 1), post_opc);
1449 }
1450 }
1451
1452 static void
bi_emit_load_frag_coord_zw(bi_builder * b,bi_index dst,unsigned channel)1453 bi_emit_load_frag_coord_zw(bi_builder *b, bi_index dst, unsigned channel)
1454 {
1455 bi_ld_var_special_to(
1456 b, dst, bi_zero(), BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER,
1457 BI_UPDATE_CLOBBER,
1458 (channel == 2) ? BI_VARYING_NAME_FRAG_Z : BI_VARYING_NAME_FRAG_W,
1459 BI_VECSIZE_NONE);
1460 }
1461
1462 static void
bi_emit_ld_tile(bi_builder * b,nir_intrinsic_instr * instr)1463 bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
1464 {
1465 bi_index dest = bi_def_index(&instr->def);
1466 nir_alu_type T = nir_intrinsic_dest_type(instr);
1467 enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
1468 unsigned size = instr->def.bit_size;
1469 unsigned nr = instr->num_components;
1470
1471 /* Get the render target */
1472 nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
1473 unsigned loc = sem.location;
1474 assert(loc >= FRAG_RESULT_DATA0);
1475 unsigned rt = (loc - FRAG_RESULT_DATA0);
1476
1477 bi_ld_tile_to(b, dest, bi_pixel_indices(b, rt), bi_coverage(b),
1478 bi_src_index(&instr->src[0]), regfmt, nr - 1);
1479 bi_emit_cached_split(b, dest, size * nr);
1480 }
1481
1482 static void
bi_emit_intrinsic(bi_builder * b,nir_intrinsic_instr * instr)1483 bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
1484 {
1485 bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest
1486 ? bi_def_index(&instr->def)
1487 : bi_null();
1488 gl_shader_stage stage = b->shader->stage;
1489
1490 switch (instr->intrinsic) {
1491 case nir_intrinsic_load_barycentric_pixel:
1492 case nir_intrinsic_load_barycentric_centroid:
1493 case nir_intrinsic_load_barycentric_sample:
1494 case nir_intrinsic_load_barycentric_at_sample:
1495 case nir_intrinsic_load_barycentric_at_offset:
1496 /* handled later via load_vary */
1497 break;
1498 case nir_intrinsic_load_interpolated_input:
1499 case nir_intrinsic_load_input:
1500 if (b->shader->inputs->is_blend)
1501 bi_emit_load_blend_input(b, instr);
1502 else if (stage == MESA_SHADER_FRAGMENT)
1503 bi_emit_load_vary(b, instr);
1504 else if (stage == MESA_SHADER_VERTEX)
1505 bi_emit_load_attr(b, instr);
1506 else
1507 unreachable("Unsupported shader stage");
1508 break;
1509
1510 case nir_intrinsic_store_output:
1511 if (stage == MESA_SHADER_FRAGMENT)
1512 bi_emit_fragment_out(b, instr);
1513 else if (stage == MESA_SHADER_VERTEX)
1514 bi_emit_store_vary(b, instr);
1515 else
1516 unreachable("Unsupported shader stage");
1517 break;
1518
1519 case nir_intrinsic_store_combined_output_pan:
1520 assert(stage == MESA_SHADER_FRAGMENT);
1521 bi_emit_fragment_out(b, instr);
1522 break;
1523
1524 case nir_intrinsic_load_ubo:
1525 bi_emit_load_ubo(b, instr);
1526 break;
1527
1528 case nir_intrinsic_load_push_constant:
1529 bi_emit_load_push_constant(b, instr);
1530 break;
1531
1532 case nir_intrinsic_load_global:
1533 case nir_intrinsic_load_global_constant:
1534 bi_emit_load(b, instr, BI_SEG_NONE);
1535 break;
1536
1537 case nir_intrinsic_store_global:
1538 bi_emit_store(b, instr, BI_SEG_NONE);
1539 break;
1540
1541 case nir_intrinsic_load_scratch:
1542 bi_emit_load(b, instr, BI_SEG_TL);
1543 break;
1544
1545 case nir_intrinsic_store_scratch:
1546 bi_emit_store(b, instr, BI_SEG_TL);
1547 break;
1548
1549 case nir_intrinsic_load_shared:
1550 bi_emit_load(b, instr, BI_SEG_WLS);
1551 break;
1552
1553 case nir_intrinsic_store_shared:
1554 bi_emit_store(b, instr, BI_SEG_WLS);
1555 break;
1556
1557 case nir_intrinsic_barrier:
1558 if (nir_intrinsic_execution_scope(instr) != SCOPE_NONE) {
1559 assert(b->shader->stage != MESA_SHADER_FRAGMENT);
1560 assert(nir_intrinsic_execution_scope(instr) > SCOPE_SUBGROUP &&
1561 "todo: subgroup barriers (different divergence rules)");
1562 bi_barrier(b);
1563 }
1564 /* Blob doesn't seem to do anything for memory barriers, so no need to
1565 * check nir_intrinsic_memory_scope().
1566 */
1567 break;
1568
1569 case nir_intrinsic_shared_atomic: {
1570 nir_atomic_op op = nir_intrinsic_atomic_op(instr);
1571
1572 if (op == nir_atomic_op_xchg) {
1573 bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1574 BI_SEG_WLS);
1575 } else {
1576 assert(nir_src_bit_size(instr->src[1]) == 32);
1577
1578 bi_index addr = bi_src_index(&instr->src[0]);
1579 bi_index addr_hi;
1580
1581 if (b->shader->arch >= 9) {
1582 bi_handle_segment(b, &addr, &addr_hi, BI_SEG_WLS, NULL);
1583 addr = bi_collect_v2i32(b, addr, addr_hi);
1584 } else {
1585 addr = bi_seg_add_i64(b, addr, bi_zero(), false, BI_SEG_WLS);
1586 bi_emit_cached_split(b, addr, 64);
1587 }
1588
1589 bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]), op);
1590 }
1591
1592 bi_split_def(b, &instr->def);
1593 break;
1594 }
1595
1596 case nir_intrinsic_global_atomic: {
1597 nir_atomic_op op = nir_intrinsic_atomic_op(instr);
1598
1599 if (op == nir_atomic_op_xchg) {
1600 bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1601 BI_SEG_NONE);
1602 } else {
1603 assert(nir_src_bit_size(instr->src[1]) == 32);
1604
1605 bi_emit_atomic_i32_to(b, dst, bi_src_index(&instr->src[0]),
1606 bi_src_index(&instr->src[1]), op);
1607 }
1608
1609 bi_split_def(b, &instr->def);
1610 break;
1611 }
1612
1613 case nir_intrinsic_image_texel_address:
1614 bi_emit_lea_image_to(b, dst, instr);
1615 break;
1616
1617 case nir_intrinsic_image_load:
1618 bi_emit_image_load(b, instr);
1619 break;
1620
1621 case nir_intrinsic_image_store:
1622 bi_emit_image_store(b, instr);
1623 break;
1624
1625 case nir_intrinsic_global_atomic_swap:
1626 bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1627 &instr->src[2], BI_SEG_NONE);
1628 bi_split_def(b, &instr->def);
1629 break;
1630
1631 case nir_intrinsic_shared_atomic_swap:
1632 bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1633 &instr->src[2], BI_SEG_WLS);
1634 bi_split_def(b, &instr->def);
1635 break;
1636
1637 case nir_intrinsic_load_pixel_coord:
1638 /* Vectorized load of the preloaded i16vec2 */
1639 bi_mov_i32_to(b, dst, bi_preload(b, 59));
1640 break;
1641
1642 case nir_intrinsic_load_frag_coord_zw:
1643 bi_emit_load_frag_coord_zw(b, dst, nir_intrinsic_component(instr));
1644 break;
1645
1646 case nir_intrinsic_load_converted_output_pan:
1647 bi_emit_ld_tile(b, instr);
1648 break;
1649
1650 case nir_intrinsic_discard_if:
1651 bi_discard_b32(b, bi_src_index(&instr->src[0]));
1652 break;
1653
1654 case nir_intrinsic_discard:
1655 bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ);
1656 break;
1657
1658 case nir_intrinsic_load_sample_positions_pan:
1659 bi_collect_v2i32_to(b, dst, bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false),
1660 bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true));
1661 break;
1662
1663 case nir_intrinsic_load_sample_mask_in:
1664 /* r61[0:15] contains the coverage bitmap */
1665 bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false));
1666 break;
1667
1668 case nir_intrinsic_load_sample_mask:
1669 bi_mov_i32_to(b, dst, bi_coverage(b));
1670 break;
1671
1672 case nir_intrinsic_load_sample_id:
1673 bi_load_sample_id_to(b, dst);
1674 break;
1675
1676 case nir_intrinsic_load_front_face:
1677 /* r58 == 0 means primitive is front facing */
1678 bi_icmp_i32_to(b, dst, bi_preload(b, 58), bi_zero(), BI_CMPF_EQ,
1679 BI_RESULT_TYPE_M1);
1680 break;
1681
1682 case nir_intrinsic_load_point_coord:
1683 bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32,
1684 BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER,
1685 BI_VARYING_NAME_POINT, BI_VECSIZE_V2);
1686 bi_emit_cached_split_i32(b, dst, 2);
1687 break;
1688
1689 /* It appears vertex_id is zero-based with Bifrost geometry flows, but
1690 * not with Valhall's memory-allocated IDVS geometry flow. We only support
1691 * the new flow on Valhall so this is lowered in NIR.
1692 */
1693 case nir_intrinsic_load_vertex_id:
1694 case nir_intrinsic_load_vertex_id_zero_base:
1695 assert(b->shader->malloc_idvs ==
1696 (instr->intrinsic == nir_intrinsic_load_vertex_id));
1697
1698 bi_mov_i32_to(b, dst, bi_vertex_id(b));
1699 break;
1700
1701 case nir_intrinsic_load_instance_id:
1702 bi_mov_i32_to(b, dst, bi_instance_id(b));
1703 break;
1704
1705 case nir_intrinsic_load_subgroup_invocation:
1706 bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false));
1707 break;
1708
1709 case nir_intrinsic_load_local_invocation_id:
1710 bi_collect_v3i32_to(b, dst,
1711 bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 0)),
1712 bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 1)),
1713 bi_u16_to_u32(b, bi_half(bi_preload(b, 56), 0)));
1714 break;
1715
1716 case nir_intrinsic_load_workgroup_id:
1717 bi_collect_v3i32_to(b, dst, bi_preload(b, 57), bi_preload(b, 58),
1718 bi_preload(b, 59));
1719 break;
1720
1721 case nir_intrinsic_load_global_invocation_id:
1722 case nir_intrinsic_load_global_invocation_id_zero_base:
1723 bi_collect_v3i32_to(b, dst, bi_preload(b, 60), bi_preload(b, 61),
1724 bi_preload(b, 62));
1725 break;
1726
1727 case nir_intrinsic_shader_clock:
1728 bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER);
1729 bi_split_def(b, &instr->def);
1730 break;
1731
1732 default:
1733 fprintf(stderr, "Unhandled intrinsic %s\n",
1734 nir_intrinsic_infos[instr->intrinsic].name);
1735 assert(0);
1736 }
1737 }
1738
1739 static void
1740 bi_emit_load_const(bi_builder *b, nir_load_const_instr *instr)
1741 {
1742 /* Make sure we've been lowered */
1743 assert(instr->def.num_components <= (32 / instr->def.bit_size));
1744
1745 /* Accumulate all the channels of the constant, as if we did an
1746 * implicit SEL over them */
1747 uint32_t acc = 0;
1748
1749 for (unsigned i = 0; i < instr->def.num_components; ++i) {
1750 unsigned v =
1751 nir_const_value_as_uint(instr->value[i], instr->def.bit_size);
1752 acc |= (v << (i * instr->def.bit_size));
1753 }
1754
1755 bi_mov_i32_to(b, bi_get_index(instr->def.index), bi_imm_u32(acc));
1756 }
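/* For example, a lowered v2i16 constant with channels 0x1234 and 0xabcd
 * accumulates to acc = (0x1234 << 0) | (0xabcd << 16) = 0xabcd1234, which the
 * single MOV.i32 above then materializes. */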
1757
1758 static bi_index
1759 bi_alu_src_index(bi_builder *b, nir_alu_src src, unsigned comps)
1760 {
1761 unsigned bitsize = nir_src_bit_size(src.src);
1762
1763 /* The bi_index carries the 32-bit (word) offset separately from the
1764 * subword swizzle; handle the offset first */
1765
1766 unsigned offset = 0;
1767
1768 assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
1769 unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
1770
1771 for (unsigned i = 0; i < comps; ++i) {
1772 unsigned new_offset = (src.swizzle[i] >> subword_shift);
1773
1774 if (i > 0)
1775 assert(offset == new_offset && "wrong vectorization");
1776
1777 offset = new_offset;
1778 }
1779
1780 bi_index idx = bi_extract(b, bi_src_index(&src.src), offset);
1781
1782 /* Compose the subword swizzle with existing (identity) swizzle */
1783 assert(idx.swizzle == BI_SWIZZLE_H01);
1784
1785 /* Bigger vectors should have been lowered */
1786 assert(comps <= (1 << subword_shift));
1787
1788 if (bitsize == 16) {
1789 unsigned c0 = src.swizzle[0] & 1;
1790 unsigned c1 = (comps > 1) ? src.swizzle[1] & 1 : c0;
1791 idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1);
1792 } else if (bitsize == 8) {
1793 /* 8-bit vectors not yet supported */
1794 assert(comps == 1 && "8-bit vectors not supported");
1795 idx.swizzle = BI_SWIZZLE_B0000 + (src.swizzle[0] & 3);
1796 }
1797
1798 return idx;
1799 }
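/* For example, a 16-bit source with swizzle[0] = 3 and comps = 1 has
 * subword_shift = 1, so the word offset is 3 >> 1 = 1 and the subword is
 * 3 & 1 = 1: we extract word 1 and the composed swizzle picks its high half
 * for both lanes. */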
1800
1801 static enum bi_round
1802 bi_nir_round(nir_op op)
1803 {
1804 switch (op) {
1805 case nir_op_fround_even:
1806 return BI_ROUND_NONE;
1807 case nir_op_ftrunc:
1808 return BI_ROUND_RTZ;
1809 case nir_op_fceil:
1810 return BI_ROUND_RTP;
1811 case nir_op_ffloor:
1812 return BI_ROUND_RTN;
1813 default:
1814 unreachable("invalid nir round op");
1815 }
1816 }
1817
1818 /* Convenience for lowered transcendentals */
1819
1820 static bi_index
1821 bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1)
1822 {
1823 return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f));
1824 }
1825
1826 /* Approximate with FRCP_APPROX.f32 and apply a single iteration of
1827 * Newton-Raphson to improve precision */
1828
1829 static void
1830 bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0)
1831 {
1832 bi_index x1 = bi_frcp_approx_f32(b, s0);
1833 bi_index m = bi_frexpm_f32(b, s0, false, false);
1834 bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, false);
1835 bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0), bi_zero(),
1836 BI_SPECIAL_N);
1837 bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e, BI_SPECIAL_NONE);
1838 }
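/* Written out (and ignoring the mantissa/exponent split handled by the
 * rscale, which keeps intermediates in range), this is the standard
 * Newton-Raphson step for the reciprocal:
 *
 *    x1 ~= 1/s0,  t1 = 1 - s0*x1,  dst = x1 + x1*t1 = x1*(2 - s0*x1)
 */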
1839
1840 static void
1841 bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0)
1842 {
1843 bi_index x1 = bi_frsq_approx_f32(b, s0);
1844 bi_index m = bi_frexpm_f32(b, s0, false, true);
1845 bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, true);
1846 bi_index t1 = bi_fmul_f32(b, x1, x1);
1847 bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0),
1848 bi_imm_u32(-1), BI_SPECIAL_N);
1849 bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e, BI_SPECIAL_N);
1850 }
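/* Likewise, ignoring the exponent rescaling, this is one Newton-Raphson step
 * for the reciprocal square root:
 *
 *    x1 ~= 1/sqrt(s0),  t2 = (1 - s0*x1^2)/2,  dst = x1 + x1*t2
 *                                                  = x1*(3 - s0*x1^2)/2
 */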
1851
1852 /* More complex transcendentals, see
1853 * https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc
1854 * for documentation */
1855
1856 static void
1857 bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0)
1858 {
1859 bi_index t1 = bi_temp(b->shader);
1860 bi_instr *t1_instr = bi_fadd_f32_to(b, t1, s0, bi_imm_u32(0x49400000));
1861 t1_instr->clamp = BI_CLAMP_CLAMP_0_INF;
1862
1863 bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000));
1864
1865 bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader), s0, bi_neg(t2));
1866 a2->clamp = BI_CLAMP_CLAMP_M1_1;
1867
1868 bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE);
1869 bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false);
1870 bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4));
1871 bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635),
1872 bi_imm_u32(0x3e75fffa));
1873 bi_index p2 = bi_fma_f32(b, p1, a2->dest[0], bi_imm_u32(0x3f317218));
1874 bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2);
1875 bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader), p3, a1t, a1t, a1i,
1876 BI_SPECIAL_NONE);
1877 x->clamp = BI_CLAMP_CLAMP_0_INF;
1878
1879 bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0);
1880 max->sem = BI_SEM_NAN_PROPAGATE;
1881 }
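/* The hex constants above are (approximately) the Taylor coefficients of
 * 2^r - 1 in r: 0x3f317218 ~= ln(2), 0x3e75fffa ~= ln(2)^2/2, and
 * 0x3d635635 ~= ln(2)^3/6, so p3 approximates 2^a2 - 1 for the small clamped
 * residual a2 in [-1, 1]. */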
1882
1883 static void
1884 bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base)
1885 {
1886 /* Scale by the base, multiply by 2^24, and convert to integer to get an
1887 * 8:24 fixed-point input */
1888 bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(),
1889 bi_imm_u32(24), BI_SPECIAL_NONE);
1890 bi_instr *fixed_pt = bi_f32_to_s32_to(b, bi_temp(b->shader), scale);
1891 fixed_pt->round = BI_ROUND_NONE; // XXX
1892
1893 /* Compute the result for the fixed-point input, but pass along
1894 * the floating-point scale for correct NaN propagation */
1895 bi_fexp_f32_to(b, dst, fixed_pt->dest[0], scale);
1896 }
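/* For example, with s0 = 1.5 and log2_base = 1.0 (i.e. exp2), the scale is
 * 1.5 * 2^24 = 0x01800000 after conversion: integer part 1 in the top 8 bits
 * and fraction 0.5 in the bottom 24 bits of the fixed-point input. */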
1897
1898 static void
1899 bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
1900 {
1901 /* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */
1902 bi_index a1 = bi_frexpm_f32(b, s0, true, false);
1903 bi_index ei = bi_frexpe_f32(b, s0, true, false);
1904 bi_index ef = bi_s32_to_f32(b, ei);
1905
1906 /* xt estimates -log(r1), a coarse approximation of log(a1) */
1907 bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE);
1908 bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE);
1909
1910 /* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) -
1911 * log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1),
1912 * and then log(s0) = x1 + x2 */
1913 bi_index x1 = bi_fadd_f32(b, ef, xt);
1914
1915 /* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by
1916 * polynomial approximation around 1. The series is expressed around
1917 * 1, so set y = (a1 * r1) - 1.0 */
1918 bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0));
1919
1920 /* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate
1921 * log_e(1 + y) by the Taylor series (lower precision than the blob):
1922 * y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */
1923 bi_index loge =
1924 bi_fmul_f32(b, y, bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0)));
1925
1926 bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0)));
1927
1928 /* log(s0) = x1 + x2 */
1929 bi_fadd_f32_to(b, dst, x1, x2);
1930 }
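/* As a sanity check, take s0 = 8.0, so a1 = 1.0 and e = 3; up to table
 * rounding, r1 ~= 1.0 and xt ~= 0.0. Then y = a1*r1 - 1 = 0, hence x2 = 0 and
 * log2(8.0) = x1 + x2 = 3.0 as expected. */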
1931
1932 static void
1933 bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
1934 {
1935 bi_index frexp = bi_frexpe_f32(b, s0, true, false);
1936 bi_index frexpi = bi_s32_to_f32(b, frexp);
1937 bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0);
1938 bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi);
1939 }
1940
1941 static void
1942 bi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
1943 {
1944 bi_index log2_base = bi_null();
1945
1946 if (base.type == BI_INDEX_CONSTANT) {
1947 log2_base = bi_imm_f32(log2f(uif(base.value)));
1948 } else {
1949 log2_base = bi_temp(b->shader);
1950 bi_lower_flog2_32(b, log2_base, base);
1951 }
1952
1953 return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base));
1954 }
1955
1956 static void
1957 bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
1958 {
1959 bi_index log2_base = bi_null();
1960
1961 if (base.type == BI_INDEX_CONSTANT) {
1962 log2_base = bi_imm_f32(log2f(uif(base.value)));
1963 } else {
1964 log2_base = bi_temp(b->shader);
1965 bi_flog2_32(b, log2_base, base);
1966 }
1967
1968 return bi_fexp_32(b, dst, exp, log2_base);
1969 }
1970
1971 /* Bifrost has extremely coarse tables for approximating sin/cos, accessible as
1972 * FSIN/COS_TABLE.u6, which multiplies the bottom 6-bits by pi/32 and
1973 * calculates the results. We use them to calculate sin/cos via a Taylor
1974 * approximation:
1975 *
1976 * f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x)
1977 * sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x)
1978 * cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x)
1979 */
1980
1981 #define TWO_OVER_PI bi_imm_f32(2.0f / 3.14159f)
1982 #define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0)
1983 #define SINCOS_BIAS bi_imm_u32(0x49400000)
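/* SINCOS_BIAS (0x49400000 = 786432.0f = 1.5 * 2^19) is the usual "add a large
 * magic constant" trick: adding it to a small float forces rounding so that
 * the low mantissa bits of the sum hold a fixed-point copy of the addend,
 * which FSIN/COS_TABLE.u6 then consume directly. */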
1984
1985 static void
1986 bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
1987 {
1988 /* The bottom 6 bits of the result, times pi/32, approximate s0 mod 2pi */
1989 bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS);
1990
1991 /* Approximate domain error (small) */
1992 bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS)),
1993 MPI_OVER_TWO, s0);
1994
1995 /* Lookup sin(x), cos(x) */
1996 bi_index sinx = bi_fsin_table_u6(b, x_u6, false);
1997 bi_index cosx = bi_fcos_table_u6(b, x_u6, false);
1998
1999 /* e^2 / 2 */
2000 bi_index e2_over_2 =
2001 bi_fma_rscale_f32(b, e, e, bi_negzero(), bi_imm_u32(-1), BI_SPECIAL_NONE);
2002
2003 /* (-e^2)/2 f''(x) */
2004 bi_index quadratic =
2005 bi_fma_f32(b, bi_neg(e2_over_2), cos ? cosx : sinx, bi_negzero());
2006
2007 /* e f'(x) - (e^2/2) f''(x) */
2008 bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e,
2009 cos ? bi_neg(sinx) : cosx, quadratic);
2010 I->clamp = BI_CLAMP_CLAMP_M1_1;
2011
2012 /* f(x) + e f'(x) - (e^2/2) f''(x) */
2013 bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx);
2014 }
2015
2016 /*
2017 * The XOR lane op is useful for derivative calculations, but not all Bifrost
2018 * implementations have it. Add a safe helper that uses the hardware
2019 * functionality when available and lowers where unavailable.
2020 */
2021 static bi_index
2022 bi_clper_xor(bi_builder *b, bi_index s0, bi_index s1)
2023 {
2024 if (!(b->shader->quirks & BIFROST_LIMITED_CLPER)) {
2025 return bi_clper_i32(b, s0, s1, BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_XOR,
2026 BI_SUBGROUP_SUBGROUP4);
2027 }
2028
2029 bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false);
2030 bi_index lane = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0));
2031 return bi_clper_old_i32(b, s0, lane);
2032 }
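/* For instance, the derivative lowering below passes s1 = 1 to read the
 * horizontal neighbour within the 2x2 quad and s1 = 2 to read the vertical
 * neighbour, since lanes within a quad differ by those bits of the lane ID. */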
2033
2034 static enum bi_cmpf
2035 bi_translate_cmpf(nir_op op)
2036 {
2037 switch (op) {
2038 case nir_op_ieq8:
2039 case nir_op_ieq16:
2040 case nir_op_ieq32:
2041 case nir_op_feq16:
2042 case nir_op_feq32:
2043 return BI_CMPF_EQ;
2044
2045 case nir_op_ine8:
2046 case nir_op_ine16:
2047 case nir_op_ine32:
2048 case nir_op_fneu16:
2049 case nir_op_fneu32:
2050 return BI_CMPF_NE;
2051
2052 case nir_op_ilt8:
2053 case nir_op_ilt16:
2054 case nir_op_ilt32:
2055 case nir_op_flt16:
2056 case nir_op_flt32:
2057 case nir_op_ult8:
2058 case nir_op_ult16:
2059 case nir_op_ult32:
2060 return BI_CMPF_LT;
2061
2062 case nir_op_ige8:
2063 case nir_op_ige16:
2064 case nir_op_ige32:
2065 case nir_op_fge16:
2066 case nir_op_fge32:
2067 case nir_op_uge8:
2068 case nir_op_uge16:
2069 case nir_op_uge32:
2070 return BI_CMPF_GE;
2071
2072 default:
2073 unreachable("invalid comparison");
2074 }
2075 }
2076
2077 static bool
2078 bi_nir_is_replicated(nir_alu_src *src)
2079 {
2080 for (unsigned i = 1; i < nir_src_num_components(src->src); ++i) {
2081 if (src->swizzle[0] != src->swizzle[i])
2082 return false;
2083 }
2084
2085 return true;
2086 }
2087
2088 static void
2089 bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
2090 {
2091 bi_index dst = bi_def_index(&instr->def);
2092 unsigned srcs = nir_op_infos[instr->op].num_inputs;
2093 unsigned sz = instr->def.bit_size;
2094 unsigned comps = instr->def.num_components;
2095 unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0;
2096
2097 /* Indicate scalarness */
2098 if (sz == 16 && comps == 1)
2099 dst.swizzle = BI_SWIZZLE_H00;
2100
2101 /* First, match against the various moves in NIR. These are
2102 * special-cased because they can operate on vectors even after
2103 * lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the
2104 * instruction is no "bigger" than SIMD-within-a-register. These moves
2105 * are the exceptions that need to handle swizzles specially. */
2106
2107 switch (instr->op) {
2108 case nir_op_vec2:
2109 case nir_op_vec3:
2110 case nir_op_vec4:
2111 case nir_op_vec8:
2112 case nir_op_vec16: {
2113 bi_index unoffset_srcs[16] = {bi_null()};
2114 unsigned channels[16] = {0};
2115
2116 for (unsigned i = 0; i < srcs; ++i) {
2117 unoffset_srcs[i] = bi_src_index(&instr->src[i].src);
2118 channels[i] = instr->src[i].swizzle[0];
2119 }
2120
2121 bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz);
2122 return;
2123 }
2124
2125 case nir_op_unpack_32_2x16: {
2126 /* Should have been scalarized */
2127 assert(comps == 2 && sz == 16);
2128
2129 bi_index vec = bi_src_index(&instr->src[0].src);
2130 unsigned chan = instr->src[0].swizzle[0];
2131
2132 bi_mov_i32_to(b, dst, bi_extract(b, vec, chan));
2133 return;
2134 }
2135
2136 case nir_op_unpack_64_2x32_split_x: {
2137 unsigned chan = (instr->src[0].swizzle[0] * 2) + 0;
2138 bi_mov_i32_to(b, dst,
2139 bi_extract(b, bi_src_index(&instr->src[0].src), chan));
2140 return;
2141 }
2142
2143 case nir_op_unpack_64_2x32_split_y: {
2144 unsigned chan = (instr->src[0].swizzle[0] * 2) + 1;
2145 bi_mov_i32_to(b, dst,
2146 bi_extract(b, bi_src_index(&instr->src[0].src), chan));
2147 return;
2148 }
2149
2150 case nir_op_pack_64_2x32_split:
2151 bi_collect_v2i32_to(b, dst,
2152 bi_extract(b, bi_src_index(&instr->src[0].src),
2153 instr->src[0].swizzle[0]),
2154 bi_extract(b, bi_src_index(&instr->src[1].src),
2155 instr->src[1].swizzle[0]));
2156 return;
2157
2158 case nir_op_pack_64_2x32:
2159 bi_collect_v2i32_to(b, dst,
2160 bi_extract(b, bi_src_index(&instr->src[0].src),
2161 instr->src[0].swizzle[0]),
2162 bi_extract(b, bi_src_index(&instr->src[0].src),
2163 instr->src[0].swizzle[1]));
2164 return;
2165
2166 case nir_op_pack_uvec2_to_uint: {
2167 bi_index src = bi_src_index(&instr->src[0].src);
2168
2169 assert(sz == 32 && src_sz == 32);
2170 bi_mkvec_v2i16_to(
2171 b, dst, bi_half(bi_extract(b, src, instr->src[0].swizzle[0]), false),
2172 bi_half(bi_extract(b, src, instr->src[0].swizzle[1]), false));
2173 return;
2174 }
2175
2176 case nir_op_pack_uvec4_to_uint: {
2177 bi_index src = bi_src_index(&instr->src[0].src);
2178
2179 assert(sz == 32 && src_sz == 32);
2180 bi_mkvec_v4i8_to(
2181 b, dst, bi_byte(bi_extract(b, src, instr->src[0].swizzle[0]), 0),
2182 bi_byte(bi_extract(b, src, instr->src[0].swizzle[1]), 0),
2183 bi_byte(bi_extract(b, src, instr->src[0].swizzle[2]), 0),
2184 bi_byte(bi_extract(b, src, instr->src[0].swizzle[3]), 0));
2185 return;
2186 }
2187
2188 case nir_op_mov: {
2189 bi_index idx = bi_src_index(&instr->src[0].src);
2190 bi_index unoffset_srcs[4] = {idx, idx, idx, idx};
2191
2192 unsigned channels[4] = {
2193 comps > 0 ? instr->src[0].swizzle[0] : 0,
2194 comps > 1 ? instr->src[0].swizzle[1] : 0,
2195 comps > 2 ? instr->src[0].swizzle[2] : 0,
2196 comps > 3 ? instr->src[0].swizzle[3] : 0,
2197 };
2198
2199 bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, src_sz);
2200 return;
2201 }
2202
2203 case nir_op_pack_32_2x16: {
2204 assert(comps == 1);
2205
2206 bi_index idx = bi_src_index(&instr->src[0].src);
2207 bi_index unoffset_srcs[4] = {idx, idx, idx, idx};
2208
2209 unsigned channels[2] = {instr->src[0].swizzle[0],
2210 instr->src[0].swizzle[1]};
2211
2212 bi_make_vec_to(b, dst, unoffset_srcs, channels, 2, 16);
2213 return;
2214 }
2215
2216 case nir_op_f2f16:
2217 case nir_op_f2f16_rtz:
2218 case nir_op_f2f16_rtne: {
2219 assert(src_sz == 32);
2220 bi_index idx = bi_src_index(&instr->src[0].src);
2221 bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2222 bi_index s1 =
2223 comps > 1 ? bi_extract(b, idx, instr->src[0].swizzle[1]) : s0;
2224
2225 bi_instr *I = bi_v2f32_to_v2f16_to(b, dst, s0, s1);
2226
2227 /* Override rounding if explicitly requested. Otherwise, the
2228 * default rounding mode is selected by the builder. Depending
2229 * on the float controls required by the shader, the default
2230 * mode may not be nearest-even.
2231 */
2232 if (instr->op == nir_op_f2f16_rtz)
2233 I->round = BI_ROUND_RTZ;
2234 else if (instr->op == nir_op_f2f16_rtne)
2235 I->round = BI_ROUND_NONE; /* Nearest even */
2236
2237 return;
2238 }
2239
2240 /* Vectorized downcasts */
2241 case nir_op_u2u16:
2242 case nir_op_i2i16: {
2243 if (!(src_sz == 32 && comps == 2))
2244 break;
2245
2246 bi_index idx = bi_src_index(&instr->src[0].src);
2247 bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2248 bi_index s1 = bi_extract(b, idx, instr->src[0].swizzle[1]);
2249
2250 bi_mkvec_v2i16_to(b, dst, bi_half(s0, false), bi_half(s1, false));
2251 return;
2252 }
2253
2254 /* While we do not have a direct V2U32_TO_V2F16 instruction, lowering to
2255 * MKVEC.v2i16 + V2U16_TO_V2F16 is more efficient on Bifrost than
2256 * scalarizing due to scheduling (equal cost on Valhall). Additionally
2257 * if the source is replicated the MKVEC.v2i16 can be optimized out.
2258 */
2259 case nir_op_u2f16:
2260 case nir_op_i2f16: {
2261 if (!(src_sz == 32 && comps == 2))
2262 break;
2263
2264 nir_alu_src *src = &instr->src[0];
2265 bi_index idx = bi_src_index(&src->src);
2266 bi_index s0 = bi_extract(b, idx, src->swizzle[0]);
2267 bi_index s1 = bi_extract(b, idx, src->swizzle[1]);
2268
2269 bi_index t =
2270 (src->swizzle[0] == src->swizzle[1])
2271 ? bi_half(s0, false)
2272 : bi_mkvec_v2i16(b, bi_half(s0, false), bi_half(s1, false));
2273
2274 if (instr->op == nir_op_u2f16)
2275 bi_v2u16_to_v2f16_to(b, dst, t);
2276 else
2277 bi_v2s16_to_v2f16_to(b, dst, t);
2278
2279 return;
2280 }
2281
2282 case nir_op_i2i8:
2283 case nir_op_u2u8: {
2284 /* Acts like an 8-bit swizzle */
2285 bi_index idx = bi_src_index(&instr->src[0].src);
2286 unsigned factor = src_sz / 8;
2287 unsigned chan[4] = {0};
2288
2289 for (unsigned i = 0; i < comps; ++i)
2290 chan[i] = instr->src[0].swizzle[i] * factor;
2291
2292 bi_make_vec_to(b, dst, &idx, chan, comps, 8);
2293 return;
2294 }
2295
2296 case nir_op_b32csel: {
2297 if (sz != 16)
2298 break;
2299
2300 /* We allow vectorizing b32csel(cond, A, B) which can be
2301 * translated as MUX.v2i16, even though cond is a 32-bit vector.
2302 *
2303 * If the source condition vector is replicated, we can use
2304 * MUX.v2i16 directly, letting each component use the
2305 * corresponding half of the 32-bit source. NIR uses 0/~0
2306 * booleans so that's guaranteed to work (that is, 32-bit NIR
2307 * booleans are 16-bit replicated).
2308 *
2309 * If we're not replicated, we use the same trick but must
2310 * insert a MKVEC.v2i16 first to convert down to 16-bit.
2311 */
2312 bi_index idx = bi_src_index(&instr->src[0].src);
2313 bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2314 bi_index s1 = bi_alu_src_index(b, instr->src[1], comps);
2315 bi_index s2 = bi_alu_src_index(b, instr->src[2], comps);
2316
2317 if (!bi_nir_is_replicated(&instr->src[0])) {
2318 s0 = bi_mkvec_v2i16(
2319 b, bi_half(s0, false),
2320 bi_half(bi_extract(b, idx, instr->src[0].swizzle[1]), false));
2321 }
2322
2323 bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2324 return;
2325 }
2326
2327 default:
2328 break;
2329 }
2330
2331 bi_index s0 =
2332 srcs > 0 ? bi_alu_src_index(b, instr->src[0], comps) : bi_null();
2333 bi_index s1 =
2334 srcs > 1 ? bi_alu_src_index(b, instr->src[1], comps) : bi_null();
2335 bi_index s2 =
2336 srcs > 2 ? bi_alu_src_index(b, instr->src[2], comps) : bi_null();
2337
2338 switch (instr->op) {
2339 case nir_op_ffma:
2340 bi_fma_to(b, sz, dst, s0, s1, s2);
2341 break;
2342
2343 case nir_op_fmul:
2344 bi_fma_to(b, sz, dst, s0, s1, bi_negzero());
2345 break;
2346
2347 case nir_op_fadd:
2348 bi_fadd_to(b, sz, dst, s0, s1);
2349 break;
2350
2351 case nir_op_fsat: {
2352 bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2353 I->clamp = BI_CLAMP_CLAMP_0_1;
2354 break;
2355 }
2356
2357 case nir_op_fsat_signed_mali: {
2358 bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2359 I->clamp = BI_CLAMP_CLAMP_M1_1;
2360 break;
2361 }
2362
2363 case nir_op_fclamp_pos_mali: {
2364 bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2365 I->clamp = BI_CLAMP_CLAMP_0_INF;
2366 break;
2367 }
2368
2369 case nir_op_fneg:
2370 bi_fabsneg_to(b, sz, dst, bi_neg(s0));
2371 break;
2372
2373 case nir_op_fabs:
2374 bi_fabsneg_to(b, sz, dst, bi_abs(s0));
2375 break;
2376
2377 case nir_op_fsin:
2378 bi_lower_fsincos_32(b, dst, s0, false);
2379 break;
2380
2381 case nir_op_fcos:
2382 bi_lower_fsincos_32(b, dst, s0, true);
2383 break;
2384
2385 case nir_op_fexp2:
2386 assert(sz == 32); /* should've been lowered */
2387
2388 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2389 bi_lower_fexp2_32(b, dst, s0);
2390 else
2391 bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f));
2392
2393 break;
2394
2395 case nir_op_flog2:
2396 assert(sz == 32); /* should've been lowered */
2397
2398 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2399 bi_lower_flog2_32(b, dst, s0);
2400 else
2401 bi_flog2_32(b, dst, s0);
2402
2403 break;
2404
2405 case nir_op_fpow:
2406 assert(sz == 32); /* should've been lowered */
2407
2408 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2409 bi_lower_fpow_32(b, dst, s0, s1);
2410 else
2411 bi_fpow_32(b, dst, s0, s1);
2412
2413 break;
2414
2415 case nir_op_frexp_exp:
2416 bi_frexpe_to(b, sz, dst, s0, false, false);
2417 break;
2418
2419 case nir_op_frexp_sig:
2420 bi_frexpm_to(b, sz, dst, s0, false, false);
2421 break;
2422
2423 case nir_op_ldexp:
2424 bi_ldexp_to(b, sz, dst, s0, s1);
2425 break;
2426
2427 case nir_op_b8csel:
2428 bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2429 break;
2430
2431 case nir_op_b16csel:
2432 bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2433 break;
2434
2435 case nir_op_b32csel:
2436 bi_mux_i32_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2437 break;
2438
2439 case nir_op_extract_u8:
2440 case nir_op_extract_i8: {
2441 assert(comps == 1 && "should be scalarized");
2442 assert((src_sz == 16 || src_sz == 32) && "should be lowered");
2443 unsigned byte = nir_alu_src_as_uint(instr->src[1]);
2444
2445 if (s0.swizzle == BI_SWIZZLE_H11) {
2446 assert(byte < 2);
2447 byte += 2;
2448 } else if (s0.swizzle != BI_SWIZZLE_H01) {
2449 assert(s0.swizzle == BI_SWIZZLE_H00);
2450 }
2451
2452 assert(byte < 4);
2453
2454 s0.swizzle = BI_SWIZZLE_H01;
2455
2456 if (instr->op == nir_op_extract_i8)
2457 bi_s8_to_s32_to(b, dst, bi_byte(s0, byte));
2458 else
2459 bi_u8_to_u32_to(b, dst, bi_byte(s0, byte));
2460 break;
2461 }
2462
2463 case nir_op_extract_u16:
2464 case nir_op_extract_i16: {
2465 assert(comps == 1 && "should be scalarized");
2466 assert(src_sz == 32 && "should be lowered");
2467 unsigned half = nir_alu_src_as_uint(instr->src[1]);
2468 assert(half == 0 || half == 1);
2469
2470 if (instr->op == nir_op_extract_i16)
2471 bi_s16_to_s32_to(b, dst, bi_half(s0, half));
2472 else
2473 bi_u16_to_u32_to(b, dst, bi_half(s0, half));
2474 break;
2475 }
2476
2477 case nir_op_insert_u16: {
2478 assert(comps == 1 && "should be scalarized");
2479 unsigned half = nir_alu_src_as_uint(instr->src[1]);
2480 assert(half == 0 || half == 1);
2481
2482 if (half == 0)
2483 bi_u16_to_u32_to(b, dst, bi_half(s0, 0));
2484 else
2485 bi_mkvec_v2i16_to(b, dst, bi_imm_u16(0), bi_half(s0, 0));
2486 break;
2487 }
2488
2489 case nir_op_ishl:
2490 bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
2491 break;
2492 case nir_op_ushr:
2493 bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), false);
2494 break;
2495
2496 case nir_op_ishr:
2497 if (b->shader->arch >= 9)
2498 bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), true);
2499 else
2500 bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0));
2501 break;
2502
2503 case nir_op_imin:
2504 case nir_op_umin:
2505 bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, s0, s1, s0,
2506 s1, BI_CMPF_LT);
2507 break;
2508
2509 case nir_op_imax:
2510 case nir_op_umax:
2511 bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, s0, s1, s0,
2512 s1, BI_CMPF_GT);
2513 break;
2514
2515 case nir_op_fddx_must_abs_mali:
2516 case nir_op_fddy_must_abs_mali: {
2517 bi_index bit = bi_imm_u32(instr->op == nir_op_fddx_must_abs_mali ? 1 : 2);
2518 bi_index adjacent = bi_clper_xor(b, s0, bit);
2519 bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0));
2520 break;
2521 }
2522
2523 case nir_op_fddx:
2524 case nir_op_fddy:
2525 case nir_op_fddx_coarse:
2526 case nir_op_fddy_coarse:
2527 case nir_op_fddx_fine:
2528 case nir_op_fddy_fine: {
2529 unsigned axis;
2530 switch (instr->op) {
2531 case nir_op_fddx:
2532 case nir_op_fddx_coarse:
2533 case nir_op_fddx_fine:
2534 axis = 1;
2535 break;
2536 case nir_op_fddy:
2537 case nir_op_fddy_coarse:
2538 case nir_op_fddy_fine:
2539 axis = 2;
2540 break;
2541 default:
2542 unreachable("Invalid derivative op");
2543 }
2544
2545 bi_index lane1, lane2;
2546 switch (instr->op) {
2547 case nir_op_fddx:
2548 case nir_op_fddx_fine:
2549 case nir_op_fddy:
2550 case nir_op_fddy_fine:
2551 lane1 = bi_lshift_and_i32(b, bi_fau(BIR_FAU_LANE_ID, false),
2552 bi_imm_u32(0x3 & ~axis), bi_imm_u8(0));
2553
2554 lane2 = bi_iadd_u32(b, lane1, bi_imm_u32(axis), false);
2555 break;
2556 case nir_op_fddx_coarse:
2557 case nir_op_fddy_coarse:
2558 lane1 = bi_imm_u32(0);
2559 lane2 = bi_imm_u32(axis);
2560 break;
2561 default:
2562 unreachable("Invalid derivative op");
2563 }
2564
2565 bi_index left, right;
2566
2567 if (b->shader->quirks & BIFROST_LIMITED_CLPER) {
2568 left = bi_clper_old_i32(b, s0, lane1);
2569 right = bi_clper_old_i32(b, s0, lane2);
2570 } else {
2571 left = bi_clper_i32(b, s0, lane1, BI_INACTIVE_RESULT_ZERO,
2572 BI_LANE_OP_NONE, BI_SUBGROUP_SUBGROUP4);
2573
2574 right = bi_clper_i32(b, s0, lane2, BI_INACTIVE_RESULT_ZERO,
2575 BI_LANE_OP_NONE, BI_SUBGROUP_SUBGROUP4);
2576 }
2577
2578 bi_fadd_to(b, sz, dst, right, bi_neg(left));
2579 break;
2580 }
2581
2582 case nir_op_f2f32:
2583 bi_f16_to_f32_to(b, dst, s0);
2584 break;
2585
2586 case nir_op_fquantize2f16: {
2587 bi_instr *f16 = bi_v2f32_to_v2f16_to(b, bi_temp(b->shader), s0, s0);
2588 bi_instr *f32 = bi_f16_to_f32_to(b, dst, bi_half(f16->dest[0], false));
2589
2590 f16->ftz = f32->ftz = true;
2591 break;
2592 }
2593
2594 case nir_op_f2i32:
2595 if (src_sz == 32)
2596 bi_f32_to_s32_to(b, dst, s0);
2597 else
2598 bi_f16_to_s32_to(b, dst, s0);
2599 break;
2600
2601 /* Note 32-bit sources => no vectorization, so 32-bit works */
2602 case nir_op_f2u16:
2603 if (src_sz == 32)
2604 bi_f32_to_u32_to(b, dst, s0);
2605 else
2606 bi_v2f16_to_v2u16_to(b, dst, s0);
2607 break;
2608
2609 case nir_op_f2i16:
2610 if (src_sz == 32)
2611 bi_f32_to_s32_to(b, dst, s0);
2612 else
2613 bi_v2f16_to_v2s16_to(b, dst, s0);
2614 break;
2615
2616 case nir_op_f2u32:
2617 if (src_sz == 32)
2618 bi_f32_to_u32_to(b, dst, s0);
2619 else
2620 bi_f16_to_u32_to(b, dst, s0);
2621 break;
2622
2623 case nir_op_u2f16:
2624 if (src_sz == 32)
2625 bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false));
2626 else if (src_sz == 16)
2627 bi_v2u16_to_v2f16_to(b, dst, s0);
2628 else if (src_sz == 8)
2629 bi_v2u8_to_v2f16_to(b, dst, s0);
2630 break;
2631
2632 case nir_op_u2f32:
2633 if (src_sz == 32)
2634 bi_u32_to_f32_to(b, dst, s0);
2635 else if (src_sz == 16)
2636 bi_u16_to_f32_to(b, dst, s0);
2637 else
2638 bi_u8_to_f32_to(b, dst, s0);
2639 break;
2640
2641 case nir_op_i2f16:
2642 if (src_sz == 32)
2643 bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false));
2644 else if (src_sz == 16)
2645 bi_v2s16_to_v2f16_to(b, dst, s0);
2646 else if (src_sz == 8)
2647 bi_v2s8_to_v2f16_to(b, dst, s0);
2648 break;
2649
2650 case nir_op_i2f32:
2651 assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
2652
2653 if (src_sz == 32)
2654 bi_s32_to_f32_to(b, dst, s0);
2655 else if (src_sz == 16)
2656 bi_s16_to_f32_to(b, dst, s0);
2657 else if (src_sz == 8)
2658 bi_s8_to_f32_to(b, dst, s0);
2659 break;
2660
2661 case nir_op_i2i32:
2662 assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
2663
2664 if (src_sz == 32)
2665 bi_mov_i32_to(b, dst, s0);
2666 else if (src_sz == 16)
2667 bi_s16_to_s32_to(b, dst, s0);
2668 else if (src_sz == 8)
2669 bi_s8_to_s32_to(b, dst, s0);
2670 break;
2671
2672 case nir_op_u2u32:
2673 assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
2674
2675 if (src_sz == 32)
2676 bi_mov_i32_to(b, dst, s0);
2677 else if (src_sz == 16)
2678 bi_u16_to_u32_to(b, dst, s0);
2679 else if (src_sz == 8)
2680 bi_u8_to_u32_to(b, dst, s0);
2681
2682 break;
2683
2684 case nir_op_i2i16:
2685 assert(src_sz == 8 || src_sz == 32);
2686
2687 if (src_sz == 8)
2688 bi_v2s8_to_v2s16_to(b, dst, s0);
2689 else
2690 bi_mov_i32_to(b, dst, s0);
2691 break;
2692
2693 case nir_op_u2u16:
2694 assert(src_sz == 8 || src_sz == 32);
2695
2696 if (src_sz == 8)
2697 bi_v2u8_to_v2u16_to(b, dst, s0);
2698 else
2699 bi_mov_i32_to(b, dst, s0);
2700 break;
2701
2702 case nir_op_b2i8:
2703 case nir_op_b2i16:
2704 case nir_op_b2i32:
2705 bi_mux_to(b, sz, dst, bi_imm_u8(0), bi_imm_uintN(1, sz), s0,
2706 BI_MUX_INT_ZERO);
2707 break;
2708
2709 case nir_op_ieq8:
2710 case nir_op_ine8:
2711 case nir_op_ilt8:
2712 case nir_op_ige8:
2713 case nir_op_ieq16:
2714 case nir_op_ine16:
2715 case nir_op_ilt16:
2716 case nir_op_ige16:
2717 case nir_op_ieq32:
2718 case nir_op_ine32:
2719 case nir_op_ilt32:
2720 case nir_op_ige32:
2721 bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, bi_translate_cmpf(instr->op),
2722 BI_RESULT_TYPE_M1);
2723 break;
2724
2725 case nir_op_ult8:
2726 case nir_op_uge8:
2727 case nir_op_ult16:
2728 case nir_op_uge16:
2729 case nir_op_ult32:
2730 case nir_op_uge32:
2731 bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1,
2732 bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1);
2733 break;
2734
2735 case nir_op_feq32:
2736 case nir_op_feq16:
2737 case nir_op_flt32:
2738 case nir_op_flt16:
2739 case nir_op_fge32:
2740 case nir_op_fge16:
2741 case nir_op_fneu32:
2742 case nir_op_fneu16:
2743 bi_fcmp_to(b, sz, dst, s0, s1, bi_translate_cmpf(instr->op),
2744 BI_RESULT_TYPE_M1);
2745 break;
2746
2747 case nir_op_fround_even:
2748 case nir_op_fceil:
2749 case nir_op_ffloor:
2750 case nir_op_ftrunc:
2751 bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op));
2752 break;
2753
2754 case nir_op_fmin:
2755 bi_fmin_to(b, sz, dst, s0, s1);
2756 break;
2757
2758 case nir_op_fmax:
2759 bi_fmax_to(b, sz, dst, s0, s1);
2760 break;
2761
2762 case nir_op_iadd:
2763 bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false);
2764 break;
2765
2766 case nir_op_iadd_sat:
2767 bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true);
2768 break;
2769
2770 case nir_op_uadd_sat:
2771 bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true);
2772 break;
2773
2774 case nir_op_ihadd:
2775 bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN);
2776 break;
2777
2778 case nir_op_irhadd:
2779 bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP);
2780 break;
2781
2782 case nir_op_uhadd:
2783 bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTN);
2784 break;
2785
2786 case nir_op_urhadd:
2787 bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTP);
2788 break;
2789
2790 case nir_op_ineg:
2791 bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false);
2792 break;
2793
2794 case nir_op_isub:
2795 bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false);
2796 break;
2797
2798 case nir_op_isub_sat:
2799 bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true);
2800 break;
2801
2802 case nir_op_usub_sat:
2803 bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true);
2804 break;
2805
2806 case nir_op_imul:
2807 bi_imul_to(b, sz, dst, s0, s1);
2808 break;
2809
2810 case nir_op_iabs:
2811 bi_iabs_to(b, sz, dst, s0);
2812 break;
2813
2814 case nir_op_iand:
2815 bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));
2816 break;
2817
2818 case nir_op_ior:
2819 bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));
2820 break;
2821
2822 case nir_op_ixor:
2823 bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));
2824 break;
2825
2826 case nir_op_inot:
2827 bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));
2828 break;
2829
2830 case nir_op_frsq:
2831 if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2832 bi_lower_frsq_32(b, dst, s0);
2833 else
2834 bi_frsq_to(b, sz, dst, s0);
2835 break;
2836
2837 case nir_op_frcp:
2838 if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2839 bi_lower_frcp_32(b, dst, s0);
2840 else
2841 bi_frcp_to(b, sz, dst, s0);
2842 break;
2843
2844 case nir_op_uclz:
2845 bi_clz_to(b, sz, dst, s0, false);
2846 break;
2847
2848 case nir_op_bit_count:
2849 assert(sz == 32 && src_sz == 32 && "should've been lowered");
2850 bi_popcount_i32_to(b, dst, s0);
2851 break;
2852
2853 case nir_op_bitfield_reverse:
2854 assert(sz == 32 && src_sz == 32 && "should've been lowered");
2855 bi_bitrev_i32_to(b, dst, s0);
2856 break;
2857
2858 case nir_op_ufind_msb: {
2859 bi_index clz = bi_clz(b, src_sz, s0, false);
2860
2861 if (sz == 8)
2862 clz = bi_byte(clz, 0);
2863 else if (sz == 16)
2864 clz = bi_half(clz, false);
2865
2866 bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false);
2867 break;
2868 }
2869
2870 default:
2871 fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
2872 unreachable("Unknown ALU op");
2873 }
2874 }
2875
2876 /* Returns the texture dimension, with 0 special-casing cubemaps. Shamelessly
2877 * copied from Midgard */
2878 static unsigned
2879 bifrost_tex_format(enum glsl_sampler_dim dim)
2880 {
2881 switch (dim) {
2882 case GLSL_SAMPLER_DIM_1D:
2883 case GLSL_SAMPLER_DIM_BUF:
2884 return 1;
2885
2886 case GLSL_SAMPLER_DIM_2D:
2887 case GLSL_SAMPLER_DIM_MS:
2888 case GLSL_SAMPLER_DIM_EXTERNAL:
2889 case GLSL_SAMPLER_DIM_RECT:
2890 return 2;
2891
2892 case GLSL_SAMPLER_DIM_3D:
2893 return 3;
2894
2895 case GLSL_SAMPLER_DIM_CUBE:
2896 return 0;
2897
2898 default:
2899 DBG("Unknown sampler dim type\n");
2900 assert(0);
2901 return 0;
2902 }
2903 }
2904
2905 static enum bi_dimension
2906 valhall_tex_dimension(enum glsl_sampler_dim dim)
2907 {
2908 switch (dim) {
2909 case GLSL_SAMPLER_DIM_1D:
2910 case GLSL_SAMPLER_DIM_BUF:
2911 return BI_DIMENSION_1D;
2912
2913 case GLSL_SAMPLER_DIM_2D:
2914 case GLSL_SAMPLER_DIM_MS:
2915 case GLSL_SAMPLER_DIM_EXTERNAL:
2916 case GLSL_SAMPLER_DIM_RECT:
2917 return BI_DIMENSION_2D;
2918
2919 case GLSL_SAMPLER_DIM_3D:
2920 return BI_DIMENSION_3D;
2921
2922 case GLSL_SAMPLER_DIM_CUBE:
2923 return BI_DIMENSION_CUBE;
2924
2925 default:
2926 unreachable("Unknown sampler dim type");
2927 }
2928 }
2929
2930 static enum bifrost_texture_format_full
2931 bi_texture_format(nir_alu_type T, enum bi_clamp clamp)
2932 {
2933 switch (T) {
2934 case nir_type_float16:
2935 return BIFROST_TEXTURE_FORMAT_F16 + clamp;
2936 case nir_type_float32:
2937 return BIFROST_TEXTURE_FORMAT_F32 + clamp;
2938 case nir_type_uint16:
2939 return BIFROST_TEXTURE_FORMAT_U16;
2940 case nir_type_int16:
2941 return BIFROST_TEXTURE_FORMAT_S16;
2942 case nir_type_uint32:
2943 return BIFROST_TEXTURE_FORMAT_U32;
2944 case nir_type_int32:
2945 return BIFROST_TEXTURE_FORMAT_S32;
2946 default:
2947 unreachable("Invalid type for texturing");
2948 }
2949 }
2950
2951 /* Array indices are specified as 32-bit uints and need to be converted. They
2952 * arrive in the .z component from NIR */
2953 static bi_index
2954 bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T)
2955 {
2956 /* For (u)int we can just passthrough */
2957 nir_alu_type base = nir_alu_type_get_base_type(T);
2958 if (base == nir_type_int || base == nir_type_uint)
2959 return idx;
2960
2961 /* Otherwise we convert */
2962 assert(T == nir_type_float32);
2963
2964 /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and
2965 * Texel Selection") defines the layer to be taken from clamp(RNE(r),
2966 * 0, dt - 1). So we use round RTE, clamping is handled at the data
2967 * structure level */
2968
2969 bi_instr *I = bi_f32_to_u32_to(b, bi_temp(b->shader), idx);
2970 I->round = BI_ROUND_NONE;
2971 return I->dest[0];
2972 }
2973
2974 /* TEXC's explicit and bias LOD modes require the LOD to be transformed to a
2975 * 16-bit 8:8 fixed-point format. We lower as:
2976 *
2977 * F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF =
2978 * MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0)
2979 */
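
/* For example, lod = 2.5 becomes clamp(2.5, -16, 16) * 256 = 640 = 0x0280,
 * i.e. 2.5 in 8:8 fixed point; the rewritten form computes the same value as
 * clamp(2.5 / 16, -1, 1) * (16 * 256) = 0.15625 * 4096 = 640. */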
2980
2981 static bi_index
2982 bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16)
2983 {
2984 /* Precompute for constant LODs to avoid general constant folding */
2985 if (lod.type == BI_INDEX_CONSTANT) {
2986 uint32_t raw = lod.value;
2987 float x = fp16 ? _mesa_half_to_float(raw) : uif(raw);
2988 int32_t s32 = CLAMP(x, -16.0f, 16.0f) * 256.0f;
2989 return bi_imm_u32(s32 & 0xFFFF);
2990 }
2991
2992 /* Sort of arbitrary. Must be less than 128.0, greater than or equal to
2993 * the max LOD (16 since we cap at 2^16 texture dimensions), and
2994 * preferably small to minimize precision loss */
2995 const float max_lod = 16.0;
2996
2997 bi_instr *fsat =
2998 bi_fma_f32_to(b, bi_temp(b->shader), fp16 ? bi_half(lod, false) : lod,
2999 bi_imm_f32(1.0f / max_lod), bi_negzero());
3000
3001 fsat->clamp = BI_CLAMP_CLAMP_M1_1;
3002
3003 bi_index fmul =
3004 bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f), bi_negzero());
3005
3006 return bi_mkvec_v2i16(b, bi_half(bi_f32_to_s32(b, fmul), false),
3007 bi_imm_u16(0));
3008 }
3009
3010 /* FETCH takes a 32-bit staging register containing the LOD as an integer in
3011 * the bottom 16-bits and (if present) the cube face index in the top 16-bits.
3012 * TODO: Cube face.
3013 */
3014
3015 static bi_index
3016 bi_emit_texc_lod_cube(bi_builder *b, bi_index lod)
3017 {
3018 return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8));
3019 }
3020
3021 /* The hardware specifies texel offsets and multisample indices together as a
3022 * u8vec4 <offset, ms index>. By default all are zero, so if we have either a
3023 * nonzero texel offset or a nonzero multisample index, we build a u8vec4 with
3024 * the bits we need and return that to be passed as a staging register. Else we
3025 * return 0 to avoid allocating a data register when everything is zero. */
3026
3027 static bi_index
3028 bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr)
3029 {
3030 bi_index dest = bi_zero();
3031
3032 int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
3033 if (offs_idx >= 0 && (!nir_src_is_const(instr->src[offs_idx].src) ||
3034 nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
3035 unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
3036 bi_index idx = bi_src_index(&instr->src[offs_idx].src);
3037 dest = bi_mkvec_v4i8(
3038 b, (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
3039 (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0),
3040 (nr > 2) ? bi_byte(bi_extract(b, idx, 2), 0) : bi_imm_u8(0),
3041 bi_imm_u8(0));
3042 }
3043
3044 int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
3045 if (ms_idx >= 0 && (!nir_src_is_const(instr->src[ms_idx].src) ||
3046 nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
3047 dest = bi_lshift_or_i32(b, bi_src_index(&instr->src[ms_idx].src), dest,
3048 bi_imm_u8(24));
3049 }
3050
3051 return dest;
3052 }
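/* As an illustration (assuming byte 0 of the MKVEC.v4i8 result is the least
 * significant byte): a texel offset of (1, -1, 0) with multisample index 2
 * packs to <0x01, 0xff, 0x00, 0x00> | (2 << 24) = 0x0200ff01. */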
3053
3054 /*
3055 * Valhall specifies texel offsets, multisample indices, and (for
3056 * fetches) LOD together as a u8vec4 <offset.xyz, LOD>, where the third
3057 * component is either offset.z or multisample index depending on context. Build
3058 * this register.
3059 */
3060 static bi_index
3061 bi_emit_valhall_offsets(bi_builder *b, nir_tex_instr *instr)
3062 {
3063 bi_index dest = bi_zero();
3064
3065 int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
3066 int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
3067 int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
3068
3069 /* Components 0-2: offsets */
3070 if (offs_idx >= 0 && (!nir_src_is_const(instr->src[offs_idx].src) ||
3071 nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
3072 unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
3073 bi_index idx = bi_src_index(&instr->src[offs_idx].src);
3074
3075 /* No multisample index with 3D */
3076 assert((nr <= 2) || (ms_idx < 0));
3077
3078 /* Zero extend the Z byte so we can use it with MKVEC.v2i8 */
3079 bi_index z = (nr > 2)
3080 ? bi_mkvec_v2i8(b, bi_byte(bi_extract(b, idx, 2), 0),
3081 bi_imm_u8(0), bi_zero())
3082 : bi_zero();
3083
3084 dest = bi_mkvec_v2i8(
3085 b, (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
3086 (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0), z);
3087 }
3088
3089 /* Component 2: multisample index */
3090 if (ms_idx >= 0 && (!nir_src_is_const(instr->src[ms_idx].src) ||
3091 nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
3092 dest = bi_mkvec_v2i16(b, dest, bi_src_index(&instr->src[ms_idx].src));
3093 }
3094
3095 /* Component 3: 8-bit LOD */
3096 if (lod_idx >= 0 &&
3097 (!nir_src_is_const(instr->src[lod_idx].src) ||
3098 nir_src_as_uint(instr->src[lod_idx].src) != 0) &&
3099 nir_tex_instr_src_type(instr, lod_idx) != nir_type_float) {
3100 dest = bi_lshift_or_i32(b, bi_src_index(&instr->src[lod_idx].src), dest,
3101 bi_imm_u8(24));
3102 }
3103
3104 return dest;
3105 }
3106
3107 static void
3108 bi_emit_cube_coord(bi_builder *b, bi_index coord, bi_index *face, bi_index *s,
3109 bi_index *t)
3110 {
3111 /* Compute max { |x|, |y|, |z| } */
3112 bi_index maxxyz = bi_temp(b->shader);
3113 *face = bi_temp(b->shader);
3114
3115 bi_index cx = bi_extract(b, coord, 0), cy = bi_extract(b, coord, 1),
3116 cz = bi_extract(b, coord, 2);
3117
3118 /* Use a pseudo op on Bifrost due to tuple restrictions */
3119 if (b->shader->arch <= 8) {
3120 bi_cubeface_to(b, maxxyz, *face, cx, cy, cz);
3121 } else {
3122 bi_cubeface1_to(b, maxxyz, cx, cy, cz);
3123 bi_cubeface2_v9_to(b, *face, cx, cy, cz);
3124 }
3125
3126 /* Select coordinates */
3127 bi_index ssel =
3128 bi_cube_ssel(b, bi_extract(b, coord, 2), bi_extract(b, coord, 0), *face);
3129 bi_index tsel =
3130 bi_cube_tsel(b, bi_extract(b, coord, 1), bi_extract(b, coord, 2), *face);
3131
3132 /* The OpenGL ES specification requires us to transform an input vector
3133 * (x, y, z) to the coordinate, given the selected S/T:
3134 *
3135 * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1))
3136 *
3137 * We implement (s shown, t similar) in a form friendlier to FMA
3138 * instructions, and clamp coordinates at the end for correct
3139 * NaN/infinity handling:
3140 *
3141 * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5)
3142 *
3143 * Take the reciprocal of max{x, y, z}
3144 */
3145 bi_index rcp = bi_frcp_f32(b, maxxyz);
3146
3147 /* Calculate 0.5 * (1.0 / max{x, y, z}) */
3148 bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero());
3149
3150 /* Transform the coordinates */
3151 *s = bi_temp(b->shader);
3152 *t = bi_temp(b->shader);
3153
3154 bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f));
3155 bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f));
3156
3157 S->clamp = BI_CLAMP_CLAMP_0_1;
3158 T->clamp = BI_CLAMP_CLAMP_0_1;
3159 }
3160
3161 /* Emits a cube map descriptor, returning the lower 32-bits and putting the
3162 * upper 32-bits in the passed pointer t. Packing the face with the S
3163 * coordinate exploits the redundancy of the floating-point encoding together
3164 * with the range restriction of the CUBEFACE output.
3165 *
3166 * struct cube_map_descriptor {
3167 * float s : 29;
3168 * unsigned face : 3;
3169 * float t : 32;
3170 * }
3171 *
3172 * Since the cube face index is preshifted, this is easy to pack with a bitwise
3173 * MUX.i32 and a fixed mask, selecting the lower 29 bits from s and the upper 3
3174 * bits from face.
3175 */
3176
3177 static bi_index
3178 bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t)
3179 {
3180 bi_index face, s;
3181 bi_emit_cube_coord(b, coord, &face, &s, t);
3182 bi_index mask = bi_imm_u32(BITFIELD_MASK(29));
3183 return bi_mux_i32(b, s, face, mask, BI_MUX_BIT);
3184 }
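/* Per the layout described above, the returned word is therefore
 * (s & 0x1fffffff) | (face & 0xe0000000): bits 0-28 come from the S
 * coordinate and bits 29-31 from the preshifted face index. */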
3185
3186 /* Map to the main texture op used. Some of these (txd in particular) will
3187 * lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in
3188 * sequence). We assume that lowering is handled elsewhere.
3189 */
3190
3191 static enum bifrost_tex_op
3192 bi_tex_op(nir_texop op)
3193 {
3194 switch (op) {
3195 case nir_texop_tex:
3196 case nir_texop_txb:
3197 case nir_texop_txl:
3198 case nir_texop_txd:
3199 return BIFROST_TEX_OP_TEX;
3200 case nir_texop_txf:
3201 case nir_texop_txf_ms:
3202 case nir_texop_tg4:
3203 return BIFROST_TEX_OP_FETCH;
3204 case nir_texop_txs:
3205 case nir_texop_lod:
3206 case nir_texop_query_levels:
3207 case nir_texop_texture_samples:
3208 case nir_texop_samples_identical:
3209 unreachable("should've been lowered");
3210 default:
3211 unreachable("unsupported tex op");
3212 }
3213 }
3214
3215 /* Data registers required by texturing in the order they appear. All are
3216 * optional; the texture operation descriptor determines which are present.
3217 * Note that since 3D arrays are not permitted at the API level, Z_COORD and
3218 * ARRAY/SHADOW are exclusive, so TEXC in practice reads at most 8 registers */
3219
3220 enum bifrost_tex_dreg {
3221 BIFROST_TEX_DREG_Z_COORD = 0,
3222 BIFROST_TEX_DREG_Y_DELTAS = 1,
3223 BIFROST_TEX_DREG_LOD = 2,
3224 BIFROST_TEX_DREG_GRDESC_HI = 3,
3225 BIFROST_TEX_DREG_SHADOW = 4,
3226 BIFROST_TEX_DREG_ARRAY = 5,
3227 BIFROST_TEX_DREG_OFFSETMS = 6,
3228 BIFROST_TEX_DREG_SAMPLER = 7,
3229 BIFROST_TEX_DREG_TEXTURE = 8,
3230 BIFROST_TEX_DREG_COUNT,
3231 };
3232
3233 static void
3234 bi_emit_texc(bi_builder *b, nir_tex_instr *instr)
3235 {
3236 struct bifrost_texture_operation desc = {
3237 .op = bi_tex_op(instr->op),
3238 .offset_or_bias_disable = false, /* TODO */
3239 .shadow_or_clamp_disable = instr->is_shadow,
3240 .array = instr->is_array,
3241 .dimension = bifrost_tex_format(instr->sampler_dim),
3242 .format = bi_texture_format(instr->dest_type | instr->def.bit_size,
3243 BI_CLAMP_NONE), /* TODO */
3244 .mask = 0xF,
3245 };
3246
3247 switch (desc.op) {
3248 case BIFROST_TEX_OP_TEX:
3249 desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE;
3250 break;
3251 case BIFROST_TEX_OP_FETCH:
3252 desc.lod_or_fetch = (enum bifrost_lod_mode)(
3253 instr->op == nir_texop_tg4
3254 ? BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component
3255 : BIFROST_TEXTURE_FETCH_TEXEL);
3256 break;
3257 default:
3258 unreachable("texture op unsupported");
3259 }
3260
3261 /* 32-bit indices to be allocated as consecutive staging registers */
3262 bi_index dregs[BIFROST_TEX_DREG_COUNT] = {};
3263 bi_index cx = bi_null(), cy = bi_null();
3264
3265 for (unsigned i = 0; i < instr->num_srcs; ++i) {
3266 bi_index index = bi_src_index(&instr->src[i].src);
3267 unsigned sz = nir_src_bit_size(instr->src[i].src);
3268 unsigned components = nir_src_num_components(instr->src[i].src);
3269 ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i);
3270 nir_alu_type T = base | sz;
3271
3272 switch (instr->src[i].src_type) {
3273 case nir_tex_src_coord:
3274 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3275 cx = bi_emit_texc_cube_coord(b, index, &cy);
3276 } else {
3277 /* Copy XY (for 2D+) or XX (for 1D) */
3278 cx = bi_extract(b, index, 0);
3279 cy = bi_extract(b, index, MIN2(1, components - 1));
3280
3281 assert(components >= 1 && components <= 3);
3282
3283 if (components == 3 && !desc.array) {
3284 /* 3D */
3285 dregs[BIFROST_TEX_DREG_Z_COORD] = bi_extract(b, index, 2);
3286 }
3287 }
3288
3289 if (desc.array) {
3290 dregs[BIFROST_TEX_DREG_ARRAY] = bi_emit_texc_array_index(
3291 b, bi_extract(b, index, components - 1), T);
3292 }
3293
3294 break;
3295
3296 case nir_tex_src_lod:
3297 if (desc.op == BIFROST_TEX_OP_TEX &&
3298 nir_src_is_const(instr->src[i].src) &&
3299 nir_src_as_uint(instr->src[i].src) == 0) {
3300 desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO;
3301 } else if (desc.op == BIFROST_TEX_OP_TEX) {
3302 assert(base == nir_type_float);
3303
3304 assert(sz == 16 || sz == 32);
3305 dregs[BIFROST_TEX_DREG_LOD] =
3306 bi_emit_texc_lod_88(b, index, sz == 16);
3307 desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT;
3308 } else {
3309 assert(desc.op == BIFROST_TEX_OP_FETCH);
3310 assert(base == nir_type_uint || base == nir_type_int);
3311 assert(sz == 16 || sz == 32);
3312
3313 dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_cube(b, index);
3314 }
3315
3316 break;
3317
3318 case nir_tex_src_bias:
3319 /* Upper 16-bits interpreted as a clamp, leave zero */
3320 assert(desc.op == BIFROST_TEX_OP_TEX);
3321 assert(base == nir_type_float);
3322 assert(sz == 16 || sz == 32);
3323 dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16);
3324 desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS;
3325 break;
3326
3327 case nir_tex_src_ms_index:
3328 case nir_tex_src_offset:
3329 if (desc.offset_or_bias_disable)
3330 break;
3331
3332 dregs[BIFROST_TEX_DREG_OFFSETMS] =
3333 bi_emit_texc_offset_ms_index(b, instr);
3334 if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero()))
3335 desc.offset_or_bias_disable = true;
3336 break;
3337
3338 case nir_tex_src_comparator:
3339 dregs[BIFROST_TEX_DREG_SHADOW] = index;
3340 break;
3341
3342 case nir_tex_src_texture_offset:
3343 dregs[BIFROST_TEX_DREG_TEXTURE] = index;
3344 break;
3345
3346 case nir_tex_src_sampler_offset:
3347 dregs[BIFROST_TEX_DREG_SAMPLER] = index;
3348 break;
3349
3350 default:
3351 unreachable("Unhandled src type in texc emit");
3352 }
3353 }
3354
3355 if (desc.op == BIFROST_TEX_OP_FETCH &&
3356 bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) {
3357 dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_cube(b, bi_zero());
3358 }
3359
3360 /* Choose an index mode */
3361
3362 bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]);
3363 bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]);
3364 bool direct = direct_tex && direct_samp;
3365
3366 desc.immediate_indices = direct && (instr->sampler_index < 16);
3367
3368 if (desc.immediate_indices) {
3369 desc.sampler_index_or_mode = instr->sampler_index;
3370 desc.index = instr->texture_index;
3371 } else {
3372 unsigned mode = 0;
3373
3374 if (direct && instr->sampler_index == instr->texture_index) {
3375 mode = BIFROST_INDEX_IMMEDIATE_SHARED;
3376 desc.index = instr->texture_index;
3377 } else if (direct) {
3378 mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
3379 desc.index = instr->sampler_index;
3380 dregs[BIFROST_TEX_DREG_TEXTURE] =
3381 bi_mov_i32(b, bi_imm_u32(instr->texture_index));
3382 } else if (direct_tex) {
3383 assert(!direct_samp);
3384 mode = BIFROST_INDEX_IMMEDIATE_TEXTURE;
3385 desc.index = instr->texture_index;
3386 } else if (direct_samp) {
3387 assert(!direct_tex);
3388 mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
3389 desc.index = instr->sampler_index;
3390 } else {
3391 mode = BIFROST_INDEX_REGISTER;
3392 }
3393
3394 mode |= (BIFROST_TEXTURE_OPERATION_SINGLE << 2);
3395 desc.sampler_index_or_mode = mode;
3396 }
3397
3398 /* Allocate staging registers contiguously by compacting the array. */
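/* For example, a fetch that populates only the Z coordinate and LOD slots
 * ends up with exactly two non-null entries; after compaction they sit in
 * dregs[0] and dregs[1] and sr_count is 2, so the collect below builds a
 * two-register staging vector.
 */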
3399 unsigned sr_count = 0;
3400
3401 for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) {
3402 if (!bi_is_null(dregs[i]))
3403 dregs[sr_count++] = dregs[i];
3404 }
3405
3406 unsigned res_size = instr->def.bit_size == 16 ? 2 : 4;
3407
3408 bi_index sr = sr_count ? bi_temp(b->shader) : bi_null();
3409 bi_index dst = bi_temp(b->shader);
3410
3411 if (sr_count)
3412 bi_emit_collect_to(b, sr, dregs, sr_count);
3413
3414 uint32_t desc_u = 0;
3415 memcpy(&desc_u, &desc, sizeof(desc_u));
3416 bi_instr *I =
3417 bi_texc_to(b, dst, sr, cx, cy, bi_imm_u32(desc_u),
3418 !nir_tex_instr_has_implicit_derivative(instr), sr_count, 0);
3419 I->register_format = bi_reg_fmt_for_nir(instr->dest_type);
3420
3421 bi_index w[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
3422 bi_emit_split_i32(b, w, dst, res_size);
3423 bi_emit_collect_to(b, bi_def_index(&instr->def), w,
3424 DIV_ROUND_UP(instr->def.num_components * res_size, 4));
3425 }
3426
3427 /* Staging registers required by texturing in the order they appear (Valhall) */
3428
3429 enum valhall_tex_sreg {
3430 VALHALL_TEX_SREG_X_COORD = 0,
3431 VALHALL_TEX_SREG_Y_COORD = 1,
3432 VALHALL_TEX_SREG_Z_COORD = 2,
3433 VALHALL_TEX_SREG_Y_DELTAS = 3,
3434 VALHALL_TEX_SREG_ARRAY = 4,
3435 VALHALL_TEX_SREG_SHADOW = 5,
3436 VALHALL_TEX_SREG_OFFSETMS = 6,
3437 VALHALL_TEX_SREG_LOD = 7,
3438 VALHALL_TEX_SREG_GRDESC = 8,
3439 VALHALL_TEX_SREG_COUNT,
3440 };
3441
3442 static void
3443 bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr)
3444 {
3445 bool explicit_offset = false;
3446 enum bi_va_lod_mode lod_mode = BI_VA_LOD_MODE_COMPUTED_LOD;
3447
3448 bool has_lod_mode = (instr->op == nir_texop_tex) ||
3449 (instr->op == nir_texop_txl) ||
3450 (instr->op == nir_texop_txb);
3451
3452 /* 32-bit indices to be allocated as consecutive staging registers */
3453 bi_index sregs[VALHALL_TEX_SREG_COUNT] = {};
3454 bi_index sampler = bi_imm_u32(instr->sampler_index);
3455 bi_index texture = bi_imm_u32(instr->texture_index);
3456 uint32_t tables = (PAN_TABLE_SAMPLER << 11) | (PAN_TABLE_TEXTURE << 27);
3457
3458 for (unsigned i = 0; i < instr->num_srcs; ++i) {
3459 bi_index index = bi_src_index(&instr->src[i].src);
3460 unsigned sz = nir_src_bit_size(instr->src[i].src);
3461 unsigned components = nir_src_num_components(instr->src[i].src);
3462
3463 switch (instr->src[i].src_type) {
3464 case nir_tex_src_coord:
3465 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3466 sregs[VALHALL_TEX_SREG_X_COORD] = bi_emit_texc_cube_coord(
3467 b, index, &sregs[VALHALL_TEX_SREG_Y_COORD]);
3468 } else {
3469 assert(components >= 1 && components <= 3);
3470
3471 /* Copy XY (for 2D+) or XX (for 1D) */
3472 sregs[VALHALL_TEX_SREG_X_COORD] = index;
3473
3474 if (components >= 2)
3475 sregs[VALHALL_TEX_SREG_Y_COORD] = bi_extract(b, index, 1);
3476
3477 if (components == 3 && !instr->is_array) {
3478 sregs[VALHALL_TEX_SREG_Z_COORD] = bi_extract(b, index, 2);
3479 }
3480 }
3481
3482 if (instr->is_array) {
3483 sregs[VALHALL_TEX_SREG_ARRAY] =
3484 bi_extract(b, index, components - 1);
3485 }
3486
3487 break;
3488
3489 case nir_tex_src_lod:
3490 if (nir_src_is_const(instr->src[i].src) &&
3491 nir_src_as_uint(instr->src[i].src) == 0) {
3492 lod_mode = BI_VA_LOD_MODE_ZERO_LOD;
3493 } else if (has_lod_mode) {
3494 lod_mode = BI_VA_LOD_MODE_EXPLICIT;
3495
3496 assert(sz == 16 || sz == 32);
3497 sregs[VALHALL_TEX_SREG_LOD] =
3498 bi_emit_texc_lod_88(b, index, sz == 16);
3499 }
3500 break;
3501
3502 case nir_tex_src_bias:
3503 /* Upper 16-bits interpreted as a clamp, leave zero */
3504 assert(sz == 16 || sz == 32);
3505 sregs[VALHALL_TEX_SREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16);
3506
3507 lod_mode = BI_VA_LOD_MODE_COMPUTED_BIAS;
3508 break;
3509 case nir_tex_src_ms_index:
3510 case nir_tex_src_offset:
3511 /* Handled below */
3512 break;
3513
3514 case nir_tex_src_comparator:
3515 sregs[VALHALL_TEX_SREG_SHADOW] = index;
3516 break;
3517
3518 case nir_tex_src_texture_offset:
3519 texture = index;
3520 break;
3521
3522 case nir_tex_src_sampler_offset:
3523 sampler = index;
3524 break;
3525
3526 default:
3527 unreachable("Unhandled src type in tex emit");
3528 }
3529 }
3530
3531 /* Generate packed offset + ms index + LOD register. These default to
3532 * zero so we only need to encode if these features are actually in use.
3533 */
3534 bi_index offsets = bi_emit_valhall_offsets(b, instr);
3535
3536 if (!bi_is_equiv(offsets, bi_zero())) {
3537 sregs[VALHALL_TEX_SREG_OFFSETMS] = offsets;
3538 explicit_offset = true;
3539 }
3540
3541 /* Allocate staging registers contiguously by compacting the array. */
3542 unsigned sr_count = 0;
3543
3544 for (unsigned i = 0; i < ARRAY_SIZE(sregs); ++i) {
3545 if (!bi_is_null(sregs[i]))
3546 sregs[sr_count++] = sregs[i];
3547 }
3548
3549 bi_index idx = sr_count ? bi_temp(b->shader) : bi_null();
3550
3551 if (sr_count)
3552 bi_make_vec_to(b, idx, sregs, NULL, sr_count, 32);
3553
3554 bi_index image_src = bi_imm_u32(tables);
3555 image_src = bi_lshift_or_i32(b, sampler, image_src, bi_imm_u8(0));
3556 image_src = bi_lshift_or_i32(b, texture, image_src, bi_imm_u8(16));
3557
3558 /* Only write the components that we actually read */
3559 unsigned mask = nir_def_components_read(&instr->def);
3560 unsigned comps_per_reg = instr->def.bit_size == 16 ? 2 : 1;
3561 unsigned res_size = DIV_ROUND_UP(util_bitcount(mask), comps_per_reg);
3562
3563 enum bi_register_format regfmt = bi_reg_fmt_for_nir(instr->dest_type);
3564 enum bi_dimension dim = valhall_tex_dimension(instr->sampler_dim);
3565 bi_index dest = bi_temp(b->shader);
3566
3567 switch (instr->op) {
3568 case nir_texop_tex:
3569 case nir_texop_txl:
3570 case nir_texop_txb:
3571 bi_tex_single_to(b, dest, idx, image_src, bi_zero(), instr->is_array, dim,
3572 regfmt, instr->is_shadow, explicit_offset, lod_mode,
3573 mask, sr_count);
3574 break;
3575 case nir_texop_txf:
3576 case nir_texop_txf_ms:
3577 bi_tex_fetch_to(b, dest, idx, image_src, bi_zero(), instr->is_array, dim,
3578 regfmt, explicit_offset, mask, sr_count);
3579 break;
3580 case nir_texop_tg4:
3581 bi_tex_gather_to(b, dest, idx, image_src, bi_zero(), instr->is_array, dim,
3582 instr->component, false, regfmt, instr->is_shadow,
3583 explicit_offset, mask, sr_count);
3584 break;
3585 default:
3586 unreachable("Unhandled Valhall texture op");
3587 }
3588
3589 /* The hardware will write only what we read, and it will write into
3590 * contiguous registers without gaps (different from Bifrost). NIR
3591 * expects the gaps, so fill in the holes (they'll be copypropped and
3592 * DCE'd away later).
3593 */
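/* For example, if only components X and W of a vec4 result are read
 * (mask = 0b1001), the hardware writes two registers; the loop below then
 * produces vec4(dest[0], 0, 0, dest[1]) so NIR sees the layout it expects.
 */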
3594 bi_index unpacked[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
3595
3596 bi_emit_cached_split_i32(b, dest, res_size);
3597
3598 /* Index into the packed component array */
3599 unsigned j = 0;
3600 unsigned comps[4] = {0};
3601 unsigned nr_components = instr->def.num_components;
3602
3603 for (unsigned i = 0; i < nr_components; ++i) {
3604 if (mask & BITFIELD_BIT(i)) {
3605 unpacked[i] = dest;
3606 comps[i] = j++;
3607 } else {
3608 unpacked[i] = bi_zero();
3609 }
3610 }
3611
3612 bi_make_vec_to(b, bi_def_index(&instr->def), unpacked, comps,
3613 instr->def.num_components, instr->def.bit_size);
3614 }
3615
3616 /* Simple texture ops correspond to NIR tex or txl with LOD = 0 on 2D/cube
3617 * textures with sufficiently small immediate indices. Anything else
3618 * needs a complete texture op. */
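/* For example, an ordinary texture2D() sample from texture/sampler index 0
 * qualifies and becomes a single TEXS_2D, whereas a shadow sampler, an array
 * texture, or a non-zero explicit LOD falls through to the full TEXC path.
 */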
3619
3620 static void
3621 bi_emit_texs(bi_builder *b, nir_tex_instr *instr)
3622 {
3623 int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
3624 assert(coord_idx >= 0);
3625 bi_index coords = bi_src_index(&instr->src[coord_idx].src);
3626
3627 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3628 bi_index face, s, t;
3629 bi_emit_cube_coord(b, coords, &face, &s, &t);
3630
3631 bi_texs_cube_to(b, instr->def.bit_size, bi_def_index(&instr->def), s, t,
3632 face, instr->sampler_index, instr->texture_index);
3633 } else {
3634 bi_texs_2d_to(b, instr->def.bit_size, bi_def_index(&instr->def),
3635 bi_extract(b, coords, 0), bi_extract(b, coords, 1),
3636 instr->op != nir_texop_tex, /* zero LOD */
3637 instr->sampler_index, instr->texture_index);
3638 }
3639
3640 bi_split_def(b, &instr->def);
3641 }
3642
3643 static bool
3644 bi_is_simple_tex(nir_tex_instr *instr)
3645 {
3646 if (instr->op != nir_texop_tex && instr->op != nir_texop_txl)
3647 return false;
3648
3649 if (instr->dest_type != nir_type_float32 &&
3650 instr->dest_type != nir_type_float16)
3651 return false;
3652
3653 if (instr->is_shadow || instr->is_array)
3654 return false;
3655
3656 switch (instr->sampler_dim) {
3657 case GLSL_SAMPLER_DIM_2D:
3658 case GLSL_SAMPLER_DIM_EXTERNAL:
3659 case GLSL_SAMPLER_DIM_RECT:
3660 break;
3661
3662 case GLSL_SAMPLER_DIM_CUBE:
3663 /* LOD can't be specified with TEXS_CUBE */
3664 if (instr->op == nir_texop_txl)
3665 return false;
3666 break;
3667
3668 default:
3669 return false;
3670 }
3671
3672 for (unsigned i = 0; i < instr->num_srcs; ++i) {
3673 if (instr->src[i].src_type != nir_tex_src_lod &&
3674 instr->src[i].src_type != nir_tex_src_coord)
3675 return false;
3676 }
3677
3678 /* Indices need to fit in provided bits */
3679 unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 2 : 3;
3680 if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits))
3681 return false;
3682
3683 int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
3684 if (lod_idx < 0)
3685 return true;
3686
3687 nir_src lod = instr->src[lod_idx].src;
3688 return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0;
3689 }
3690
3691 static void
3692 bi_emit_tex(bi_builder *b, nir_tex_instr *instr)
3693 {
3694 /* If txf is used, we assume there is a valid sampler bound at index 0. Use
3695 * it for txf operations, since there may be no other valid samplers. This is
3696 * a workaround: txf does not require a sampler in NIR (so sampler_index is
3697 * undefined) but we need one in the hardware. This is ABI with the driver.
3698 */
3699 if (!nir_tex_instr_need_sampler(instr))
3700 instr->sampler_index = 0;
3701
3702 if (b->shader->arch >= 9)
3703 bi_emit_tex_valhall(b, instr);
3704 else if (bi_is_simple_tex(instr))
3705 bi_emit_texs(b, instr);
3706 else
3707 bi_emit_texc(b, instr);
3708 }
3709
3710 static void
3711 bi_emit_phi(bi_builder *b, nir_phi_instr *instr)
3712 {
3713 unsigned nr_srcs = exec_list_length(&instr->srcs);
3714 bi_instr *I = bi_phi_to(b, bi_def_index(&instr->def), nr_srcs);
3715
3716 /* Deferred */
3717 I->phi = instr;
3718 }
3719
3720 /* Look up the bi_block corresponding to a given NIR block. Used when
3721 * translating phi nodes after emitting all blocks.
3722 */
3723 static bi_block *
3724 bi_from_nir_block(bi_context *ctx, nir_block *block)
3725 {
3726 return ctx->indexed_nir_blocks[block->index];
3727 }
3728
3729 static void
3730 bi_emit_phi_deferred(bi_context *ctx, bi_block *block, bi_instr *I)
3731 {
3732 nir_phi_instr *phi = I->phi;
3733
3734 /* Guaranteed by lower_phis_to_scalar */
3735 assert(phi->def.num_components == 1);
3736
3737 nir_foreach_phi_src(src, phi) {
3738 bi_block *pred = bi_from_nir_block(ctx, src->pred);
3739 unsigned i = bi_predecessor_index(block, pred);
3740 assert(i < I->nr_srcs);
3741
3742 I->src[i] = bi_src_index(&src->src);
3743 }
3744
3745 I->phi = NULL;
3746 }
3747
3748 static void
3749 bi_emit_phis_deferred(bi_context *ctx)
3750 {
3751 bi_foreach_block(ctx, block) {
3752 bi_foreach_instr_in_block(block, I) {
3753 if (I->op == BI_OPCODE_PHI)
3754 bi_emit_phi_deferred(ctx, block, I);
3755 }
3756 }
3757 }
3758
3759 static void
3760 bi_emit_instr(bi_builder *b, struct nir_instr *instr)
3761 {
3762 switch (instr->type) {
3763 case nir_instr_type_load_const:
3764 bi_emit_load_const(b, nir_instr_as_load_const(instr));
3765 break;
3766
3767 case nir_instr_type_intrinsic:
3768 bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
3769 break;
3770
3771 case nir_instr_type_alu:
3772 bi_emit_alu(b, nir_instr_as_alu(instr));
3773 break;
3774
3775 case nir_instr_type_tex:
3776 bi_emit_tex(b, nir_instr_as_tex(instr));
3777 break;
3778
3779 case nir_instr_type_jump:
3780 bi_emit_jump(b, nir_instr_as_jump(instr));
3781 break;
3782
3783 case nir_instr_type_phi:
3784 bi_emit_phi(b, nir_instr_as_phi(instr));
3785 break;
3786
3787 default:
3788 unreachable("should've been lowered");
3789 }
3790 }
3791
3792 static bi_block *
3793 create_empty_block(bi_context *ctx)
3794 {
3795 bi_block *blk = rzalloc(ctx, bi_block);
3796
3797 util_dynarray_init(&blk->predecessors, blk);
3798
3799 return blk;
3800 }
3801
3802 static bi_block *
3803 emit_block(bi_context *ctx, nir_block *block)
3804 {
3805 if (ctx->after_block) {
3806 ctx->current_block = ctx->after_block;
3807 ctx->after_block = NULL;
3808 } else {
3809 ctx->current_block = create_empty_block(ctx);
3810 }
3811
3812 list_addtail(&ctx->current_block->link, &ctx->blocks);
3813 list_inithead(&ctx->current_block->instructions);
3814
3815 bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
3816
3817 ctx->indexed_nir_blocks[block->index] = ctx->current_block;
3818
3819 nir_foreach_instr(instr, block) {
3820 bi_emit_instr(&_b, instr);
3821 }
3822
3823 return ctx->current_block;
3824 }
3825
3826 static void
3827 emit_if(bi_context *ctx, nir_if *nif)
3828 {
3829 bi_block *before_block = ctx->current_block;
3830
3831 /* Speculatively emit the branch, but we can't fill it in until later */
3832 bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
3833 bi_instr *then_branch =
3834 bi_branchz_i16(&_b, bi_half(bi_src_index(&nif->condition), false),
3835 bi_zero(), BI_CMPF_EQ);
3836
3837 /* Emit the two subblocks. */
3838 bi_block *then_block = emit_cf_list(ctx, &nif->then_list);
3839 bi_block *end_then_block = ctx->current_block;
3840
3841 /* Emit second block */
3842
3843 bi_block *else_block = emit_cf_list(ctx, &nif->else_list);
3844 bi_block *end_else_block = ctx->current_block;
3845 ctx->after_block = create_empty_block(ctx);
3846
3847 /* Now that we have the subblocks emitted, fix up the branches */
3848
3849 assert(then_block);
3850 assert(else_block);
3851
3852 then_branch->branch_target = else_block;
3853
3854 /* Emit a jump from the end of the then block to the end of the else */
3855 _b.cursor = bi_after_block(end_then_block);
3856 bi_instr *then_exit = bi_jump(&_b, bi_zero());
3857 then_exit->branch_target = ctx->after_block;
3858
3859 bi_block_add_successor(end_then_block, then_exit->branch_target);
3860 bi_block_add_successor(end_else_block, ctx->after_block); /* fallthrough */
3861
3862 bi_block_add_successor(before_block,
3863 then_branch->branch_target); /* then_branch */
3864 bi_block_add_successor(before_block, then_block); /* fallthrough */
3865 }
3866
3867 static void
3868 emit_loop(bi_context *ctx, nir_loop *nloop)
3869 {
3870 assert(!nir_loop_has_continue_construct(nloop));
3871
3872 /* Remember where we are */
3873 bi_block *start_block = ctx->current_block;
3874
3875 bi_block *saved_break = ctx->break_block;
3876 bi_block *saved_continue = ctx->continue_block;
3877
3878 ctx->continue_block = create_empty_block(ctx);
3879 ctx->break_block = create_empty_block(ctx);
3880 ctx->after_block = ctx->continue_block;
3881
3882 /* Emit the body itself */
3883 emit_cf_list(ctx, &nloop->body);
3884
3885 /* Branch back to the loop header (the continue block) */
3886 bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
3887 bi_instr *I = bi_jump(&_b, bi_zero());
3888 I->branch_target = ctx->continue_block;
3889 bi_block_add_successor(start_block, ctx->continue_block);
3890 bi_block_add_successor(ctx->current_block, ctx->continue_block);
3891
3892 ctx->after_block = ctx->break_block;
3893
3894 /* Pop off */
3895 ctx->break_block = saved_break;
3896 ctx->continue_block = saved_continue;
3897 ++ctx->loop_count;
3898 }
3899
3900 static bi_block *
3901 emit_cf_list(bi_context *ctx, struct exec_list *list)
3902 {
3903 bi_block *start_block = NULL;
3904
3905 foreach_list_typed(nir_cf_node, node, node, list) {
3906 switch (node->type) {
3907 case nir_cf_node_block: {
3908 bi_block *block = emit_block(ctx, nir_cf_node_as_block(node));
3909
3910 if (!start_block)
3911 start_block = block;
3912
3913 break;
3914 }
3915
3916 case nir_cf_node_if:
3917 emit_if(ctx, nir_cf_node_as_if(node));
3918 break;
3919
3920 case nir_cf_node_loop:
3921 emit_loop(ctx, nir_cf_node_as_loop(node));
3922 break;
3923
3924 default:
3925 unreachable("Unknown control flow");
3926 }
3927 }
3928
3929 return start_block;
3930 }
3931
3932 /* shader-db stuff */
3933
3934 struct bi_stats {
3935 unsigned nr_clauses, nr_tuples, nr_ins;
3936 unsigned nr_arith, nr_texture, nr_varying, nr_ldst;
3937 };
3938
3939 static void
3940 bi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats)
3941 {
3942 /* Count instructions */
3943 stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 1 : 0);
3944
3945 /* Non-message passing tuples are always arithmetic */
3946 if (tuple->add != clause->message) {
3947 stats->nr_arith++;
3948 return;
3949 }
3950
3951 /* Message + FMA we'll count as arithmetic _and_ message */
3952 if (tuple->fma)
3953 stats->nr_arith++;
3954
3955 switch (clause->message_type) {
3956 case BIFROST_MESSAGE_VARYING:
3957 /* Check components interpolated */
3958 stats->nr_varying +=
3959 (clause->message->vecsize + 1) *
3960 (bi_is_regfmt_16(clause->message->register_format) ? 1 : 2);
3961 break;
3962
3963 case BIFROST_MESSAGE_VARTEX:
3964 /* 2 coordinates, fp32 each */
3965 stats->nr_varying += (2 * 2);
3966 FALLTHROUGH;
3967 case BIFROST_MESSAGE_TEX:
3968 stats->nr_texture++;
3969 break;
3970
3971 case BIFROST_MESSAGE_ATTRIBUTE:
3972 case BIFROST_MESSAGE_LOAD:
3973 case BIFROST_MESSAGE_STORE:
3974 case BIFROST_MESSAGE_ATOMIC:
3975 stats->nr_ldst++;
3976 break;
3977
3978 case BIFROST_MESSAGE_NONE:
3979 case BIFROST_MESSAGE_BARRIER:
3980 case BIFROST_MESSAGE_BLEND:
3981 case BIFROST_MESSAGE_TILE:
3982 case BIFROST_MESSAGE_Z_STENCIL:
3983 case BIFROST_MESSAGE_ATEST:
3984 case BIFROST_MESSAGE_JOB:
3985 case BIFROST_MESSAGE_64BIT:
3986 /* Nothing to do */
3987 break;
3988 };
3989 }
3990
3991 /*
3992 * v7 allows preloading LD_VAR or VAR_TEX messages that must complete before the
3993 * shader completes. These costs are not accounted for in the general cycle
3994 * counts, so this function calculates the effective cost of these messages, as
3995 * if they were executed by shader code.
3996 */
3997 static unsigned
3998 bi_count_preload_cost(bi_context *ctx)
3999 {
4000 /* Units: 1/16 of a normalized cycle, assuming that we may interpolate
4001 * 16 fp16 varying components per cycle or fetch two texels per cycle.
4002 */
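/* As a rough example under that scale: a preloaded fp32 vec4 varying costs
 * 4 * 2 = 8 units (half a cycle), while a VAR_TEX preload costs 12 units.
 */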
4003 unsigned cost = 0;
4004
4005 for (unsigned i = 0; i < ARRAY_SIZE(ctx->info.bifrost->messages); ++i) {
4006 struct bifrost_message_preload msg = ctx->info.bifrost->messages[i];
4007
4008 if (msg.enabled && msg.texture) {
4009 /* 2 coordinates, 2 half-words each, plus texture */
4010 cost += 12;
4011 } else if (msg.enabled) {
4012 cost += (msg.num_components * (msg.fp16 ? 1 : 2));
4013 }
4014 }
4015
4016 return cost;
4017 }
4018
4019 static const char *
4020 bi_shader_stage_name(bi_context *ctx)
4021 {
4022 if (ctx->idvs == BI_IDVS_VARYING)
4023 return "MESA_SHADER_VARYING";
4024 else if (ctx->idvs == BI_IDVS_POSITION)
4025 return "MESA_SHADER_POSITION";
4026 else if (ctx->inputs->is_blend)
4027 return "MESA_SHADER_BLEND";
4028 else
4029 return gl_shader_stage_name(ctx->stage);
4030 }
4031
4032 static char *
4033 bi_print_stats(bi_context *ctx, unsigned size)
4034 {
4035 struct bi_stats stats = {0};
4036
4037 /* Count instructions, clauses, and tuples. Also attempt to construct
4038 * normalized execution engine cycle counts, using the following ratio:
4039 *
4040 * 24 arith tuples/cycle
4041 * 2 texture messages/cycle
4042 * 16 x 16-bit varying channels interpolated/cycle
4043 * 1 load store message/cycle
4044 *
4045 * These numbers seem to match Arm Mobile Studio's heuristic. The real
4046 * cycle counts are surely more complicated.
4047 */
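/* As a worked example under these ratios: a shader with 48 arithmetic
 * tuples, 2 texture messages, 32 16-bit varying channels and 8 load/store
 * messages is estimated at max(48/24, 2/2, 32/16, 8/1) = 8 cycles, i.e. it
 * is load/store bound.
 */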
4048
4049 bi_foreach_block(ctx, block) {
4050 bi_foreach_clause_in_block(block, clause) {
4051 stats.nr_clauses++;
4052 stats.nr_tuples += clause->tuple_count;
4053
4054 for (unsigned i = 0; i < clause->tuple_count; ++i)
4055 bi_count_tuple_stats(clause, &clause->tuples[i], &stats);
4056 }
4057 }
4058
4059 float cycles_arith = ((float)stats.nr_arith) / 24.0;
4060 float cycles_texture = ((float)stats.nr_texture) / 2.0;
4061 float cycles_varying = ((float)stats.nr_varying) / 16.0;
4062 float cycles_ldst = ((float)stats.nr_ldst) / 1.0;
4063
4064 float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst);
4065 float cycles_bound = MAX2(cycles_arith, cycles_message);
4066
4067 /* Thread count and register pressure are traded off only on v7 */
4068 bool full_threads = (ctx->arch == 7 && ctx->info.work_reg_count <= 32);
4069 unsigned nr_threads = full_threads ? 2 : 1;
4070
4071 /* Dump stats */
4072 char *str = ralloc_asprintf(
4073 NULL,
4074 "%s shader: "
4075 "%u inst, %u tuples, %u clauses, "
4076 "%f cycles, %f arith, %f texture, %f vary, %f ldst, "
4077 "%u quadwords, %u threads",
4078 bi_shader_stage_name(ctx), stats.nr_ins, stats.nr_tuples,
4079 stats.nr_clauses, cycles_bound, cycles_arith, cycles_texture,
4080 cycles_varying, cycles_ldst, size / 16, nr_threads);
4081
4082 if (ctx->arch == 7) {
4083 ralloc_asprintf_append(&str, ", %u preloads", bi_count_preload_cost(ctx));
4084 }
4085
4086 ralloc_asprintf_append(&str, ", %u loops, %u:%u spills:fills",
4087 ctx->loop_count, ctx->spills, ctx->fills);
4088
4089 return str;
4090 }
4091
4092 static char *
4093 va_print_stats(bi_context *ctx, unsigned size)
4094 {
4095 unsigned nr_ins = 0;
4096 struct va_stats stats = {0};
4097
4098 /* Count instructions */
4099 bi_foreach_instr_global(ctx, I) {
4100 nr_ins++;
4101 va_count_instr_stats(I, &stats);
4102 }
4103
4104 /* Mali G78 peak performance:
4105 *
4106 * 64 FMA instructions per cycle
4107 * 64 CVT instructions per cycle
4108 * 16 SFU instructions per cycle
4109 * 8 x 32-bit varying channels interpolated per cycle
4110 * 4 texture instructions per cycle
4111 * 1 load/store operation per cycle
4112 */
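/* For instance, 128 FMA, 8 texture and 4 load/store instructions give
 * max(128/64, 8/4, 4/1) = 4 cycles, so such a shader would be load/store
 * bound.
 */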
4113
4114 float cycles_fma = ((float)stats.fma) / 64.0;
4115 float cycles_cvt = ((float)stats.cvt) / 64.0;
4116 float cycles_sfu = ((float)stats.sfu) / 16.0;
4117 float cycles_v = ((float)stats.v) / 16.0;
4118 float cycles_t = ((float)stats.t) / 4.0;
4119 float cycles_ls = ((float)stats.ls) / 1.0;
4120
4121 /* Calculate the bound */
4122 float cycles = MAX2(MAX3(cycles_fma, cycles_cvt, cycles_sfu),
4123 MAX3(cycles_v, cycles_t, cycles_ls));
4124
4125 /* Thread count and register pressure are traded off */
4126 unsigned nr_threads = (ctx->info.work_reg_count <= 32) ? 2 : 1;
4127
4128 /* Dump stats */
4129 return ralloc_asprintf(NULL,
4130 "%s shader: "
4131 "%u inst, %f cycles, %f fma, %f cvt, %f sfu, %f v, "
4132 "%f t, %f ls, %u quadwords, %u threads, %u loops, "
4133 "%u:%u spills:fills",
4134 bi_shader_stage_name(ctx), nr_ins, cycles, cycles_fma,
4135 cycles_cvt, cycles_sfu, cycles_v, cycles_t, cycles_ls,
4136 size / 16, nr_threads, ctx->loop_count, ctx->spills,
4137 ctx->fills);
4138 }
4139
4140 static int
4141 glsl_type_size(const struct glsl_type *type, bool bindless)
4142 {
4143 return glsl_count_attribute_slots(type, false);
4144 }
4145
4146 /* Split stores to memory. We don't split stores to vertex outputs, since
4147 * nir_lower_io_to_temporaries will ensure there's only a single write.
4148 */
4149
4150 static bool
4151 should_split_wrmask(const nir_instr *instr, UNUSED const void *data)
4152 {
4153 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
4154
4155 switch (intr->intrinsic) {
4156 case nir_intrinsic_store_ssbo:
4157 case nir_intrinsic_store_shared:
4158 case nir_intrinsic_store_global:
4159 case nir_intrinsic_store_scratch:
4160 return true;
4161 default:
4162 return false;
4163 }
4164 }
4165
4166 /*
4167 * Some operations are only available as 32-bit instructions. 64-bit floats are
4168 * unsupported and ints are lowered with nir_lower_int64. Certain 8-bit and
4169 * 16-bit instructions, however, are lowered here.
4170 */
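/* For example, a 16-bit fsin is executed as a 32-bit fsin: returning 32 from
 * the callback below makes nir_lower_bit_size wrap the operation in
 * conversions to and from 32-bit.
 */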
4171 static unsigned
4172 bi_lower_bit_size(const nir_instr *instr, UNUSED void *data)
4173 {
4174 if (instr->type != nir_instr_type_alu)
4175 return 0;
4176
4177 nir_alu_instr *alu = nir_instr_as_alu(instr);
4178
4179 switch (alu->op) {
4180 case nir_op_fexp2:
4181 case nir_op_flog2:
4182 case nir_op_fpow:
4183 case nir_op_fsin:
4184 case nir_op_fcos:
4185 case nir_op_bit_count:
4186 case nir_op_bitfield_reverse:
4187 return (nir_src_bit_size(alu->src[0].src) == 32) ? 0 : 32;
4188 default:
4189 return 0;
4190 }
4191 }
4192
4193 /* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4,
4194 * transcendentals are an exception. Shifts are also scalarized because of a
4195 * lane size mismatch (8-bit lanes in Bifrost vs. 32-bit in NIR; TODO: work
4196 * around this). Some conversions need to be scalarized due to type size. */
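/* For example, a 16-bit fadd may stay as a packed vec2 (width 2 below), while
 * frcp and the listed shifts/conversions are forced scalar (width 1).
 */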
4197
4198 static uint8_t
4199 bi_vectorize_filter(const nir_instr *instr, const void *data)
4200 {
4201 /* Defaults work for everything else */
4202 if (instr->type != nir_instr_type_alu)
4203 return 0;
4204
4205 const nir_alu_instr *alu = nir_instr_as_alu(instr);
4206
4207 switch (alu->op) {
4208 case nir_op_frcp:
4209 case nir_op_frsq:
4210 case nir_op_ishl:
4211 case nir_op_ishr:
4212 case nir_op_ushr:
4213 case nir_op_f2i16:
4214 case nir_op_f2u16:
4215 case nir_op_extract_u8:
4216 case nir_op_extract_i8:
4217 case nir_op_extract_u16:
4218 case nir_op_extract_i16:
4219 case nir_op_insert_u16:
4220 return 1;
4221 default:
4222 break;
4223 }
4224
4225 /* Vectorized instructions cannot write more than 32 bits */
4226 int dst_bit_size = alu->def.bit_size;
4227 if (dst_bit_size == 16)
4228 return 2;
4229 else
4230 return 1;
4231 }
4232
4233 static bool
4234 bi_scalarize_filter(const nir_instr *instr, const void *data)
4235 {
4236 if (instr->type != nir_instr_type_alu)
4237 return false;
4238
4239 const nir_alu_instr *alu = nir_instr_as_alu(instr);
4240
4241 switch (alu->op) {
4242 case nir_op_pack_uvec2_to_uint:
4243 case nir_op_pack_uvec4_to_uint:
4244 return false;
4245 default:
4246 return true;
4247 }
4248 }
4249
4250 /* Ensure we write exactly 4 components */
4251 static nir_def *
4252 bifrost_nir_valid_channel(nir_builder *b, nir_def *in, unsigned channel,
4253 unsigned first, unsigned mask)
4254 {
4255 if (!(mask & BITFIELD_BIT(channel)))
4256 channel = first;
4257
4258 return nir_channel(b, in, channel);
4259 }
4260
4261 /* Lower fragment store_output instructions to always write 4 components,
4262 * matching the hardware semantic. This may require additional moves. Skipping
4263 * these moves is possible in theory, but invokes undefined behaviour in the
4264 * compiler. The DDK inserts these moves, so we will as well. */
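/* For example, a vec2 store with writemask 0x3 is rewritten below to store
 * vec4(v.x, v.y, v.x, v.x) with writemask 0xF, replicating the first written
 * component into the unwritten lanes.
 */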
4265
4266 static bool
4267 bifrost_nir_lower_blend_components(struct nir_builder *b,
4268 nir_intrinsic_instr *intr, void *data)
4269 {
4270 if (intr->intrinsic != nir_intrinsic_store_output)
4271 return false;
4272
4273 nir_def *in = intr->src[0].ssa;
4274 unsigned first = nir_intrinsic_component(intr);
4275 unsigned mask = nir_intrinsic_write_mask(intr);
4276
4277 assert(first == 0 && "shouldn't get nonzero components");
4278
4279 /* Nothing to do */
4280 if (mask == BITFIELD_MASK(4))
4281 return false;
4282
4283 b->cursor = nir_before_instr(&intr->instr);
4284
4285 /* Replicate the first valid component instead */
4286 nir_def *replicated =
4287 nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask),
4288 bifrost_nir_valid_channel(b, in, 1, first, mask),
4289 bifrost_nir_valid_channel(b, in, 2, first, mask),
4290 bifrost_nir_valid_channel(b, in, 3, first, mask));
4291
4292 /* Rewrite to use our replicated version */
4293 nir_src_rewrite(&intr->src[0], replicated);
4294 nir_intrinsic_set_component(intr, 0);
4295 nir_intrinsic_set_write_mask(intr, 0xF);
4296 intr->num_components = 4;
4297
4298 return true;
4299 }
4300
4301 static nir_mem_access_size_align
4302 mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
4303 uint8_t input_bit_size, uint32_t align,
4304 uint32_t align_offset, bool offset_is_const,
4305 const void *cb_data)
4306 {
4307 align = nir_combined_align(align, align_offset);
4308 assert(util_is_power_of_two_nonzero(align));
4309
4310 /* If the number of bytes is a multiple of 4, use 32-bit loads. Else if it's
4311 * a multiple of 2, use 16-bit loads. Else use 8-bit loads.
4312 */
4313 unsigned bit_size = (bytes & 1) ? 8 : (bytes & 2) ? 16 : 32;
4314
4315 /* But if we're only aligned to 1 byte, use 8-bit loads. If we're only
4316 * aligned to 2 bytes, use 16-bit loads, unless we needed 8-bit loads due to
4317 * the size.
4318 */
4319 if (align == 1)
4320 bit_size = 8;
4321 else if (align == 2)
4322 bit_size = MIN2(bit_size, 16);
4323
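/* E.g. a 12-byte access at 4-byte alignment becomes 3 x 32-bit, a 6-byte
 * access at 2-byte alignment becomes 3 x 16-bit, and an access that is only
 * byte aligned degrades to 8-bit.
 */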
4324 return (nir_mem_access_size_align){
4325 .num_components = MIN2(bytes / (bit_size / 8), 4),
4326 .bit_size = bit_size,
4327 .align = bit_size / 8,
4328 };
4329 }
4330
4331 static void
4332 bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
4333 {
4334 bool progress;
4335
4336 do {
4337 progress = false;
4338
4339 NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
4340 NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL);
4341
4342 NIR_PASS(progress, nir, nir_copy_prop);
4343 NIR_PASS(progress, nir, nir_opt_remove_phis);
4344 NIR_PASS(progress, nir, nir_opt_dce);
4345 NIR_PASS(progress, nir, nir_opt_dead_cf);
4346 NIR_PASS(progress, nir, nir_opt_cse);
4347 NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
4348 NIR_PASS(progress, nir, nir_opt_algebraic);
4349 NIR_PASS(progress, nir, nir_opt_constant_folding);
4350
4351 NIR_PASS(progress, nir, nir_opt_undef);
4352 NIR_PASS(progress, nir, nir_lower_undef_to_zero);
4353
4354 NIR_PASS(progress, nir, nir_opt_shrink_vectors);
4355 NIR_PASS(progress, nir, nir_opt_loop_unroll);
4356 } while (progress);
4357
4358 /* TODO: Why is 64-bit getting rematerialized?
4359 * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */
4360 NIR_PASS(progress, nir, nir_lower_int64);
4361
4362 /* We need to cleanup after each iteration of late algebraic
4363 * optimizations, since otherwise NIR can produce weird edge cases
4364 * (like fneg of a constant) which we don't handle */
4365 bool late_algebraic = true;
4366 while (late_algebraic) {
4367 late_algebraic = false;
4368 NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
4369 NIR_PASS(progress, nir, nir_opt_constant_folding);
4370 NIR_PASS(progress, nir, nir_copy_prop);
4371 NIR_PASS(progress, nir, nir_opt_dce);
4372 NIR_PASS(progress, nir, nir_opt_cse);
4373 }
4374
4375 /* This opt currently helps on Bifrost but not Valhall */
4376 if (gpu_id < 0x9000)
4377 NIR_PASS(progress, nir, bifrost_nir_opt_boolean_bitwise);
4378
4379 NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
4380 NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL);
4381 NIR_PASS(progress, nir, nir_lower_bool_to_bitsize);
4382
4383 /* Prepass to simplify instruction selection */
4384 late_algebraic = false;
4385 NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late);
4386
4387 while (late_algebraic) {
4388 late_algebraic = false;
4389 NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
4390 NIR_PASS(progress, nir, nir_opt_constant_folding);
4391 NIR_PASS(progress, nir, nir_copy_prop);
4392 NIR_PASS(progress, nir, nir_opt_dce);
4393 NIR_PASS(progress, nir, nir_opt_cse);
4394 }
4395
4396 NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
4397 NIR_PASS(progress, nir, nir_opt_dce);
4398
4399 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
4400 NIR_PASS_V(nir, nir_shader_intrinsics_pass,
4401 bifrost_nir_lower_blend_components,
4402 nir_metadata_block_index | nir_metadata_dominance, NULL);
4403 }
4404
4405 /* Backend scheduler is purely local, so do some global optimizations
4406 * to reduce register pressure. */
4407 nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
4408 nir_move_load_input | nir_move_comparisons |
4409 nir_move_copies | nir_move_load_ssbo;
4410
4411 NIR_PASS_V(nir, nir_opt_sink, move_all);
4412 NIR_PASS_V(nir, nir_opt_move, move_all);
4413
4414 /* We might lower attribute, varying, and image indirects. Use the
4415 * gathered info to skip the extra analysis in the happy path. */
4416 bool any_indirects = nir->info.inputs_read_indirectly ||
4417 nir->info.outputs_accessed_indirectly ||
4418 nir->info.patch_inputs_read_indirectly ||
4419 nir->info.patch_outputs_accessed_indirectly ||
4420 nir->info.images_used[0];
4421
4422 if (any_indirects) {
4423 nir_convert_to_lcssa(nir, true, true);
4424 NIR_PASS_V(nir, nir_divergence_analysis);
4425 NIR_PASS_V(nir, bi_lower_divergent_indirects,
4426 pan_subgroup_size(gpu_id >> 12));
4427 }
4428 }
4429
4430 static void
4431 bi_opt_post_ra(bi_context *ctx)
4432 {
4433 bi_foreach_instr_global_safe(ctx, ins) {
4434 if (ins->op == BI_OPCODE_MOV_I32 &&
4435 bi_is_equiv(ins->dest[0], ins->src[0]))
4436 bi_remove_instruction(ins);
4437 }
4438 }
4439
4440 /* Dead code elimination for branches at the end of a block - only one branch
4441 * per block is legal semantically, but unreachable jumps can be generated.
4442 * Likewise on Bifrost we can generate jumps to the terminal block which need
4443 * to be lowered away to a jump to #0x0, which induces successful termination.
4444 * That trick doesn't work on Valhall, which needs a NOP inserted in the
4445 * terminal block instead.
4446 */
4447 static void
4448 bi_lower_branch(bi_context *ctx, bi_block *block)
4449 {
4450 bool cull_terminal = (ctx->arch <= 8);
4451 bool branched = false;
4452
4453 bi_foreach_instr_in_block_safe(block, ins) {
4454 if (!ins->branch_target)
4455 continue;
4456
4457 if (branched) {
4458 bi_remove_instruction(ins);
4459 continue;
4460 }
4461
4462 branched = true;
4463
4464 if (!bi_is_terminal_block(ins->branch_target))
4465 continue;
4466
4467 if (cull_terminal)
4468 ins->branch_target = NULL;
4469 else if (ins->branch_target)
4470 ins->branch_target->needs_nop = true;
4471 }
4472 }
4473
4474 static void
4475 bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary, unsigned offset)
4476 {
4477 unsigned final_clause = bi_pack(ctx, binary);
4478
4479 /* If we need to wait for ATEST or BLEND in the first clause, pass the
4480 * corresponding bits through to the renderer state descriptor */
4481 bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
4482 bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL);
4483
4484 unsigned first_deps = first_clause ? first_clause->dependencies : 0;
4485 ctx->info.bifrost->wait_6 = (first_deps & (1 << 6));
4486 ctx->info.bifrost->wait_7 = (first_deps & (1 << 7));
4487
4488 /* Pad the shader with enough zero bytes to trick the prefetcher,
4489 * unless we're compiling an empty shader (in which case we don't pad
4490 * so the size remains 0) */
4491 unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause;
4492
4493 if (binary->size - offset) {
4494 memset(util_dynarray_grow(binary, uint8_t, prefetch_size), 0,
4495 prefetch_size);
4496 }
4497 }
4498
4499 /*
4500 * Build a bit mask of varyings (by location) that are flatshaded. This
4501 * information is needed by lower_mediump_io, as we don't yet support 16-bit
4502 * flat varyings.
4503 *
4504 * Also varyings that are used as texture coordinates should be kept at fp32 so
4505 * the texture instruction may be promoted to VAR_TEX. In general this is a good
4506 * idea, as fp16 texture coordinates are not supported by the hardware and are
4507 * usually inappropriate. (There are even relevant CTS bugs here.)
4508 *
4509 * TODO: If we compacted the varyings with some fixup code in the vertex shader,
4510 * we could implement 16-bit flat varyings. Consider if this case matters.
4511 *
4512 * TODO: The texture coordinate handling could be less heavyhanded.
4513 */
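/* For example, in "texture(s, vUV)" where vUV is a smooth varying, both
 * coordinate channels resolve to the same load_interpolated_input, so vUV's
 * location is added to the mask and the varying stays fp32.
 */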
4514 static bool
4515 bi_gather_texcoords(nir_builder *b, nir_instr *instr, void *data)
4516 {
4517 uint64_t *mask = data;
4518
4519 if (instr->type != nir_instr_type_tex)
4520 return false;
4521
4522 nir_tex_instr *tex = nir_instr_as_tex(instr);
4523
4524 int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
4525 if (coord_idx < 0)
4526 return false;
4527
4528 nir_src src = tex->src[coord_idx].src;
4529 nir_scalar x = nir_scalar_resolved(src.ssa, 0);
4530 nir_scalar y = nir_scalar_resolved(src.ssa, 1);
4531
4532 if (x.def != y.def)
4533 return false;
4534
4535 nir_instr *parent = x.def->parent_instr;
4536
4537 if (parent->type != nir_instr_type_intrinsic)
4538 return false;
4539
4540 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
4541
4542 if (intr->intrinsic != nir_intrinsic_load_interpolated_input)
4543 return false;
4544
4545 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
4546 *mask |= BITFIELD64_BIT(sem.location);
4547 return false;
4548 }
4549
4550 static uint64_t
4551 bi_fp32_varying_mask(nir_shader *nir)
4552 {
4553 uint64_t mask = 0;
4554
4555 assert(nir->info.stage == MESA_SHADER_FRAGMENT);
4556
4557 nir_foreach_shader_in_variable(var, nir) {
4558 if (var->data.interpolation == INTERP_MODE_FLAT)
4559 mask |= BITFIELD64_BIT(var->data.location);
4560 }
4561
4562 nir_shader_instructions_pass(nir, bi_gather_texcoords, nir_metadata_all,
4563 &mask);
4564
4565 return mask;
4566 }
4567
4568 static bool
4569 bi_lower_sample_mask_writes(nir_builder *b, nir_intrinsic_instr *intr,
4570 void *data)
4571 {
4572 if (intr->intrinsic != nir_intrinsic_store_output)
4573 return false;
4574
4575 assert(b->shader->info.stage == MESA_SHADER_FRAGMENT);
4576 if (nir_intrinsic_io_semantics(intr).location != FRAG_RESULT_SAMPLE_MASK)
4577 return false;
4578
4579 b->cursor = nir_before_instr(&intr->instr);
4580
4581 nir_def *orig = nir_load_sample_mask(b);
4582
4583 nir_src_rewrite(&intr->src[0],
4584 nir_b32csel(b, nir_load_multisampled_pan(b),
4585 nir_iand(b, orig, intr->src[0].ssa), orig));
4586 return true;
4587 }
4588
4589 static bool
4590 bi_lower_load_output(nir_builder *b, nir_intrinsic_instr *intr,
4591 UNUSED void *data)
4592 {
4593 if (intr->intrinsic != nir_intrinsic_load_output)
4594 return false;
4595
4596 unsigned loc = nir_intrinsic_io_semantics(intr).location;
4597 assert(loc >= FRAG_RESULT_DATA0);
4598 unsigned rt = loc - FRAG_RESULT_DATA0;
4599
4600 b->cursor = nir_before_instr(&intr->instr);
4601
4602 nir_def *conversion = nir_load_rt_conversion_pan(
4603 b, .base = rt, .src_type = nir_intrinsic_dest_type(intr));
4604
4605 nir_def *lowered = nir_load_converted_output_pan(
4606 b, intr->def.num_components, intr->def.bit_size, conversion,
4607 .dest_type = nir_intrinsic_dest_type(intr),
4608 .io_semantics = nir_intrinsic_io_semantics(intr));
4609
4610 nir_def_rewrite_uses(&intr->def, lowered);
4611 return true;
4612 }
4613
4614 bool
4615 bifrost_nir_lower_load_output(nir_shader *nir)
4616 {
4617 assert(nir->info.stage == MESA_SHADER_FRAGMENT);
4618
4619 return nir_shader_intrinsics_pass(
4620 nir, bi_lower_load_output,
4621 nir_metadata_block_index | nir_metadata_dominance, NULL);
4622 }
4623
4624 void
4625 bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id)
4626 {
4627 /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
4628 * (so we don't accidentally duplicate the epilogue since mesa/st has
4629 * messed with our I/O quite a bit already) */
4630
4631 NIR_PASS_V(nir, nir_lower_vars_to_ssa);
4632
4633 if (nir->info.stage == MESA_SHADER_VERTEX) {
4634 NIR_PASS_V(nir, nir_lower_viewport_transform);
4635 NIR_PASS_V(nir, nir_lower_point_size, 1.0, 0.0);
4636
4637 nir_variable *psiz = nir_find_variable_with_location(
4638 nir, nir_var_shader_out, VARYING_SLOT_PSIZ);
4639 if (psiz != NULL)
4640 psiz->data.precision = GLSL_PRECISION_MEDIUM;
4641 }
4642
4643 /* lower MSAA load/stores to 3D load/stores */
4644 NIR_PASS_V(nir, pan_nir_lower_image_ms);
4645
4646 /* Get rid of any global vars before we lower to scratch. */
4647 NIR_PASS_V(nir, nir_lower_global_vars_to_local);
4648
4649 /* Valhall introduces packed thread local storage, which improves cache
4650 * locality of TLS access. However, access to packed TLS cannot
4651 * straddle 16-byte boundaries. As such, when packed TLS is in use
4652 * (currently unconditional for Valhall), we force vec4 alignment for
4653 * scratch access.
4654 */
4655 bool packed_tls = (gpu_id >= 0x9000);
4656
4657 /* Lower large arrays to scratch and small arrays to bcsel */
4658 NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
4659 packed_tls ? glsl_get_vec4_size_align_bytes
4660 : glsl_get_natural_size_align_bytes);
4661 NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
4662
4663 NIR_PASS_V(nir, nir_split_var_copies);
4664 NIR_PASS_V(nir, nir_lower_var_copies);
4665 NIR_PASS_V(nir, nir_lower_vars_to_ssa);
4666 NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
4667 glsl_type_size, 0);
4668
4669 /* nir_lower[_explicit]_io is lazy and emits mul+add chains even for
4670 * offsets it could figure out are constant. Do some constant folding
4671 * before bifrost_nir_lower_store_component below.
4672 */
4673 NIR_PASS_V(nir, nir_opt_constant_folding);
4674
4675 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
4676 NIR_PASS_V(nir, nir_lower_mediump_io,
4677 nir_var_shader_in | nir_var_shader_out,
4678 ~bi_fp32_varying_mask(nir), false);
4679
4680 NIR_PASS_V(nir, nir_shader_intrinsics_pass, bi_lower_sample_mask_writes,
4681 nir_metadata_block_index | nir_metadata_dominance, NULL);
4682
4683 NIR_PASS_V(nir, bifrost_nir_lower_load_output);
4684 } else if (nir->info.stage == MESA_SHADER_VERTEX) {
4685 if (gpu_id >= 0x9000) {
4686 NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out,
4687 BITFIELD64_BIT(VARYING_SLOT_PSIZ), false);
4688 }
4689
4690 NIR_PASS_V(nir, pan_nir_lower_store_component);
4691 }
4692
4693 nir_lower_mem_access_bit_sizes_options mem_size_options = {
4694 .modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_constant |
4695 nir_var_mem_task_payload | nir_var_shader_temp |
4696 nir_var_function_temp | nir_var_mem_global | nir_var_mem_shared,
4697 .callback = mem_access_size_align_cb,
4698 };
4699 NIR_PASS_V(nir, nir_lower_mem_access_bit_sizes, &mem_size_options);
4700
4701 NIR_PASS_V(nir, nir_lower_ssbo);
4702 NIR_PASS_V(nir, pan_lower_sample_pos);
4703 NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL);
4704 NIR_PASS_V(nir, nir_lower_64bit_phis);
4705 NIR_PASS_V(nir, pan_lower_helper_invocation);
4706 NIR_PASS_V(nir, nir_lower_int64);
4707
4708 NIR_PASS_V(nir, nir_opt_idiv_const, 8);
4709 NIR_PASS_V(nir, nir_lower_idiv,
4710 &(nir_lower_idiv_options){.allow_fp16 = true});
4711
4712 NIR_PASS_V(nir, nir_lower_tex,
4713 &(nir_lower_tex_options){
4714 .lower_txs_lod = true,
4715 .lower_txp = ~0,
4716 .lower_tg4_broadcom_swizzle = true,
4717 .lower_txd = true,
4718 .lower_invalid_implicit_lod = true,
4719 .lower_index_to_offset = true,
4720 });
4721
4722 NIR_PASS_V(nir, nir_lower_image_atomics_to_global);
4723 NIR_PASS_V(nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
4724 NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
4725 NIR_PASS_V(nir, nir_lower_phis_to_scalar, true);
4726 NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false /* always_precise */);
4727 NIR_PASS_V(nir, nir_lower_var_copies);
4728 NIR_PASS_V(nir, nir_lower_alu);
4729 NIR_PASS_V(nir, nir_lower_frag_coord_to_pixel_coord);
4730 }
4731
4732 static bi_context *
4733 bi_compile_variant_nir(nir_shader *nir,
4734 const struct panfrost_compile_inputs *inputs,
4735 struct util_dynarray *binary, struct bi_shader_info info,
4736 enum bi_idvs_mode idvs)
4737 {
4738 bi_context *ctx = rzalloc(NULL, bi_context);
4739
4740 /* There may be another program in the dynarray, start at the end */
4741 unsigned offset = binary->size;
4742
4743 ctx->inputs = inputs;
4744 ctx->nir = nir;
4745 ctx->stage = nir->info.stage;
4746 ctx->quirks = bifrost_get_quirks(inputs->gpu_id);
4747 ctx->arch = inputs->gpu_id >> 12;
4748 ctx->info = info;
4749 ctx->idvs = idvs;
4750 ctx->malloc_idvs = (ctx->arch >= 9) && !inputs->no_idvs;
4751
4752 if (idvs != BI_IDVS_NONE) {
4753 /* Specializing shaders for IDVS is destructive, so we need to
4754 * clone. However, the last (second) IDVS shader does not need
4755 * to be preserved so we can skip cloning that one.
4756 */
4757 if (offset == 0)
4758 ctx->nir = nir = nir_shader_clone(ctx, nir);
4759
4760 NIR_PASS_V(nir, nir_shader_instructions_pass, bifrost_nir_specialize_idvs,
4761 nir_metadata_block_index | nir_metadata_dominance, &idvs);
4762
4763 /* After specializing, clean up the mess */
4764 bool progress = true;
4765
4766 while (progress) {
4767 progress = false;
4768
4769 NIR_PASS(progress, nir, nir_opt_dce);
4770 NIR_PASS(progress, nir, nir_opt_dead_cf);
4771 }
4772 }
4773
4774 /* If nothing is pushed, all UBOs need to be uploaded */
4775 ctx->ubo_mask = ~0;
4776
4777 list_inithead(&ctx->blocks);
4778
4779 bool skip_internal = nir->info.internal;
4780 skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL);
4781
4782 if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
4783 nir_print_shader(nir, stdout);
4784 }
4785
4786 ctx->allocated_vec = _mesa_hash_table_u64_create(ctx);
4787
4788 nir_foreach_function_impl(impl, nir) {
4789 nir_index_blocks(impl);
4790
4791 ctx->indexed_nir_blocks =
4792 rzalloc_array(ctx, bi_block *, impl->num_blocks);
4793
4794 ctx->ssa_alloc += impl->ssa_alloc;
4795
4796 emit_cf_list(ctx, &impl->body);
4797 bi_emit_phis_deferred(ctx);
4798 break; /* TODO: Multi-function shaders */
4799 }
4800
4801 /* Index blocks now that we're done emitting */
4802 bi_foreach_block(ctx, block) {
4803 block->index = ctx->num_blocks++;
4804 }
4805
4806 bi_validate(ctx, "NIR -> BIR");
4807
4808 /* If the shader doesn't write any colour or depth outputs, it may
4809 * still need an ATEST at the very end! */
4810 bool need_dummy_atest = (ctx->stage == MESA_SHADER_FRAGMENT) &&
4811 !ctx->emitted_atest && !bi_skip_atest(ctx, false);
4812
4813 if (need_dummy_atest) {
4814 bi_block *end = list_last_entry(&ctx->blocks, bi_block, link);
4815 bi_builder b = bi_init_builder(ctx, bi_after_block(end));
4816 bi_emit_atest(&b, bi_zero());
4817 }
4818
4819 bool optimize = !(bifrost_debug & BIFROST_DBG_NOOPT);
4820
4821 /* Runs before constant folding */
4822 bi_lower_swizzle(ctx);
4823 bi_validate(ctx, "Early lowering");
4824
4825 /* Runs before copy prop */
4826 if (optimize && !ctx->inputs->no_ubo_to_push) {
4827 bi_opt_push_ubo(ctx);
4828 }
4829
4830 if (likely(optimize)) {
4831 bi_opt_copy_prop(ctx);
4832
4833 while (bi_opt_constant_fold(ctx))
4834 bi_opt_copy_prop(ctx);
4835
4836 bi_opt_mod_prop_forward(ctx);
4837 bi_opt_mod_prop_backward(ctx);
4838
4839 /* Push LD_VAR_IMM/VAR_TEX instructions. Must run after
4840 * mod_prop_backward to fuse VAR_TEX */
4841 if (ctx->arch == 7 && ctx->stage == MESA_SHADER_FRAGMENT &&
4842 !(bifrost_debug & BIFROST_DBG_NOPRELOAD)) {
4843 bi_opt_dead_code_eliminate(ctx);
4844 bi_opt_message_preload(ctx);
4845 bi_opt_copy_prop(ctx);
4846 }
4847
4848 bi_opt_dead_code_eliminate(ctx);
4849 bi_opt_cse(ctx);
4850 bi_opt_dead_code_eliminate(ctx);
4851 if (!ctx->inputs->no_ubo_to_push)
4852 bi_opt_reorder_push(ctx);
4853 bi_validate(ctx, "Optimization passes");
4854 }
4855
4856 bi_lower_opt_instructions(ctx);
4857
4858 if (ctx->arch >= 9) {
4859 va_optimize(ctx);
4860 va_lower_isel(ctx);
4861
4862 bi_foreach_instr_global_safe(ctx, I) {
4863 /* Phis become single moves so shouldn't be affected */
4864 if (I->op == BI_OPCODE_PHI)
4865 continue;
4866
4867 va_lower_constants(ctx, I);
4868
4869 bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
4870 va_repair_fau(&b, I);
4871 }
4872
4873 /* We need to clean up after constant lowering */
4874 if (likely(optimize)) {
4875 bi_opt_cse(ctx);
4876 bi_opt_dead_code_eliminate(ctx);
4877 }
4878
4879 bi_validate(ctx, "Valhall passes");
4880 }
4881
4882 bi_foreach_block(ctx, block) {
4883 bi_lower_branch(ctx, block);
4884 }
4885
4886 if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
4887 bi_print_shader(ctx, stdout);
4888
4889 /* Analyze before register allocation to avoid false dependencies. The
4890 * skip bit is a function of only the data flow graph and is invariant
4891 * under valid scheduling. Helpers are only defined for fragment
4892 * shaders, so this analysis is only required in fragment shaders.
4893 */
4894 if (ctx->stage == MESA_SHADER_FRAGMENT)
4895 bi_analyze_helper_requirements(ctx);
4896
4897 /* Fuse TEXC after analyzing helper requirements so the analysis
4898 * doesn't have to know about dual textures */
4899 if (likely(optimize)) {
4900 bi_opt_fuse_dual_texture(ctx);
4901 }
4902
4903 /* Lower FAU after fusing dual texture, because fusing dual texture
4904 * creates new immediates that themselves may need lowering.
4905 */
4906 if (ctx->arch <= 8) {
4907 bi_lower_fau(ctx);
4908 }
4909
4910 /* Lowering FAU can create redundant moves. Run CSE+DCE to clean up. */
4911 if (likely(optimize)) {
4912 bi_opt_cse(ctx);
4913 bi_opt_dead_code_eliminate(ctx);
4914 }
4915
4916 bi_validate(ctx, "Late lowering");
4917
4918 if (likely(!(bifrost_debug & BIFROST_DBG_NOPSCHED))) {
4919 bi_pressure_schedule(ctx);
4920 bi_validate(ctx, "Pre-RA scheduling");
4921 }
4922
4923 bi_register_allocate(ctx);
4924
4925 if (likely(optimize))
4926 bi_opt_post_ra(ctx);
4927
4928 if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
4929 bi_print_shader(ctx, stdout);
4930
4931 if (ctx->arch >= 9) {
4932 va_assign_slots(ctx);
4933 va_insert_flow_control_nops(ctx);
4934 va_merge_flow(ctx);
4935 va_mark_last(ctx);
4936 } else {
4937 bi_schedule(ctx);
4938 bi_assign_scoreboard(ctx);
4939
4940 /* Analyze after scheduling since we depend on instruction
4941 * order. Valhall does this as part of va_insert_flow_control_nops,
4942 * since clauses are handled differently from individual instructions.
4943 */
4944 bi_analyze_helper_terminate(ctx);
4945 bi_mark_clauses_td(ctx);
4946 }
4947
4948 if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
4949 bi_print_shader(ctx, stdout);
4950
4951 if (ctx->arch <= 8) {
4952 bi_pack_clauses(ctx, binary, offset);
4953 } else {
4954 bi_pack_valhall(ctx, binary);
4955 }
4956
4957 if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
4958 if (ctx->arch <= 8) {
4959 disassemble_bifrost(stdout, binary->data + offset,
4960 binary->size - offset,
4961 bifrost_debug & BIFROST_DBG_VERBOSE);
4962 } else {
4963 disassemble_valhall(stdout, binary->data + offset,
4964 binary->size - offset,
4965 bifrost_debug & BIFROST_DBG_VERBOSE);
4966 }
4967
4968 fflush(stdout);
4969 }
4970
4971 if (!skip_internal &&
4972 ((bifrost_debug & BIFROST_DBG_SHADERDB) || inputs->debug)) {
4973 char *shaderdb;
4974
4975 if (ctx->arch >= 9) {
4976 shaderdb = va_print_stats(ctx, binary->size - offset);
4977 } else {
4978 shaderdb = bi_print_stats(ctx, binary->size - offset);
4979 }
4980
4981 if (bifrost_debug & BIFROST_DBG_SHADERDB)
4982 fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
4983
4984 if (inputs->debug)
4985 util_debug_message(inputs->debug, SHADER_INFO, "%s", shaderdb);
4986
4987 ralloc_free(shaderdb);
4988 }
4989
4990 return ctx;
4991 }
4992
4993 static void
4994 bi_compile_variant(nir_shader *nir,
4995 const struct panfrost_compile_inputs *inputs,
4996 struct util_dynarray *binary, struct pan_shader_info *info,
4997 enum bi_idvs_mode idvs)
4998 {
4999 struct bi_shader_info local_info = {
5000 .push = &info->push,
5001 .bifrost = &info->bifrost,
5002 .tls_size = info->tls_size,
5003 .push_offset = info->push.count,
5004 };
5005
5006 unsigned offset = binary->size;
5007
5008 /* If there is no position shader (gl_Position is not written), then
5009 * there is no need to build a varying shader either. This case is hit
5010 * for transform feedback only vertex shaders which only make sense with
5011 * rasterizer discard.
5012 */
5013 if ((offset == 0) && (idvs == BI_IDVS_VARYING))
5014 return;
5015
5016 /* Software invariant: Only a secondary shader can appear at a nonzero
5017 * offset, to keep the ABI simple. */
5018 assert((offset == 0) ^ (idvs == BI_IDVS_VARYING));
5019
5020 bi_context *ctx =
5021 bi_compile_variant_nir(nir, inputs, binary, local_info, idvs);
5022
5023 /* A register is preloaded <==> it is live before the first block */
5024 bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
5025 uint64_t preload = first_block->reg_live_in;
5026
5027 /* If multisampling is used with a blend shader, the blend shader needs
5028 * to access the sample coverage mask in r60 and the sample ID in r61.
5029 * Blend shaders run in the same context as fragment shaders, so if a
5030 * blend shader could run, we need to preload these registers
5031 * conservatively. There is believed to be little cost to doing so, so
5032 * do so always to avoid variants of the preload descriptor.
5033 *
5034 * We only do this on Valhall, as Bifrost has to update the RSD for
5035 * multisampling w/ blend shader anyway, so this is handled in the
5036 * driver. We could unify the paths if the cost is acceptable.
5037 */
5038 if (nir->info.stage == MESA_SHADER_FRAGMENT && ctx->arch >= 9)
5039 preload |= BITFIELD64_BIT(60) | BITFIELD64_BIT(61);
5040
   info->ubo_mask |= ctx->ubo_mask;
   info->tls_size = MAX2(info->tls_size, ctx->info.tls_size);

   if (idvs == BI_IDVS_VARYING) {
      info->vs.secondary_enable = (binary->size > offset);
      info->vs.secondary_offset = offset;
      info->vs.secondary_preload = preload;
      info->vs.secondary_work_reg_count = ctx->info.work_reg_count;
   } else {
      info->preload = preload;
      info->work_reg_count = ctx->info.work_reg_count;
   }

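   /* An IDVS position shader that writes gl_PointSize additionally gets a
    * second packed copy with the psiz write removed, its offset recorded in
    * info->vs.no_psiz_offset. Since bi_should_idvs() rejects IDVS on Bifrost
    * when psiz is written, this path is only reached on Valhall, hence the
    * unconditional bi_pack_valhall() below. */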
   if (idvs == BI_IDVS_POSITION && !nir->info.internal &&
       nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ)) {
      /* Find the psiz write */
      bi_instr *write = NULL;

      bi_foreach_instr_global(ctx, I) {
         if (I->op == BI_OPCODE_STORE_I16 && I->seg == BI_SEG_POS) {
            write = I;
            break;
         }
      }

      assert(write != NULL);

      /* NOP it out, preserving its flow control. TODO: maybe DCE */
      if (write->flow) {
         bi_builder b = bi_init_builder(ctx, bi_before_instr(write));
         bi_instr *nop = bi_nop(&b);
         nop->flow = write->flow;
      }

      bi_remove_instruction(write);

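      /* Pack the trimmed shader immediately after the primary binary and
       * remember where it starts. */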
      info->vs.no_psiz_offset = binary->size;
      bi_pack_valhall(ctx, binary);
   }

   ralloc_free(ctx);
}

/* Decide if Index-Driven Vertex Shading should be used for a given shader */
static bool
bi_should_idvs(nir_shader *nir, const struct panfrost_compile_inputs *inputs)
{
   /* Opt-out */
   if (inputs->no_idvs || bifrost_debug & BIFROST_DBG_NOIDVS)
      return false;

   /* IDVS splits up vertex shaders; it is not defined for other shader
    * stages */
   if (nir->info.stage != MESA_SHADER_VERTEX)
      return false;

   /* Bifrost cannot write gl_PointSize during IDVS */
   if ((inputs->gpu_id < 0x9000) &&
       nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ))
      return false;

   /* Otherwise, IDVS is usually better */
   return true;
}

void
bifrost_compile_shader_nir(nir_shader *nir,
                           const struct panfrost_compile_inputs *inputs,
                           struct util_dynarray *binary,
                           struct pan_shader_info *info)
{
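   /* Latch the BIFROST_MESA_DEBUG flags before doing anything else */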
   bifrost_debug = debug_get_option_bifrost_debug();

   /* Combine stores late, to give the driver a chance to lower dual-source
    * blending as regular store_output intrinsics.
    */
   NIR_PASS_V(nir, pan_nir_lower_zs_store);

   bi_optimize_nir(nir, inputs->gpu_id, inputs->is_blend);

   info->tls_size = nir->scratch_size;
   info->vs.idvs = bi_should_idvs(nir, inputs);

   pan_nir_collect_varyings(nir, info);

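   /* With IDVS, the vertex shader is compiled twice: a position variant and
    * a varying variant, packed back to back in the same binary. Otherwise, a
    * single monolithic variant is compiled. */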
   if (info->vs.idvs) {
      bi_compile_variant(nir, inputs, binary, info, BI_IDVS_POSITION);
      bi_compile_variant(nir, inputs, binary, info, BI_IDVS_VARYING);
   } else {
      bi_compile_variant(nir, inputs, binary, info, BI_IDVS_NONE);
   }

   if (gl_shader_stage_is_compute(nir->info.stage)) {
      /* Workgroups may be merged if the structure of the workgroup is
       * not software visible. This is true if neither shared memory
       * nor barriers are used. The hardware may be able to optimize
       * compute shaders that set this flag.
       */
      info->cs.allow_merging_workgroups = (nir->info.shared_size == 0) &&
                                          !nir->info.uses_control_barrier &&
                                          !nir->info.uses_memory_barrier;
   }

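   /* Mask off UBO bits beyond the UBOs the shader actually declares */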
   info->ubo_mask &= (1 << nir->info.num_ubos) - 1;
}