1 /*
2 * Copyright (C) 2020 Collabora Ltd.
3 * Copyright (C) 2022 Alyssa Rosenzweig <alyssa@rosenzweig.io>
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 *
24 * Authors (Collabora):
25 * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
26 */
27
28 #include "compiler/glsl/glsl_to_nir.h"
29 #include "compiler/glsl_types.h"
30 #include "compiler/nir/nir_builder.h"
31 #include "util/u_debug.h"
32
33 #include "bifrost/disassemble.h"
34 #include "panfrost/lib/pan_props.h"
35 #include "valhall/disassemble.h"
36 #include "valhall/va_compiler.h"
37 #include "bi_builder.h"
38 #include "bi_quirks.h"
39 #include "bifrost_compile.h"
40 #include "bifrost_nir.h"
41 #include "compiler.h"
42
43 /* clang-format off */
44 static const struct debug_named_value bifrost_debug_options[] = {
45 {"msgs", BIFROST_DBG_MSGS, "Print debug messages"},
46 {"shaders", BIFROST_DBG_SHADERS, "Dump shaders in NIR and MIR"},
47 {"shaderdb", BIFROST_DBG_SHADERDB, "Print statistics"},
48 {"verbose", BIFROST_DBG_VERBOSE, "Disassemble verbosely"},
49 {"internal", BIFROST_DBG_INTERNAL, "Dump even internal shaders"},
50 {"nosched", BIFROST_DBG_NOSCHED, "Force trivial bundling"},
51 {"nopsched", BIFROST_DBG_NOPSCHED, "Disable scheduling for pressure"},
52 {"inorder", BIFROST_DBG_INORDER, "Force in-order bundling"},
53 {"novalidate", BIFROST_DBG_NOVALIDATE, "Skip IR validation"},
54 {"noopt", BIFROST_DBG_NOOPT, "Skip optimization passes"},
55 {"noidvs", BIFROST_DBG_NOIDVS, "Disable IDVS"},
56 {"nosb", BIFROST_DBG_NOSB, "Disable scoreboarding"},
57 {"nopreload", BIFROST_DBG_NOPRELOAD, "Disable message preloading"},
58 {"spill", BIFROST_DBG_SPILL, "Test register spilling"},
59 DEBUG_NAMED_VALUE_END
60 };
61 /* clang-format on */
62
63 DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG",
64 bifrost_debug_options, 0)
65
/* How many bytes are prefetched by the Bifrost shader core. Starting from the
 * final clause of the shader, this range must contain valid instructions or
 * zeroes. */
68 #define BIFROST_SHADER_PREFETCH 128
69
70 int bifrost_debug = 0;
71
72 #define DBG(fmt, ...) \
73 do { \
74 if (bifrost_debug & BIFROST_DBG_MSGS) \
75 fprintf(stderr, "%s:%d: " fmt, __func__, __LINE__, ##__VA_ARGS__); \
76 } while (0)
77
78 static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);
79
80 static bi_index
bi_preload(bi_builder *b, unsigned reg)
82 {
83 if (bi_is_null(b->shader->preloaded[reg])) {
84 /* Insert at the beginning of the shader */
85 bi_builder b_ = *b;
86 b_.cursor = bi_before_block(bi_start_block(&b->shader->blocks));
87
88 /* Cache the result */
89 b->shader->preloaded[reg] = bi_mov_i32(&b_, bi_register(reg));
90 }
91
92 return b->shader->preloaded[reg];
93 }
94
95 static bi_index
bi_coverage(bi_builder *b)
97 {
98 if (bi_is_null(b->shader->coverage))
99 b->shader->coverage = bi_preload(b, 60);
100
101 return b->shader->coverage;
102 }
103
104 /*
 * Vertex ID and Instance ID are preloaded registers. The registers they are
 * preloaded into changed from Bifrost to Valhall. Provide helpers that smooth
 * over the architectural difference.
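 *
 * On Bifrost, the vertex ID is preloaded into r61 and the instance ID into
 * r62; on Valhall (arch >= 9) they move to r60 and r61, with r62 carrying
 * the draw ID.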
108 */
109 static inline bi_index
bi_vertex_id(bi_builder *b)
111 {
112 return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61);
113 }
114
115 static inline bi_index
bi_instance_id(bi_builder *b)
117 {
118 return bi_preload(b, (b->shader->arch >= 9) ? 61 : 62);
119 }
120
121 static inline bi_index
bi_draw_id(bi_builder *b)
123 {
124 assert(b->shader->arch >= 9);
125 return bi_preload(b, 62);
126 }
127
128 static void
bi_emit_jump(bi_builder *b, nir_jump_instr *instr)
130 {
131 bi_instr *branch = bi_jump(b, bi_zero());
132
133 switch (instr->type) {
134 case nir_jump_break:
135 branch->branch_target = b->shader->break_block;
136 break;
137 case nir_jump_continue:
138 branch->branch_target = b->shader->continue_block;
139 break;
140 default:
141 unreachable("Unhandled jump type");
142 }
143
144 bi_block_add_successor(b->shader->current_block, branch->branch_target);
145 b->shader->current_block->unconditional_jumps = true;
146 }
147
148 /* Builds a 64-bit hash table key for an index */
149 static uint64_t
bi_index_to_key(bi_index idx)
151 {
152 static_assert(sizeof(idx) <= sizeof(uint64_t), "too much padding");
153
154 uint64_t key = 0;
155 memcpy(&key, &idx, sizeof(idx));
156 return key;
157 }
158
159 /*
160 * Extract a single channel out of a vector source. We split vectors with SPLIT
161 * so we can use the split components directly, without emitting an extract.
 * This has advantages for RA, as the split can usually be optimized away.
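 *
 * For example (sketch), after bi_emit_cached_split_i32(b, vec, 2), both
 *
 *    bi_index lo = bi_extract(b, vec, 0);
 *    bi_index hi = bi_extract(b, vec, 1);
 *
 * simply return the cached SPLIT destinations without emitting new
 * instructions.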
163 */
164 static bi_index
bi_extract(bi_builder *b, bi_index vec, unsigned channel)
166 {
167 bi_index *components = _mesa_hash_table_u64_search(b->shader->allocated_vec,
168 bi_index_to_key(vec));
169
170 /* No extract needed for scalars.
171 *
172 * This is a bit imprecise, but actual bugs (missing splits for vectors)
173 * should be caught by the following assertion. It is too difficult to
174 * ensure bi_extract is only called for real vectors.
175 */
176 if (components == NULL && channel == 0)
177 return vec;
178
179 assert(components != NULL && "missing bi_cache_collect()");
180 return components[channel];
181 }
182
183 static void
bi_cache_collect(bi_builder *b, bi_index dst, bi_index *s, unsigned n)
185 {
186 /* Lifetime of a hash table entry has to be at least as long as the table */
187 bi_index *channels = ralloc_array(b->shader, bi_index, n);
188 memcpy(channels, s, sizeof(bi_index) * n);
189
190 _mesa_hash_table_u64_insert(b->shader->allocated_vec, bi_index_to_key(dst),
191 channels);
192 }
193
194 /*
195 * Splits an n-component vector (vec) into n scalar destinations (dests) using a
196 * split pseudo-instruction.
197 *
198 * Pre-condition: dests is filled with bi_null().
199 */
200 static void
bi_emit_split_i32(bi_builder *b, bi_index dests[4], bi_index vec, unsigned n)
202 {
203 /* Setup the destinations */
204 for (unsigned i = 0; i < n; ++i) {
205 dests[i] = bi_temp(b->shader);
206 }
207
208 /* Emit the split */
209 if (n == 1) {
210 bi_mov_i32_to(b, dests[0], vec);
211 } else {
212 bi_instr *I = bi_split_i32_to(b, n, vec);
213
214 bi_foreach_dest(I, j)
215 I->dest[j] = dests[j];
216 }
217 }
218
219 static void
bi_emit_cached_split_i32(bi_builder *b, bi_index vec, unsigned n)
221 {
222 bi_index dests[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
223 bi_emit_split_i32(b, dests, vec, n);
224 bi_cache_collect(b, vec, dests, n);
225 }
226
227 /*
228 * Emit and cache a split for a vector of a given bitsize. The vector may not be
229 * composed of 32-bit words, but it will be split at 32-bit word boundaries.
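 *
 * For example, a 4 x 16-bit vector (64 bits total) is split into
 * DIV_ROUND_UP(64, 32) = 2 words.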
230 */
231 static void
bi_emit_cached_split(bi_builder *b, bi_index vec, unsigned bits)
233 {
234 bi_emit_cached_split_i32(b, vec, DIV_ROUND_UP(bits, 32));
235 }
236
237 static void
bi_split_def(bi_builder *b, nir_def *def)
239 {
240 bi_emit_cached_split(b, bi_def_index(def),
241 def->bit_size * def->num_components);
242 }
243
244 static bi_instr *
bi_emit_collect_to(bi_builder *b, bi_index dst, bi_index *chan, unsigned n)
246 {
247 /* Special case: COLLECT of a single value is a scalar move */
248 if (n == 1)
249 return bi_mov_i32_to(b, dst, chan[0]);
250
251 bi_instr *I = bi_collect_i32_to(b, dst, n);
252
253 bi_foreach_src(I, i)
254 I->src[i] = chan[i];
255
256 bi_cache_collect(b, dst, chan, n);
257 return I;
258 }
259
260 static bi_instr *
bi_collect_v2i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1)
262 {
263 return bi_emit_collect_to(b, dst, (bi_index[]){s0, s1}, 2);
264 }
265
266 static bi_instr *
bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1,
268 bi_index s2)
269 {
270 return bi_emit_collect_to(b, dst, (bi_index[]){s0, s1, s2}, 3);
271 }
272
273 static bi_index
bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1)
275 {
276 bi_index dst = bi_temp(b->shader);
277 bi_collect_v2i32_to(b, dst, s0, s1);
278 return dst;
279 }
280
281 static bi_index
bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
283 {
284 switch (intr->intrinsic) {
285 case nir_intrinsic_load_barycentric_centroid:
286 case nir_intrinsic_load_barycentric_sample:
287 return bi_preload(b, 61);
288
289 /* Need to put the sample ID in the top 16-bits */
290 case nir_intrinsic_load_barycentric_at_sample:
291 return bi_mkvec_v2i16(b, bi_half(bi_dontcare(b), false),
292 bi_half(bi_src_index(&intr->src[0]), false));
293
294 /* Interpret as 8:8 signed fixed point positions in pixels along X and
295 * Y axes respectively, relative to top-left of pixel. In NIR, (0, 0)
296 * is the center of the pixel so we first fixup and then convert. For
297 * fp16 input:
298 *
299 * f2i16(((x, y) + (0.5, 0.5)) * 2**8) =
300 * f2i16((256 * (x, y)) + (128, 128)) =
301 * V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128))
302 *
303 * For fp32 input, that lacks enough precision for MSAA 16x, but the
304 * idea is the same. FIXME: still doesn't pass
305 */
306 case nir_intrinsic_load_barycentric_at_offset: {
307 bi_index offset = bi_src_index(&intr->src[0]);
308 bi_index f16 = bi_null();
309 unsigned sz = nir_src_bit_size(intr->src[0]);
310
311 if (sz == 16) {
312 f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0), bi_imm_f16(128.0));
313 } else {
314 assert(sz == 32);
315 bi_index f[2];
316 for (unsigned i = 0; i < 2; ++i) {
317 f[i] =
318 bi_fadd_rscale_f32(b, bi_extract(b, offset, i), bi_imm_f32(0.5),
319 bi_imm_u32(8), BI_SPECIAL_NONE);
320 }
321
322 f16 = bi_v2f32_to_v2f16(b, f[0], f[1]);
323 }
324
325 return bi_v2f16_to_v2s16(b, f16);
326 }
327
328 case nir_intrinsic_load_barycentric_pixel:
329 default:
330 return b->shader->arch >= 9 ? bi_preload(b, 61) : bi_dontcare(b);
331 }
332 }
333
334 static enum bi_sample
bi_interp_for_intrinsic(nir_intrinsic_op op)
336 {
337 switch (op) {
338 case nir_intrinsic_load_barycentric_centroid:
339 return BI_SAMPLE_CENTROID;
340 case nir_intrinsic_load_barycentric_sample:
341 case nir_intrinsic_load_barycentric_at_sample:
342 return BI_SAMPLE_SAMPLE;
343 case nir_intrinsic_load_barycentric_at_offset:
344 return BI_SAMPLE_EXPLICIT;
345 case nir_intrinsic_load_barycentric_pixel:
346 default:
347 return BI_SAMPLE_CENTER;
348 }
349 }
350
351 /* auto, 64-bit omitted */
352 static enum bi_register_format
bi_reg_fmt_for_nir(nir_alu_type T)
354 {
355 switch (T) {
356 case nir_type_float16:
357 return BI_REGISTER_FORMAT_F16;
358 case nir_type_float32:
359 return BI_REGISTER_FORMAT_F32;
360 case nir_type_int16:
361 return BI_REGISTER_FORMAT_S16;
362 case nir_type_uint16:
363 return BI_REGISTER_FORMAT_U16;
364 case nir_type_int32:
365 return BI_REGISTER_FORMAT_S32;
366 case nir_type_uint32:
367 return BI_REGISTER_FORMAT_U32;
368 default:
369 unreachable("Invalid type for register format");
370 }
371 }
372
373 static bool
va_is_valid_const_narrow_index(bi_index idx)
375 {
376 if (idx.type != BI_INDEX_CONSTANT)
377 return false;
378
379 unsigned index = pan_res_handle_get_index(idx.value);
380 unsigned table_index = pan_res_handle_get_table(idx.value);
381
382 return index < 1024 && va_is_valid_const_table(table_index);
383 }
384
/* Checks if the _IMM variant of an intrinsic can be used, returning in
 * *immediate* the value to be used (which applies even if _IMM can't be used) */
387
388 static bool
bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate,
390 unsigned max)
391 {
392 nir_src *offset = nir_get_io_offset_src(instr);
393
394 if (!nir_src_is_const(*offset))
395 return false;
396
397 *immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
398 return (*immediate) < max;
399 }
400
401 static bool
bi_is_imm_desc_handle(bi_builder *b, nir_intrinsic_instr *instr,
403 uint32_t *immediate, unsigned max)
404 {
405 nir_src *offset = nir_get_io_offset_src(instr);
406
407 if (!nir_src_is_const(*offset))
408 return false;
409
410 if (b->shader->arch >= 9) {
411 uint32_t res_handle =
412 nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
413 uint32_t table_index = pan_res_handle_get_table(res_handle);
414 uint32_t res_index = pan_res_handle_get_index(res_handle);
415
416 if (!va_is_valid_const_table(table_index) || res_index >= max)
417 return false;
418
419 *immediate = res_handle;
420 return true;
421 }
422
423 return bi_is_intr_immediate(instr, immediate, max);
424 }
425
426 static bool
bi_is_imm_var_desc_handle(bi_builder *b, nir_intrinsic_instr *instr,
428 uint32_t *immediate)
429 {
430 unsigned max = b->shader->arch >= 9 ? 256 : 20;
431
432 return bi_is_imm_desc_handle(b, instr, immediate, max);
433 }
434
435 static void bi_make_vec_to(bi_builder *b, bi_index final_dst, bi_index *src,
436 unsigned *channel, unsigned count, unsigned bitsize);
437
438 /* Bifrost's load instructions lack a component offset despite operating in
439 * terms of vec4 slots. Usually I/O vectorization avoids nonzero components,
440 * but they may be unavoidable with separate shaders in use. To solve this, we
441 * lower to a larger load and an explicit copy of the desired components. */
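/* For example, a load of components 2-3 of a vec4 slot becomes a full load
 * into a temporary, followed by bi_make_vec_to() copying channels 2 and 3
 * into the destination (see bi_copy_component). */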
442
443 static void
bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp)
445 {
446 unsigned component = nir_intrinsic_component(instr);
447 unsigned nr = instr->num_components;
448 unsigned total = nr + component;
449 unsigned bitsize = instr->def.bit_size;
450
451 assert(total <= 4 && "should be vec4");
452 bi_emit_cached_split(b, tmp, total * bitsize);
453
454 if (component == 0)
455 return;
456
457 bi_index srcs[] = {tmp, tmp, tmp};
458 unsigned channels[] = {component, component + 1, component + 2};
459
460 bi_make_vec_to(b, bi_def_index(&instr->def), srcs, channels, nr,
461 instr->def.bit_size);
462 }
463
464 static void
bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
466 {
467 bi_index vertex_id =
468 instr->intrinsic == nir_intrinsic_load_attribute_pan ?
469 bi_src_index(&instr->src[0]) :
470 bi_vertex_id(b);
471 bi_index instance_id =
472 instr->intrinsic == nir_intrinsic_load_attribute_pan ?
473 bi_src_index(&instr->src[1]) :
474 bi_instance_id(b);
475
476 /* Disregard the signedness of an integer, since loading 32-bits into a
477 * 32-bit register should be bit exact so should not incur any clamping.
478 *
479 * If we are reading as a u32, then it must be paired with an integer (u32 or
480 * s32) source, so use .auto32 to disregard.
481 */
482 nir_alu_type T = nir_intrinsic_dest_type(instr);
483 assert(T == nir_type_uint32 || T == nir_type_int32 || T == nir_type_float32);
484 enum bi_register_format regfmt =
485 T == nir_type_float32 ? BI_REGISTER_FORMAT_F32 : BI_REGISTER_FORMAT_AUTO;
486
487 nir_src *offset = nir_get_io_offset_src(instr);
488 unsigned component = nir_intrinsic_component(instr);
489 enum bi_vecsize vecsize = (instr->num_components + component - 1);
490 unsigned imm_index = 0;
491 unsigned base = nir_intrinsic_base(instr);
492 bool constant = nir_src_is_const(*offset);
493 bool immediate = bi_is_imm_desc_handle(b, instr, &imm_index, 16);
494 bi_index dest =
495 (component == 0) ? bi_def_index(&instr->def) : bi_temp(b->shader);
496 bi_instr *I;
497
498 if (immediate) {
499 I = bi_ld_attr_imm_to(b, dest, vertex_id, instance_id, regfmt,
500 vecsize, pan_res_handle_get_index(imm_index));
501
502 if (b->shader->arch >= 9)
503 I->table = va_res_fold_table_idx(pan_res_handle_get_table(base));
504 } else {
505 bi_index idx = bi_src_index(&instr->src[0]);
506
507 if (constant)
508 idx = bi_imm_u32(imm_index);
509 else if (base != 0)
510 idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
511
512 I = bi_ld_attr_to(b, dest, vertex_id, instance_id, idx, regfmt, vecsize);
513 }
514
515 bi_copy_component(b, instr, dest);
516 }
517
518 /*
519 * ABI: Special (desktop GL) slots come first, tightly packed. General varyings
520 * come later, sparsely packed. This handles both linked and separable shaders
521 * with a common code path, with minimal keying only for desktop GL. Each slot
522 * consumes 16 bytes (TODO: fp16, partial vectors).
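 *
 * For example, with two special slots set in fixed_varying_mask, VAR0 starts
 * at byte 16 * 2 = 32 and VAR3 at byte 16 * (2 + 3) = 80.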
523 */
524 static unsigned
bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr)
526 {
527 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
528 uint32_t mask = ctx->inputs->fixed_varying_mask;
529
530 if (sem.location >= VARYING_SLOT_VAR0) {
531 unsigned nr_special = util_bitcount(mask);
532 unsigned general_index = (sem.location - VARYING_SLOT_VAR0);
533
534 return 16 * (nr_special + general_index);
535 } else {
536 return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location)));
537 }
538 }
539
540 /*
541 * Compute the offset in bytes of a varying with an immediate offset, adding the
542 * offset to the base computed above. Convenience method.
543 */
544 static unsigned
bi_varying_offset(bi_context *ctx, nir_intrinsic_instr *intr)
546 {
547 nir_src *src = nir_get_io_offset_src(intr);
548 assert(nir_src_is_const(*src) && "assumes immediate offset");
549
550 return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16);
551 }
552
553 static void
bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
555 {
556 enum bi_sample sample = BI_SAMPLE_CENTER;
557 enum bi_update update = BI_UPDATE_STORE;
558 enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
559 bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input;
560 bi_index src0 = bi_null();
561
562 unsigned component = nir_intrinsic_component(instr);
563 enum bi_vecsize vecsize = (instr->num_components + component - 1);
564 bi_index dest =
565 (component == 0) ? bi_def_index(&instr->def) : bi_temp(b->shader);
566
567 unsigned sz = instr->def.bit_size;
568
569 if (smooth) {
570 nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
571 assert(parent);
572
573 sample = bi_interp_for_intrinsic(parent->intrinsic);
574 src0 = bi_varying_src0_for_barycentric(b, parent);
575
576 assert(sz == 16 || sz == 32);
577 regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16 : BI_REGISTER_FORMAT_F32;
578 } else {
579 assert(sz == 32);
580 regfmt = BI_REGISTER_FORMAT_U32;
581
582 /* Valhall can't have bi_null() here, although the source is
583 * logically unused for flat varyings
584 */
585 if (b->shader->arch >= 9)
586 src0 = bi_preload(b, 61);
587
588 /* Gather info as we go */
589 b->shader->info.bifrost->uses_flat_shading = true;
590 }
591
592 nir_src *offset = nir_get_io_offset_src(instr);
593 unsigned imm_index = 0;
594 bool immediate = bi_is_imm_var_desc_handle(b, instr, &imm_index);
595 unsigned base = nir_intrinsic_base(instr);
596
597 /* Only use LD_VAR_BUF[_IMM] if explicitly told by the driver
598 * through a compiler input value, falling back to LD_VAR[_IMM] +
599 * Attribute Descriptors otherwise. */
600 bool use_ld_var_buf =
601 b->shader->malloc_idvs && b->shader->inputs->valhall.use_ld_var_buf;
602
603 if (use_ld_var_buf) {
604 enum bi_source_format source_format =
605 smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32;
606
607 if (immediate) {
608 /* Immediate index given in bytes. */
609 bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
610 update, vecsize,
611 bi_varying_offset(b->shader, instr));
612 } else {
613 bi_index idx = bi_src_index(offset);
614 /* Index needs to be in bytes, but NIR gives the index
615 * in slots. For now assume 16 bytes per element.
616 */
617 bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
618 unsigned vbase = bi_varying_base_bytes(b->shader, instr);
619
620 if (vbase != 0)
idx_bytes = bi_iadd_u32(b, idx_bytes, bi_imm_u32(vbase), false);
622
623 bi_ld_var_buf_to(b, sz, dest, src0, idx_bytes, regfmt, sample,
624 source_format, update, vecsize);
625 }
626 } else {
627 /* On Valhall, ensure the table and index are valid for usage with
628 * immediate form when IDVS isn't used */
629 if (b->shader->arch >= 9)
630 immediate &= va_is_valid_const_table(pan_res_handle_get_table(base)) &&
631 pan_res_handle_get_index(base) < 256;
632
633 if (immediate) {
634 bi_instr *I;
635
636 if (smooth) {
637 I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, vecsize,
638 pan_res_handle_get_index(imm_index));
639 } else {
640 I =
641 bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, vecsize,
642 pan_res_handle_get_index(imm_index));
643 }
644
645 /* Valhall usually uses LD_VAR_BUF. If this is disabled, use a simple
646 * Midgard-style ABI. */
647 if (b->shader->arch >= 9)
648 I->table = va_res_fold_table_idx(pan_res_handle_get_table(base));
649 } else {
650 bi_index idx = bi_src_index(offset);
651
652 if (base != 0)
653 idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
654
655 if (smooth)
656 bi_ld_var_to(b, dest, src0, idx, regfmt, sample, update, vecsize);
657 else
658 bi_ld_var_flat_to(b, dest, idx, BI_FUNCTION_NONE, regfmt, vecsize);
659 }
660 }
661
662 bi_copy_component(b, instr, dest);
663 }
664
665 static bi_index
bi_make_vec8_helper(bi_builder *b, bi_index *src, unsigned *channel,
667 unsigned count)
668 {
669 assert(1 <= count && count <= 4);
670
671 bi_index bytes[4] = {bi_imm_u8(0), bi_imm_u8(0), bi_imm_u8(0), bi_imm_u8(0)};
672
673 for (unsigned i = 0; i < count; ++i) {
674 unsigned chan = channel ? channel[i] : 0;
675 unsigned lane = chan & 3;
676 bi_index raw_data = bi_extract(b, src[i], chan >> 2);
677
678 /* On Bifrost, MKVEC.v4i8 cannot select b1 or b3 */
679 if (b->shader->arch < 9 && lane != 0 && lane != 2) {
680 bytes[i] = bi_byte(bi_rshift_or(b, 32, raw_data, bi_zero(),
681 bi_imm_u8(lane * 8), false),
682 0);
683 } else {
684 bytes[i] = bi_byte(raw_data, lane);
685 }
686
687 assert(b->shader->arch >= 9 || bytes[i].swizzle == BI_SWIZZLE_B0000 ||
688 bytes[i].swizzle == BI_SWIZZLE_B2222);
689 }
690
691 if (b->shader->arch >= 9) {
692 bi_index vec = bi_zero();
693
694 if (count >= 3)
695 vec = bi_mkvec_v2i8(b, bytes[2], bytes[3], vec);
696
697 return bi_mkvec_v2i8(b, bytes[0], bytes[1], vec);
698 } else {
699 return bi_mkvec_v4i8(b, bytes[0], bytes[1], bytes[2], bytes[3]);
700 }
701 }
702
703 static bi_index
bi_make_vec16_helper(bi_builder *b, bi_index *src, unsigned *channel,
705 unsigned count)
706 {
707 unsigned chan0 = channel ? channel[0] : 0;
708 bi_index w0 = bi_extract(b, src[0], chan0 >> 1);
709 bi_index h0 = bi_half(w0, chan0 & 1);
710
711 /* Zero extend */
712 if (count == 1)
713 return bi_mkvec_v2i16(b, h0, bi_imm_u16(0));
714
715 /* Else, create a vector */
716 assert(count == 2);
717
718 unsigned chan1 = channel ? channel[1] : 0;
719 bi_index w1 = bi_extract(b, src[1], chan1 >> 1);
720 bi_index h1 = bi_half(w1, chan1 & 1);
721
722 if (bi_is_word_equiv(w0, w1) && (chan0 & 1) == 0 && ((chan1 & 1) == 1))
723 return bi_mov_i32(b, w0);
724 else if (bi_is_word_equiv(w0, w1))
725 return bi_swz_v2i16(b, bi_swz_16(w0, chan0 & 1, chan1 & 1));
726 else
727 return bi_mkvec_v2i16(b, h0, h1);
728 }
729
730 static void
bi_make_vec_to(bi_builder *b, bi_index dst, bi_index *src, unsigned *channel,
732 unsigned count, unsigned bitsize)
733 {
734 assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
735 unsigned shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
736 unsigned chan_per_word = 1 << shift;
737
738 assert(DIV_ROUND_UP(count * bitsize, 32) <= BI_MAX_SRCS &&
739 "unnecessarily large vector should have been lowered");
740
741 bi_index srcs[BI_MAX_VEC];
742
743 for (unsigned i = 0; i < count; i += chan_per_word) {
744 unsigned rem = MIN2(count - i, chan_per_word);
745 unsigned *channel_offset = channel ? (channel + i) : NULL;
746
747 if (bitsize == 32)
748 srcs[i] = bi_extract(b, src[i], channel_offset ? *channel_offset : 0);
749 else if (bitsize == 16)
750 srcs[i >> 1] = bi_make_vec16_helper(b, src + i, channel_offset, rem);
751 else
752 srcs[i >> 2] = bi_make_vec8_helper(b, src + i, channel_offset, rem);
753 }
754
755 bi_emit_collect_to(b, dst, srcs, DIV_ROUND_UP(count, chan_per_word));
756 }
757
758 static inline bi_instr *
bi_load_ubo_to(bi_builder *b, unsigned bitsize, bi_index dest0, bi_index src0,
760 bi_index src1)
761 {
762 bi_instr *I;
763
764 if (b->shader->arch >= 9) {
765 I = bi_ld_buffer_to(b, bitsize, dest0, src0, src1);
766 I->seg = BI_SEG_UBO;
767 } else {
768 I = bi_load_to(b, bitsize, dest0, src0, src1, BI_SEG_UBO, 0);
769 }
770
771 bi_emit_cached_split(b, dest0, bitsize);
772 return I;
773 }
774
775 static void
bi_load_sample_id_to(bi_builder *b, bi_index dst)
777 {
778 /* r61[16:23] contains the sampleID, mask it out. Upper bits
779 * seem to read garbage (despite being architecturally defined
780 * as zero), so use a 5-bit mask instead of 8-bits */
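   /* i.e. the RSHIFT_AND below computes sample_id = (r61 >> 16) & 0x1f */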
781
782 bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f),
783 bi_imm_u8(16), false);
784 }
785
786 static bi_index
bi_load_sample_id(bi_builder *b)
788 {
789 bi_index sample_id = bi_temp(b->shader);
790 bi_load_sample_id_to(b, sample_id);
791 return sample_id;
792 }
793
794 static bi_index
bi_pixel_indices(bi_builder *b, unsigned rt)
796 {
797 /* We want to load the current pixel. */
798 struct bifrost_pixel_indices pix = {.y = BIFROST_CURRENT_PIXEL, .rt = rt};
799
800 uint32_t indices_u32 = 0;
801 memcpy(&indices_u32, &pix, sizeof(indices_u32));
802 bi_index indices = bi_imm_u32(indices_u32);
803
804 /* Sample index above is left as zero. For multisampling, we need to
805 * fill in the actual sample ID in the lower byte */
806
807 if (b->shader->inputs->blend.nr_samples > 1)
808 indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false);
809
810 return indices;
811 }
812
813 /* Source color is passed through r0-r3, or r4-r7 for the second source when
814 * dual-source blending. Preload the corresponding vector.
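 *
 * A 32-bit (f32) color occupies all four registers; an f16 color packs two
 * halves per register, so only two registers are collected.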
815 */
816 static void
bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr)
818 {
819 nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
820 unsigned base = (sem.location == VARYING_SLOT_VAR0) ? 4 : 0;
821 unsigned size = nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr));
822 assert(size == 16 || size == 32);
823
824 bi_index srcs[] = {bi_preload(b, base + 0), bi_preload(b, base + 1),
825 bi_preload(b, base + 2), bi_preload(b, base + 3)};
826
827 bi_emit_collect_to(b, bi_def_index(&instr->def), srcs, size == 32 ? 4 : 2);
828 }
829
830 static void
bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, bi_index rgba2,
832 nir_alu_type T2, unsigned rt)
833 {
834 /* Reads 2 or 4 staging registers to cover the input */
835 unsigned size = nir_alu_type_get_type_size(T);
836 unsigned size_2 = nir_alu_type_get_type_size(T2);
837 unsigned sr_count = (size <= 16) ? 2 : 4;
838 unsigned sr_count_2 = (size_2 <= 16) ? 2 : 4;
839 const struct panfrost_compile_inputs *inputs = b->shader->inputs;
840 uint64_t blend_desc = inputs->blend.bifrost_blend_desc;
841 enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
842
843 /* Workaround for NIR-to-TGSI */
844 if (b->shader->nir->info.fs.untyped_color_outputs)
845 regfmt = BI_REGISTER_FORMAT_AUTO;
846
847 if (inputs->is_blend && inputs->blend.nr_samples > 1) {
848 /* Conversion descriptor comes from the compile inputs, pixel
849 * indices derived at run time based on sample ID */
850 bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_coverage(b),
851 bi_imm_u32(blend_desc >> 32), regfmt, BI_VECSIZE_V4);
852 } else if (b->shader->inputs->is_blend) {
853 uint64_t blend_desc = b->shader->inputs->blend.bifrost_blend_desc;
854
855 /* Blend descriptor comes from the compile inputs */
856 /* Put the result in r0 */
857
858 bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b),
859 bi_imm_u32(blend_desc), bi_imm_u32(blend_desc >> 32),
860 bi_null(), regfmt, sr_count, 0);
861 } else {
862 /* Blend descriptor comes from the FAU RAM. By convention, the
863 * return address on Bifrost is stored in r48 and will be used
864 * by the blend shader to jump back to the fragment shader */
865
866 bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b),
867 bi_fau(BIR_FAU_BLEND_0 + rt, false),
868 bi_fau(BIR_FAU_BLEND_0 + rt, true), rgba2, regfmt, sr_count,
869 sr_count_2);
870 }
871
872 assert(rt < 8);
873 b->shader->info.bifrost->blend[rt].type = T;
874
875 if (T2)
876 b->shader->info.bifrost->blend_src1_type = T2;
877 }
878
879 /* Blend shaders do not need to run ATEST since they are dependent on a
880 * fragment shader that runs it. Blit shaders may not need to run ATEST, since
881 * ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and
882 * there are no writes to the coverage mask. The latter two are satisfied for
883 * all blit shaders, so we just care about early-z, which blit shaders force
884 * iff they do not write depth or stencil */
885
886 static bool
bi_skip_atest(bi_context *ctx, bool emit_zs)
888 {
889 return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend;
890 }
891
892 static void
bi_emit_atest(bi_builder *b, bi_index alpha)
894 {
895 b->shader->coverage =
896 bi_atest(b, bi_coverage(b), alpha, bi_fau(BIR_FAU_ATEST_PARAM, false));
897 b->shader->emitted_atest = true;
898 }
899
900 static bi_index
bi_src_color_vec4(bi_builder *b, nir_src *src, nir_alu_type T)
902 {
903 unsigned num_components = nir_src_num_components(*src);
904 bi_index base = bi_src_index(src);
905
906 /* short-circuit the common case */
907 if (num_components == 4)
908 return base;
909
910 unsigned size = nir_alu_type_get_type_size(T);
911 assert(size == 16 || size == 32);
912
913 bi_index src_vals[4];
914
915 unsigned i;
916 for (i = 0; i < num_components; i++)
917 src_vals[i] = bi_extract(b, base, i);
918
919 for (; i < 3; i++)
920 src_vals[i] = (size == 16) ? bi_imm_f16(0.0) : bi_imm_f32(0.0);
921 src_vals[3] = (size == 16) ? bi_imm_f16(1.0) : bi_imm_f32(1.0);
922 bi_index temp = bi_temp(b->shader);
923 bi_make_vec_to(b, temp, src_vals, NULL, 4, size);
924 return temp;
925 }
926
927 static void
bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)
929 {
930 bool combined = instr->intrinsic == nir_intrinsic_store_combined_output_pan;
931
932 unsigned writeout =
933 combined ? nir_intrinsic_component(instr) : PAN_WRITEOUT_C;
934
935 bool emit_blend = writeout & (PAN_WRITEOUT_C);
936 bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S);
937
938 unsigned loc = nir_intrinsic_io_semantics(instr).location;
939 bi_index src0 = bi_src_index(&instr->src[0]);
940
941 /* By ISA convention, the coverage mask is stored in R60. The store
942 * itself will be handled by a subsequent ATEST instruction */
943 if (loc == FRAG_RESULT_SAMPLE_MASK) {
944 b->shader->coverage = bi_extract(b, src0, 0);
945 return;
946 }
947
948 /* Emit ATEST if we have to, note ATEST requires a floating-point alpha
949 * value, but render target #0 might not be floating point. However the
950 * alpha value is only used for alpha-to-coverage, a stage which is
951 * skipped for pure integer framebuffers, so the issue is moot. */
952
953 if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) {
954 nir_alu_type T = nir_intrinsic_src_type(instr);
955
956 bi_index rgba = bi_src_index(&instr->src[0]);
957 bi_index alpha;
958
959 if (nir_src_num_components(instr->src[0]) < 4) {
960 /* Don't read out-of-bounds */
961 alpha = bi_imm_f32(1.0);
962 } else if (T == nir_type_float16) {
963 alpha = bi_half(bi_extract(b, rgba, 1), true);
964 } else if (T == nir_type_float32) {
965 alpha = bi_extract(b, rgba, 3);
966 } else {
967 alpha = bi_dontcare(b);
968 }
969 bi_emit_atest(b, alpha);
970 }
971
972 if (emit_zs) {
973 bi_index z = bi_dontcare(b), s = bi_dontcare(b);
974
975 if (writeout & PAN_WRITEOUT_Z)
976 z = bi_src_index(&instr->src[2]);
977
978 if (writeout & PAN_WRITEOUT_S)
979 s = bi_src_index(&instr->src[3]);
980
981 b->shader->coverage =
982 bi_zs_emit(b, z, s, bi_coverage(b), writeout & PAN_WRITEOUT_S,
983 writeout & PAN_WRITEOUT_Z);
984 }
985
986 if (emit_blend) {
987 unsigned rt = loc ? (loc - FRAG_RESULT_DATA0) : 0;
988 bool dual = (writeout & PAN_WRITEOUT_2);
989 nir_alu_type T = nir_intrinsic_src_type(instr);
990 nir_alu_type T2 = dual ? nir_intrinsic_dest_type(instr) : 0;
991 bi_index color = bi_src_color_vec4(b, &instr->src[0], T);
992 bi_index color2 =
993 dual ? bi_src_color_vec4(b, &instr->src[4], T2) : bi_null();
994
995 if (instr->intrinsic == nir_intrinsic_store_output &&
996 loc >= FRAG_RESULT_DATA0 && loc <= FRAG_RESULT_DATA7) {
997 assert(nir_src_is_const(instr->src[1]) && "no indirect outputs");
998
999 unsigned rt_offs = nir_src_as_uint(instr->src[1]);
1000
1001 assert(rt + rt_offs < 8 && "RT not in the [0-7] range");
1002 rt += rt_offs;
1003 }
1004
1005 /* Explicit copy since BLEND inputs are precoloured to R0-R3,
1006 * TODO: maybe schedule around this or implement in RA as a
1007 * spill */
1008 bool has_mrt =
1009 (b->shader->nir->info.outputs_written >> FRAG_RESULT_DATA1);
1010
1011 if (has_mrt) {
1012 bi_index srcs[4] = {color, color, color, color};
1013 unsigned channels[4] = {0, 1, 2, 3};
1014 color = bi_temp(b->shader);
1015 bi_make_vec_to(
1016 b, color, srcs, channels, nir_src_num_components(instr->src[0]),
1017 nir_alu_type_get_type_size(nir_intrinsic_src_type(instr)));
1018 }
1019
1020 bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr), color2, T2, rt);
1021 }
1022
1023 if (b->shader->inputs->is_blend) {
1024 /* Jump back to the fragment shader, return address is stored
1025 * in r48 (see above). On Valhall, only jump if the address is
1026 * nonzero. The check is free there and it implements the "jump
1027 * to 0 terminates the blend shader" that's automatic on
1028 * Bifrost.
1029 */
1030 if (b->shader->arch >= 8)
1031 bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE);
1032 else
1033 bi_jump(b, bi_preload(b, 48));
1034 }
1035 }
1036
1037 /**
1038 * In a vertex shader, is the specified variable a position output? These kinds
1039 * of outputs are written from position shaders when IDVS is enabled. All other
1040 * outputs are written from the varying shader.
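 *
 * For example, a gl_Position store is kept in the position variant
 * (BI_IDVS_POSITION) and removed from the varying variant, while a store to a
 * generic varying is kept only in the varying variant.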
1041 */
1042 static bool
bi_should_remove_store(nir_intrinsic_instr *intr, enum bi_idvs_mode idvs)
1044 {
1045 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
1046
1047 switch (sem.location) {
1048 case VARYING_SLOT_POS:
1049 case VARYING_SLOT_PSIZ:
1050 case VARYING_SLOT_LAYER:
1051 return idvs == BI_IDVS_VARYING;
1052 default:
1053 return idvs == BI_IDVS_POSITION;
1054 }
1055 }
1056
1057 static bool
bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data)
1059 {
1060 enum bi_idvs_mode *idvs = data;
1061
1062 if (instr->type != nir_instr_type_intrinsic)
1063 return false;
1064
1065 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1066
1067 if (intr->intrinsic != nir_intrinsic_store_output &&
1068 intr->intrinsic != nir_intrinsic_store_per_view_output)
1069 return false;
1070
1071 if (bi_should_remove_store(intr, *idvs)) {
1072 nir_instr_remove(instr);
1073 return true;
1074 }
1075
1076 return false;
1077 }
1078
1079 static void
bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
1081 {
1082 /* In principle we can do better for 16-bit. At the moment we require
1083 * 32-bit to permit the use of .auto, in order to force .u32 for flat
1084 * varyings, to handle internal TGSI shaders that set flat in the VS
1085 * but smooth in the FS */
1086
1087 ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr);
1088 ASSERTED unsigned T_size = nir_alu_type_get_type_size(T);
1089 assert(T_size == 32 || (b->shader->arch >= 9 && T_size == 16));
1090 enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
1091
1092 unsigned imm_index = 0;
1093 bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
1094
1095 /* Only look at the total components needed. In effect, we fill in all
1096 * the intermediate "holes" in the write mask, since we can't mask off
1097 * stores. Since nir_lower_io_to_temporaries ensures each varying is
1098 * written at most once, anything that's masked out is undefined, so it
1099 * doesn't matter what we write there. So we may as well do the
1100 * simplest thing possible. */
1101 unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr));
1102 assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0));
1103
1104 bi_index data = bi_src_index(&instr->src[0]);
1105
1106 /* To keep the vector dimensions consistent, we need to drop some
1107 * components. This should be coalesced.
1108 *
1109 * TODO: This is ugly and maybe inefficient. Would we rather
1110 * introduce a TRIM.i32 pseudoinstruction?
1111 */
1112 if (nr < nir_intrinsic_src_components(instr, 0)) {
1113 assert(T_size == 32 && "todo: 16-bit trim");
1114
1115 bi_index chans[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
1116 unsigned src_comps = nir_intrinsic_src_components(instr, 0);
1117
1118 bi_emit_split_i32(b, chans, data, src_comps);
1119
1120 bi_index tmp = bi_temp(b->shader);
1121 bi_instr *collect = bi_collect_i32_to(b, tmp, nr);
1122
1123 bi_foreach_src(collect, w)
1124 collect->src[w] = chans[w];
1125
1126 data = tmp;
1127 }
1128
1129 bool psiz =
1130 (nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PSIZ);
1131 bool layer =
1132 (nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_LAYER);
1133
1134 bi_index a[4] = {bi_null()};
1135
1136 if (b->shader->arch <= 8 && b->shader->idvs == BI_IDVS_POSITION) {
1137 /* Bifrost position shaders have a fast path */
1138 assert(T == nir_type_float16 || T == nir_type_float32);
1139 unsigned regfmt = (T == nir_type_float16) ? 0 : 1;
1140 unsigned identity = (b->shader->arch == 6) ? 0x688 : 0;
1141 unsigned snap4 = 0x5E;
1142 uint32_t format = identity | (snap4 << 12) | (regfmt << 24);
1143
1144 bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59),
1145 bi_imm_u32(format), regfmt, nr - 1);
1146 } else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) {
1147 bi_index index = bi_preload(b, 59);
1148 unsigned index_offset = 0;
1149 unsigned pos_attr_offset = 0;
1150 unsigned src_bit_sz = nir_src_bit_size(instr->src[0]);
1151
1152 if (psiz || layer)
1153 index_offset += 4;
1154
1155 if (layer) {
1156 assert(nr == 1 && src_bit_sz == 32);
1157 src_bit_sz = 8;
1158 pos_attr_offset = 2;
1159 data = bi_byte(data, 0);
1160 }
1161
1162 if (psiz)
1163 assert(T_size == 16 && "should've been lowered");
1164
1165 bool varying = (b->shader->idvs == BI_IDVS_VARYING);
1166
1167 if (instr->intrinsic == nir_intrinsic_store_per_view_output) {
1168 unsigned view_index = nir_src_as_uint(instr->src[1]);
1169
1170 if (varying) {
1171 index_offset += view_index * 4;
1172 } else {
1173 /* We don't patch these offsets in the no_psiz variant, so if
1174 * multiview is enabled we can't switch to the basic format by
1175 * using no_psiz */
1176 bool extended_position_fifo = b->shader->nir->info.outputs_written &
1177 (VARYING_BIT_LAYER | VARYING_BIT_PSIZ);
1178 unsigned position_fifo_stride = extended_position_fifo ? 8 : 4;
1179 index_offset += view_index * position_fifo_stride;
1180 }
1181 }
1182
1183 if (index_offset != 0)
1184 index = bi_iadd_imm_i32(b, index, index_offset);
1185
/* On Valhall with IDVS, varyings are stored in a hardware-controlled
 * buffer accessed through table 61 at index 0 */
1188 bi_index address = bi_temp(b->shader);
1189 bi_instr *I = bi_lea_buf_imm_to(b, address, index);
1190 I->table = va_res_fold_table_idx(61);
1191 I->index = 0;
1192 bi_emit_split_i32(b, a, address, 2);
1193
1194 bi_store(b, nr * src_bit_sz, data, a[0], a[1],
1195 varying ? BI_SEG_VARY : BI_SEG_POS,
1196 varying ? bi_varying_offset(b->shader, instr) : pos_attr_offset);
1197 } else if (immediate) {
1198 bi_index address = bi_lea_attr_imm(b, bi_vertex_id(b), bi_instance_id(b),
1199 regfmt, imm_index);
1200 bi_emit_split_i32(b, a, address, 3);
1201
1202 bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
1203 } else {
1204 bi_index idx = bi_iadd_u32(b, bi_src_index(nir_get_io_offset_src(instr)),
1205 bi_imm_u32(nir_intrinsic_base(instr)), false);
1206 bi_index address =
1207 bi_lea_attr(b, bi_vertex_id(b), bi_instance_id(b), idx, regfmt);
1208 bi_emit_split_i32(b, a, address, 3);
1209
1210 bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
1211 }
1212 }
1213
1214 static void
bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
1216 {
1217 nir_src *offset = nir_get_io_offset_src(instr);
1218
1219 bool offset_is_const = nir_src_is_const(*offset);
1220 bi_index dyn_offset = bi_src_index(offset);
1221 uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0;
1222
1223 bi_load_ubo_to(b, instr->num_components * instr->def.bit_size,
1224 bi_def_index(&instr->def),
1225 offset_is_const ? bi_imm_u32(const_offset) : dyn_offset,
1226 bi_src_index(&instr->src[0]));
1227 }
1228
1229 static void
bi_emit_load_push_constant(bi_builder *b, nir_intrinsic_instr *instr)
1231 {
1232 assert(b->shader->inputs->no_ubo_to_push && "can't mix push constant forms");
1233
1234 nir_src *offset = &instr->src[0];
1235 assert(!nir_intrinsic_base(instr) && "base must be zero");
1236 assert(!nir_intrinsic_range(instr) && "range must be zero");
1237 assert(nir_src_is_const(*offset) && "no indirect push constants");
1238 uint32_t base = nir_src_as_uint(*offset);
1239 assert((base & 3) == 0 && "unaligned push constants");
1240
1241 unsigned bits = instr->def.bit_size * instr->def.num_components;
1242
1243 unsigned n = DIV_ROUND_UP(bits, 32);
1244 assert(n <= 4);
1245 bi_index channels[4] = {bi_null()};
1246
1247 for (unsigned i = 0; i < n; ++i) {
1248 unsigned word = (base >> 2) + i;
1249
1250 channels[i] = bi_fau(BIR_FAU_UNIFORM | (word >> 1), word & 1);
1251 }
1252
1253 bi_emit_collect_to(b, bi_def_index(&instr->def), channels, n);
1254
1255 /* Update push->count to report the highest push constant word being accessed
1256 * by this shader.
1257 */
1258 b->shader->info.push->count =
1259 MAX2((base / 4) + n, b->shader->info.push->count);
1260 }
1261
1262 static bi_index
bi_addr_high(bi_builder *b, nir_src *src)
1264 {
1265 return (nir_src_bit_size(*src) == 64) ? bi_extract(b, bi_src_index(src), 1)
1266 : bi_zero();
1267 }
1268
1269 static void
bi_handle_segment(bi_builder *b, bi_index *addr_lo, bi_index *addr_hi,
1271 enum bi_seg seg, int16_t *offset)
1272 {
1273 /* Not needed on Bifrost or for global accesses */
1274 if (b->shader->arch < 9 || seg == BI_SEG_NONE)
1275 return;
1276
1277 /* There is no segment modifier on Valhall. Instead, we need to
1278 * emit the arithmetic ourselves. We do have an offset
1279 * available, which saves an instruction for constant offsets.
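 *
 * For example, a WLS access at constant address 12 becomes an access with
 * the WLS base pointer as its address and 12 folded into the 16-bit offset,
 * with no extra IADD.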
1280 */
1281 bool wls = (seg == BI_SEG_WLS);
1282 assert(wls || (seg == BI_SEG_TL));
1283
1284 enum bir_fau fau = wls ? BIR_FAU_WLS_PTR : BIR_FAU_TLS_PTR;
1285
1286 bi_index base_lo = bi_fau(fau, false);
1287
1288 if (offset && addr_lo->type == BI_INDEX_CONSTANT &&
1289 addr_lo->value == (int16_t)addr_lo->value) {
1290 *offset = addr_lo->value;
1291 *addr_lo = base_lo;
1292 } else {
1293 *addr_lo = bi_iadd_u32(b, base_lo, *addr_lo, false);
1294 }
1295
1296 /* Do not allow overflow for WLS or TLS */
1297 *addr_hi = bi_fau(fau, true);
1298 }
1299
1300 static void
bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
1302 {
1303 int16_t offset = 0;
1304 unsigned bits = instr->num_components * instr->def.bit_size;
1305 bi_index dest = bi_def_index(&instr->def);
1306 bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[0]), 0);
1307 bi_index addr_hi = bi_addr_high(b, &instr->src[0]);
1308
1309 bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
1310
1311 bi_load_to(b, bits, dest, addr_lo, addr_hi, seg, offset);
1312 bi_emit_cached_split(b, dest, bits);
1313 }
1314
1315 static void
bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
1317 {
/* Require contiguous masks, guaranteed by nir_lower_wrmasks */
1319 assert(nir_intrinsic_write_mask(instr) ==
1320 BITFIELD_MASK(instr->num_components));
1321
1322 int16_t offset = 0;
1323 bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[1]), 0);
1324 bi_index addr_hi = bi_addr_high(b, &instr->src[1]);
1325
1326 bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
1327
1328 bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]),
1329 bi_src_index(&instr->src[0]), addr_lo, addr_hi, seg, offset);
1330 }
1331
1332 /* Exchanges the staging register with memory */
1333
1334 static void
bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg,
1336 enum bi_seg seg)
1337 {
1338 assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
1339
1340 unsigned sz = nir_src_bit_size(*arg);
1341 assert(sz == 32 || sz == 64);
1342
1343 bi_index data = bi_src_index(arg);
1344
1345 bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
1346
1347 if (b->shader->arch >= 9)
1348 bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
1349 else if (seg == BI_SEG_WLS)
1350 addr_hi = bi_zero();
1351
1352 bi_axchg_to(b, sz, dst, data, bi_extract(b, addr, 0), addr_hi, seg);
1353 }
1354
1355 /* Exchanges the second staging register with memory if comparison with first
1356 * staging register passes */
1357
1358 static void
bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1,
1360 nir_src *arg_2, enum bi_seg seg)
1361 {
1362 assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
1363
1364 /* hardware is swapped from NIR */
1365 bi_index src0 = bi_src_index(arg_2);
1366 bi_index src1 = bi_src_index(arg_1);
1367
1368 unsigned sz = nir_src_bit_size(*arg_1);
1369 assert(sz == 32 || sz == 64);
1370
1371 bi_index data_words[] = {
1372 bi_extract(b, src0, 0),
1373 sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src0, 1),
1374
1375 /* 64-bit */
1376 bi_extract(b, src1, 0),
1377 sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src1, 1),
1378 };
1379
1380 bi_index in = bi_temp(b->shader);
1381 bi_emit_collect_to(b, in, data_words, 2 * (sz / 32));
1382 bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
1383
1384 if (b->shader->arch >= 9)
1385 bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
1386 else if (seg == BI_SEG_WLS)
1387 addr_hi = bi_zero();
1388
1389 bi_index out = bi_acmpxchg(b, sz, in, bi_extract(b, addr, 0), addr_hi, seg);
1390 bi_emit_cached_split(b, out, sz);
1391
1392 bi_index inout_words[] = {bi_extract(b, out, 0),
1393 sz == 64 ? bi_extract(b, out, 1) : bi_null()};
1394
1395 bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
1396 }
1397
1398 static enum bi_atom_opc
bi_atom_opc_for_nir(nir_atomic_op op)
1400 {
1401 /* clang-format off */
1402 switch (op) {
1403 case nir_atomic_op_iadd: return BI_ATOM_OPC_AADD;
1404 case nir_atomic_op_imin: return BI_ATOM_OPC_ASMIN;
1405 case nir_atomic_op_umin: return BI_ATOM_OPC_AUMIN;
1406 case nir_atomic_op_imax: return BI_ATOM_OPC_ASMAX;
1407 case nir_atomic_op_umax: return BI_ATOM_OPC_AUMAX;
1408 case nir_atomic_op_iand: return BI_ATOM_OPC_AAND;
1409 case nir_atomic_op_ior: return BI_ATOM_OPC_AOR;
1410 case nir_atomic_op_ixor: return BI_ATOM_OPC_AXOR;
1411 default: unreachable("Unexpected computational atomic");
1412 }
1413 /* clang-format on */
1414 }
1415
1416 /* Optimized unary atomics are available with an implied #1 argument */
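/* For example, an atomic add of constant +1 is promoted to AINC and an add of
 * -1 to ADEC (see bi_promote_atom_c1 below). */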
1417
1418 static bool
bi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out)
1420 {
1421 /* Check we have a compatible constant */
1422 if (arg.type != BI_INDEX_CONSTANT)
1423 return false;
1424
1425 if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD)))
1426 return false;
1427
1428 /* Check for a compatible operation */
1429 switch (op) {
1430 case BI_ATOM_OPC_AADD:
1431 *out = (arg.value == 1) ? BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC;
1432 return true;
1433 case BI_ATOM_OPC_ASMAX:
1434 *out = BI_ATOM_OPC_ASMAX1;
1435 return true;
1436 case BI_ATOM_OPC_AUMAX:
1437 *out = BI_ATOM_OPC_AUMAX1;
1438 return true;
1439 case BI_ATOM_OPC_AOR:
1440 *out = BI_ATOM_OPC_AOR1;
1441 return true;
1442 default:
1443 return false;
1444 }
1445 }
1446
1447 /*
1448 * Coordinates are 16-bit integers in Bifrost but 32-bit in NIR. We need to
1449 * translate between these forms (with MKVEC.v2i16).
1450 *
 * Additionally, on Valhall, cube maps in the attribute pipe are treated as 2D
 * arrays. For uniform handling, we also treat 3D textures like 2D arrays.
 *
 * Our indexing needs to reflect this. Since Valhall and Bifrost are quite
1455 * different, we provide separate functions for these.
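 *
 * For example, a 2D array coordinate (x, y, layer) is packed as
 * src0 = MKVEC.v2i16(x, y) on both architectures, while the layer goes in
 * src1: as a plain 32-bit index on Bifrost, or in the upper half of a v2i16
 * on Valhall.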
1456 */
1457 static bi_index
bi_emit_image_coord(bi_builder *b, bi_index coord, unsigned src_idx,
1459 unsigned coord_comps, bool is_array, bool is_msaa)
1460 {
1461 assert(coord_comps > 0 && coord_comps <= 3);
1462
1463 /* MSAA load store should have been lowered */
1464 assert(!is_msaa);
1465 if (src_idx == 0) {
1466 if (coord_comps == 1 || (coord_comps == 2 && is_array))
1467 return bi_extract(b, coord, 0);
1468 else
1469 return bi_mkvec_v2i16(b, bi_half(bi_extract(b, coord, 0), false),
1470 bi_half(bi_extract(b, coord, 1), false));
1471 } else {
1472 if (coord_comps == 3)
1473 return bi_extract(b, coord, 2);
1474 else if (coord_comps == 2 && is_array)
1475 return bi_extract(b, coord, 1);
1476 else
1477 return bi_zero();
1478 }
1479 }
1480
1481 static bi_index
va_emit_image_coord(bi_builder *b, bi_index coord, bi_index sample_index,
1483 unsigned src_idx, unsigned coord_comps, bool is_array,
1484 bool is_msaa)
1485 {
1486 assert(coord_comps > 0 && coord_comps <= 3);
1487 if (src_idx == 0) {
1488 if (coord_comps == 1 || (coord_comps == 2 && is_array))
1489 return bi_extract(b, coord, 0);
1490 else
1491 return bi_mkvec_v2i16(b, bi_half(bi_extract(b, coord, 0), false),
1492 bi_half(bi_extract(b, coord, 1), false));
1493 } else if (is_msaa) {
1494 bi_index array_idx = bi_extract(b, sample_index, 0);
1495 if (coord_comps == 3)
1496 return bi_mkvec_v2i16(b, bi_half(array_idx, false),
1497 bi_half(bi_extract(b, coord, 2), false));
1498 else if (coord_comps == 2)
1499 return array_idx;
1500 } else if (coord_comps == 3 && is_array) {
1501 return bi_mkvec_v2i16(b, bi_imm_u16(0),
1502 bi_half(bi_extract(b, coord, 2), false));
1503 } else if (coord_comps == 3 && !is_array) {
1504 return bi_mkvec_v2i16(b, bi_half(bi_extract(b, coord, 2), false),
1505 bi_imm_u16(0));
1506 } else if (coord_comps == 2 && is_array) {
1507 return bi_mkvec_v2i16(b, bi_imm_u16(0),
1508 bi_half(bi_extract(b, coord, 1), false));
1509 }
1510 return bi_zero();
1511 }
1512
1513 static void
bi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr)
1515 {
1516 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
1517 unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
1518 bool array =
1519 nir_intrinsic_image_array(instr) || dim == GLSL_SAMPLER_DIM_CUBE;
1520
1521 bi_index coords = bi_src_index(&instr->src[1]);
1522 bi_index indexvar = bi_src_index(&instr->src[2]);
1523 bi_index xy, zw;
1524 bool is_ms = (dim == GLSL_SAMPLER_DIM_MS);
1525 if (b->shader->arch < 9) {
1526 xy = bi_emit_image_coord(b, coords, 0, coord_comps, array, is_ms);
1527 zw = bi_emit_image_coord(b, coords, 1, coord_comps, array, is_ms);
1528 } else {
1529 xy =
1530 va_emit_image_coord(b, coords, indexvar, 0, coord_comps, array, is_ms);
1531 zw =
1532 va_emit_image_coord(b, coords, indexvar, 1, coord_comps, array, is_ms);
1533 }
1534 bi_index dest = bi_def_index(&instr->def);
1535 enum bi_register_format regfmt =
1536 bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr));
1537 enum bi_vecsize vecsize = instr->num_components - 1;
1538
1539 if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
1540 const unsigned raw_value = nir_src_as_uint(instr->src[0]);
1541 const unsigned table_index = pan_res_handle_get_table(raw_value);
1542 const unsigned texture_index = pan_res_handle_get_index(raw_value);
1543
1544 if (texture_index < 16 && va_is_valid_const_table(table_index)) {
1545 bi_instr *I =
1546 bi_ld_tex_imm_to(b, dest, xy, zw, regfmt, vecsize, texture_index);
1547 I->table = va_res_fold_table_idx(table_index);
1548 } else {
1549 bi_ld_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), regfmt,
1550 vecsize);
1551 }
1552 } else if (b->shader->arch >= 9) {
1553 bi_ld_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), regfmt,
1554 vecsize);
1555 } else {
1556 bi_ld_attr_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), regfmt,
1557 vecsize);
1558 }
1559
1560 bi_split_def(b, &instr->def);
1561 }
1562
1563 static void
1564 bi_emit_lea_image_to(bi_builder *b, bi_index dest, nir_intrinsic_instr *instr)
1565 {
1566 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
1567 bool array =
1568 nir_intrinsic_image_array(instr) || dim == GLSL_SAMPLER_DIM_CUBE;
1569 unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
1570
1571 enum bi_register_format type =
1572 (instr->intrinsic == nir_intrinsic_image_store)
1573 ? bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr))
1574 : BI_REGISTER_FORMAT_AUTO;
1575
1576 bi_index coords = bi_src_index(&instr->src[1]);
1577 bi_index indices = bi_src_index(&instr->src[2]);
1578 bi_index xy, zw;
1579 bool is_ms = dim == GLSL_SAMPLER_DIM_MS;
1580 if (b->shader->arch < 9) {
1581 xy = bi_emit_image_coord(b, coords, 0, coord_comps, array, is_ms);
1582 zw = bi_emit_image_coord(b, coords, 1, coord_comps, array, is_ms);
1583 } else {
1584 xy =
1585 va_emit_image_coord(b, coords, indices, 0, coord_comps, array, is_ms);
1586 zw =
1587 va_emit_image_coord(b, coords, indices, 1, coord_comps, array, is_ms);
1588 }
1589
1590 if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
1591 const unsigned raw_value = nir_src_as_uint(instr->src[0]);
1592 unsigned table_index = pan_res_handle_get_table(raw_value);
1593 unsigned texture_index = pan_res_handle_get_index(raw_value);
1594
1595 if (texture_index < 16 && va_is_valid_const_table(table_index)) {
1596 bi_instr *I = bi_lea_tex_imm_to(b, dest, xy, zw, false, texture_index);
1597 I->table = va_res_fold_table_idx(table_index);
1598 } else {
1599 bi_lea_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), false);
1600 }
1601 } else if (b->shader->arch >= 9) {
1602 bi_lea_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), false);
1603 } else {
1604 bi_instr *I = bi_lea_attr_tex_to(b, dest, xy, zw,
1605 bi_src_index(&instr->src[0]), type);
1606
1607 /* LEA_ATTR_TEX defaults to the secondary attribute table, but
1608 * our ABI has all images in the primary attribute table
1609 */
1610 I->table = BI_TABLE_ATTRIBUTE_1;
1611 }
1612
1613 bi_emit_cached_split(b, dest, 3 * 32);
1614 }
1615
1616 static bi_index
1617 bi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr)
1618 {
1619 bi_index dest = bi_temp(b->shader);
1620 bi_emit_lea_image_to(b, dest, instr);
1621 return dest;
1622 }
1623
1624 static void
1625 bi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr)
1626 {
1627 bi_index a[4] = {bi_null()};
1628 bi_emit_split_i32(b, a, bi_emit_lea_image(b, instr), 3);
1629
1630 /* Due to SPIR-V limitations, the source type is not fully reliable: it
1631 * reports uint32 even for write_imagei. This causes an incorrect
1632 * u32->s32->u32 roundtrip which incurs an unwanted clamping. Use auto32
1633 * instead, which will match per the OpenCL spec. Of course this does
1634 * not work for 16-bit stores, but those are not available in OpenCL.
1635 */
1636 nir_alu_type T = nir_intrinsic_src_type(instr);
1637 assert(nir_alu_type_get_type_size(T) == 32);
1638
1639 bi_st_cvt(b, bi_src_index(&instr->src[3]), a[0], a[1], a[2],
1640 BI_REGISTER_FORMAT_AUTO, instr->num_components - 1);
1641 }
1642
1643 static void
1644 bi_emit_atomic_i32_to(bi_builder *b, bi_index dst, bi_index addr, bi_index arg,
1645 nir_atomic_op op)
1646 {
1647 enum bi_atom_opc opc = bi_atom_opc_for_nir(op);
1648 enum bi_atom_opc post_opc = opc;
1649 bool bifrost = b->shader->arch <= 8;
1650
1651 /* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't
1652 * take any vector but can still output in RETURN mode */
1653 bi_index tmp_dest = bifrost ? bi_temp(b->shader) : dst;
1654 unsigned sr_count = bifrost ? 2 : 1;
1655
1656 /* Generate either ATOM or ATOM1 as required */
1657 if (bi_promote_atom_c1(opc, arg, &opc)) {
1658 bi_atom1_return_i32_to(b, tmp_dest, bi_extract(b, addr, 0),
1659 bi_extract(b, addr, 1), opc, sr_count);
1660 } else {
1661 bi_atom_return_i32_to(b, tmp_dest, arg, bi_extract(b, addr, 0),
1662 bi_extract(b, addr, 1), opc, sr_count);
1663 }
1664
1665 if (bifrost) {
1666 /* Post-process it */
1667 bi_emit_cached_split_i32(b, tmp_dest, 2);
1668 bi_atom_post_i32_to(b, dst, bi_extract(b, tmp_dest, 0),
1669 bi_extract(b, tmp_dest, 1), post_opc);
1670 }
1671 }
1672
1673 static void
1674 bi_emit_load_frag_coord_zw_pan(bi_builder *b, nir_intrinsic_instr *instr)
1675 {
1676 bi_index dst = bi_def_index(&instr->def);
1677 unsigned channel = nir_intrinsic_component(instr);
1678 nir_intrinsic_instr *bary = nir_src_as_intrinsic(instr->src[0]);
1679
1680 enum bi_sample sample = bi_interp_for_intrinsic(bary->intrinsic);
1681 bi_index src0 = bi_varying_src0_for_barycentric(b, bary);
1682
1683 /* .explicit is not supported with frag_z */
1684 if (channel == 2)
1685 assert(sample != BI_SAMPLE_EXPLICIT);
1686
1687 bi_ld_var_special_to(
1688 b, dst, src0, BI_REGISTER_FORMAT_F32, sample, BI_UPDATE_CLOBBER,
1689 (channel == 2) ? BI_VARYING_NAME_FRAG_Z : BI_VARYING_NAME_FRAG_W,
1690 BI_VECSIZE_NONE);
1691 }
1692
1693 static void
1694 bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
1695 {
1696 bi_index dest = bi_def_index(&instr->def);
1697 nir_alu_type T = nir_intrinsic_dest_type(instr);
1698 enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
1699 unsigned size = instr->def.bit_size;
1700 unsigned nr = instr->num_components;
1701
1702 /* Get the render target */
1703 nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
1704 unsigned loc = sem.location;
1705 assert(loc >= FRAG_RESULT_DATA0);
1706 unsigned rt = (loc - FRAG_RESULT_DATA0);
1707
1708 bi_ld_tile_to(b, dest, bi_pixel_indices(b, rt), bi_coverage(b),
1709 bi_src_index(&instr->src[0]), regfmt, nr - 1);
1710 bi_emit_cached_split(b, dest, size * nr);
1711 }
1712
1713 /*
1714 * Older Bifrost hardware has a limited CLPER instruction. Add a safe helper
1715 * that uses the hardware functionality if available and lowers otherwise.
1716 */
1717 static bi_index
1718 bi_clper(bi_builder *b, bi_index s0, bi_index s1, enum bi_lane_op lop)
1719 {
1720 if (b->shader->quirks & BIFROST_LIMITED_CLPER) {
1721 if (lop == BI_LANE_OP_XOR) {
1722 bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false);
1723 s1 = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0));
1724 } else {
1725 assert(lop == BI_LANE_OP_NONE);
1726 }
1727
1728 return bi_clper_old_i32(b, s0, s1);
1729 } else {
1730 return bi_clper_i32(b, s0, s1, BI_INACTIVE_RESULT_ZERO, lop,
1731 BI_SUBGROUP_SUBGROUP4);
1732 }
1733 }
1734
1735 static void
1736 bi_emit_derivative(bi_builder *b, bi_index dst, nir_intrinsic_instr *instr,
1737 unsigned axis, bool coarse)
1738 {
1739 bi_index left, right;
1740 bi_index s0 = bi_src_index(&instr->src[0]);
1741 unsigned sz = instr->def.bit_size;
1742
1743 /* If all uses are fabs, the sign of the derivative doesn't matter. This is
1744 * inherently based on fine derivatives so we can't do it for coarse.
1745 */
1746 if (nir_def_all_uses_ignore_sign_bit(&instr->def) && !coarse) {
1747 left = s0;
1748 right = bi_clper(b, s0, bi_imm_u8(axis), BI_LANE_OP_XOR);
1749 } else {
1750 bi_index lane1, lane2;
1751 if (coarse) {
1752 lane1 = bi_imm_u32(0);
1753 lane2 = bi_imm_u32(axis);
1754 } else {
1755 lane1 = bi_lshift_and_i32(b, bi_fau(BIR_FAU_LANE_ID, false),
1756 bi_imm_u32(0x3 & ~axis), bi_imm_u8(0));
1757
1758 lane2 = bi_iadd_u32(b, lane1, bi_imm_u32(axis), false);
1759 }
1760
1761 left = bi_clper(b, s0, bi_byte(lane1, 0), BI_LANE_OP_NONE);
1762 right = bi_clper(b, s0, bi_byte(lane2, 0), BI_LANE_OP_NONE);
1763 }
1764
1765 bi_fadd_to(b, sz, dst, right, bi_neg(left));
1766 }
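/* Informal picture of the lane arithmetic above (not from the original
 * source): the four invocations of a 2x2 quad are assumed to occupy lanes
 * 0-3, with axis = 1 selecting the horizontal neighbour and axis = 2 the
 * vertical one. For fine derivatives, lane1 = lane_id & ~axis is the
 * quad-local invocation on the "low" side of the axis and lane2 = lane1 + axis
 * is its neighbour, so the derivative is simply the difference of the two
 * lane-shuffled values (the final FADD with a negated operand).
 */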
1767
1768 static enum bi_subgroup
1769 bi_subgroup_from_cluster_size(unsigned cluster_size)
1770 {
1771 switch (cluster_size) {
1772 case 2: return BI_SUBGROUP_SUBGROUP2;
1773 case 4: return BI_SUBGROUP_SUBGROUP4;
1774 case 8: return BI_SUBGROUP_SUBGROUP8;
1775 case 16: return BI_SUBGROUP_SUBGROUP16;
1776 default: unreachable("Unsupported cluster size");
1777 }
1778 }
1779
1780 static void
1781 bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
1782 {
1783 bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest
1784 ? bi_def_index(&instr->def)
1785 : bi_null();
1786 gl_shader_stage stage = b->shader->stage;
1787
1788 switch (instr->intrinsic) {
1789 case nir_intrinsic_load_barycentric_pixel:
1790 case nir_intrinsic_load_barycentric_centroid:
1791 case nir_intrinsic_load_barycentric_sample:
1792 case nir_intrinsic_load_barycentric_at_sample:
1793 case nir_intrinsic_load_barycentric_at_offset:
1794 /* handled later via load_vary */
1795 break;
1796 case nir_intrinsic_load_attribute_pan:
1797 assert(stage == MESA_SHADER_VERTEX);
1798 bi_emit_load_attr(b, instr);
1799 break;
1800
1801 case nir_intrinsic_load_interpolated_input:
1802 case nir_intrinsic_load_input:
1803 if (b->shader->inputs->is_blend)
1804 bi_emit_load_blend_input(b, instr);
1805 else if (stage == MESA_SHADER_FRAGMENT)
1806 bi_emit_load_vary(b, instr);
1807 else if (stage == MESA_SHADER_VERTEX)
1808 bi_emit_load_attr(b, instr);
1809 else
1810 unreachable("Unsupported shader stage");
1811 break;
1812
1813 case nir_intrinsic_store_output:
1814 case nir_intrinsic_store_per_view_output:
1815 if (stage == MESA_SHADER_FRAGMENT)
1816 bi_emit_fragment_out(b, instr);
1817 else if (stage == MESA_SHADER_VERTEX)
1818 bi_emit_store_vary(b, instr);
1819 else
1820 unreachable("Unsupported shader stage");
1821 break;
1822
1823 case nir_intrinsic_store_combined_output_pan:
1824 assert(stage == MESA_SHADER_FRAGMENT);
1825 bi_emit_fragment_out(b, instr);
1826 break;
1827
1828 case nir_intrinsic_load_ubo:
1829 bi_emit_load_ubo(b, instr);
1830 break;
1831
1832 case nir_intrinsic_load_push_constant:
1833 bi_emit_load_push_constant(b, instr);
1834 break;
1835
1836 case nir_intrinsic_load_global:
1837 case nir_intrinsic_load_global_constant:
1838 bi_emit_load(b, instr, BI_SEG_NONE);
1839 break;
1840
1841 case nir_intrinsic_store_global:
1842 bi_emit_store(b, instr, BI_SEG_NONE);
1843 break;
1844
1845 case nir_intrinsic_load_scratch:
1846 bi_emit_load(b, instr, BI_SEG_TL);
1847 break;
1848
1849 case nir_intrinsic_store_scratch:
1850 bi_emit_store(b, instr, BI_SEG_TL);
1851 break;
1852
1853 case nir_intrinsic_load_shared:
1854 bi_emit_load(b, instr, BI_SEG_WLS);
1855 break;
1856
1857 case nir_intrinsic_store_shared:
1858 bi_emit_store(b, instr, BI_SEG_WLS);
1859 break;
1860
1861 case nir_intrinsic_barrier:
1862 switch (nir_intrinsic_execution_scope(instr)) {
1863 case SCOPE_NONE:
1864 /*
1865 * No execution barrier, and we don't have to do anything for memory
1866 * barriers (see SCOPE_WORKGROUP case.)
1867 */
1868 break;
1869
1870 case SCOPE_SUBGROUP:
1871 /*
1872 * To implement a subgroup barrier, we only need to prevent the
1873 * scheduler from reordering memory operations around the barrier.
1874 * Avail and vis are trivially established.
1875 */
1876 bi_memory_barrier(b);
1877 break;
1878
1879 case SCOPE_WORKGROUP:
1880 assert(b->shader->stage == MESA_SHADER_COMPUTE);
1881 bi_barrier(b);
1882 /*
1883 * Blob doesn't seem to do anything for memory barriers, so no need to
1884 * check nir_intrinsic_memory_scope().
1885 */
1886 break;
1887
1888 default:
1889 unreachable("Unsupported barrier scope");
1890 }
1891
1892 break;
1893
1894 case nir_intrinsic_shared_atomic: {
1895 nir_atomic_op op = nir_intrinsic_atomic_op(instr);
1896
1897 if (op == nir_atomic_op_xchg) {
1898 bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1899 BI_SEG_WLS);
1900 } else {
1901 assert(nir_src_bit_size(instr->src[1]) == 32);
1902
1903 bi_index addr = bi_src_index(&instr->src[0]);
1904 bi_index addr_hi;
1905
1906 if (b->shader->arch >= 9) {
1907 bi_handle_segment(b, &addr, &addr_hi, BI_SEG_WLS, NULL);
1908 addr = bi_collect_v2i32(b, addr, addr_hi);
1909 } else {
1910 addr = bi_seg_add_i64(b, addr, bi_zero(), false, BI_SEG_WLS);
1911 bi_emit_cached_split(b, addr, 64);
1912 }
1913
1914 bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]), op);
1915 }
1916
1917 bi_split_def(b, &instr->def);
1918 break;
1919 }
1920
1921 case nir_intrinsic_global_atomic: {
1922 nir_atomic_op op = nir_intrinsic_atomic_op(instr);
1923
1924 if (op == nir_atomic_op_xchg) {
1925 bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1926 BI_SEG_NONE);
1927 } else {
1928 assert(nir_src_bit_size(instr->src[1]) == 32);
1929
1930 bi_emit_atomic_i32_to(b, dst, bi_src_index(&instr->src[0]),
1931 bi_src_index(&instr->src[1]), op);
1932 }
1933
1934 bi_split_def(b, &instr->def);
1935 break;
1936 }
1937
1938 case nir_intrinsic_image_texel_address:
1939 bi_emit_lea_image_to(b, dst, instr);
1940 break;
1941
1942 case nir_intrinsic_image_load:
1943 bi_emit_image_load(b, instr);
1944 break;
1945
1946 case nir_intrinsic_image_store:
1947 bi_emit_image_store(b, instr);
1948 break;
1949
1950 case nir_intrinsic_global_atomic_swap:
1951 bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1952 &instr->src[2], BI_SEG_NONE);
1953 bi_split_def(b, &instr->def);
1954 break;
1955
1956 case nir_intrinsic_shared_atomic_swap:
1957 bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1958 &instr->src[2], BI_SEG_WLS);
1959 bi_split_def(b, &instr->def);
1960 break;
1961
1962 case nir_intrinsic_load_pixel_coord:
1963 /* Vectorized load of the preloaded i16vec2 */
1964 bi_mov_i32_to(b, dst, bi_preload(b, 59));
1965 break;
1966
1967 case nir_intrinsic_load_frag_coord_zw_pan:
1968 bi_emit_load_frag_coord_zw_pan(b, instr);
1969 break;
1970
1971 case nir_intrinsic_load_converted_output_pan:
1972 bi_emit_ld_tile(b, instr);
1973 break;
1974
1975 case nir_intrinsic_terminate_if:
1976 bi_discard_b32(b, bi_src_index(&instr->src[0]));
1977 break;
1978
1979 case nir_intrinsic_terminate:
1980 bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ);
1981 break;
1982
1983 case nir_intrinsic_load_sample_positions_pan:
1984 bi_collect_v2i32_to(b, dst, bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false),
1985 bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true));
1986 break;
1987
1988 case nir_intrinsic_load_sample_mask_in:
1989 /* r61[0:15] contains the coverage bitmap */
1990 bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false));
1991 break;
1992
1993 case nir_intrinsic_load_sample_mask:
1994 bi_mov_i32_to(b, dst, bi_coverage(b));
1995 break;
1996
1997 case nir_intrinsic_load_sample_id:
1998 bi_load_sample_id_to(b, dst);
1999 break;
2000
2001 case nir_intrinsic_load_front_face:
2002 /* r58 == 0 means primitive is front facing */
2003 bi_icmp_i32_to(b, dst, bi_preload(b, 58), bi_zero(), BI_CMPF_EQ,
2004 BI_RESULT_TYPE_M1);
2005 break;
2006
2007 case nir_intrinsic_load_point_coord:
2008 bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32,
2009 BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER,
2010 BI_VARYING_NAME_POINT, BI_VECSIZE_V2);
2011 bi_emit_cached_split_i32(b, dst, 2);
2012 break;
2013
2014 /* It appears vertex_id is zero-based with Bifrost geometry flows, but
2015 * not with Valhall's memory-allocation IDVS geometry flow. We only support
2016 * the new flow on Valhall so this is lowered in NIR.
2017 */
2018 case nir_intrinsic_load_vertex_id:
2019 assert(b->shader->malloc_idvs);
2020 bi_mov_i32_to(b, dst, bi_vertex_id(b));
2021 break;
2022
2023 case nir_intrinsic_load_raw_vertex_id_pan:
2024 assert(!b->shader->malloc_idvs);
2025 bi_mov_i32_to(b, dst, bi_vertex_id(b));
2026 break;
2027
2028 case nir_intrinsic_load_instance_id:
2029 bi_mov_i32_to(b, dst, bi_instance_id(b));
2030 break;
2031
2032 case nir_intrinsic_load_draw_id:
2033 bi_mov_i32_to(b, dst, bi_draw_id(b));
2034 break;
2035
2036 case nir_intrinsic_load_subgroup_invocation:
2037 bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false));
2038 break;
2039
2040 case nir_intrinsic_ballot:
2041 case nir_intrinsic_ballot_relaxed: {
2042 enum bi_subgroup subgroup =
2043 bi_subgroup_from_cluster_size(pan_subgroup_size(b->shader->arch));
2044 bi_wmask_to(b, dst, bi_src_index(&instr->src[0]), subgroup, 0);
2045 break;
2046 }
2047
2048 case nir_intrinsic_read_invocation: {
2049 enum bi_inactive_result inactive_result = BI_INACTIVE_RESULT_ZERO;
2050 enum bi_lane_op lane_op = BI_LANE_OP_NONE;
2051 enum bi_subgroup subgroup =
2052 bi_subgroup_from_cluster_size(pan_subgroup_size(b->shader->arch));
2053 bi_clper_i32_to(b, dst,
2054 bi_src_index(&instr->src[0]),
2055 bi_byte(bi_src_index(&instr->src[1]), 0),
2056 inactive_result, lane_op, subgroup);
2057 break;
2058 }
2059
2060 case nir_intrinsic_load_local_invocation_id:
2061 bi_collect_v3i32_to(b, dst,
2062 bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 0)),
2063 bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 1)),
2064 bi_u16_to_u32(b, bi_half(bi_preload(b, 56), 0)));
2065 break;
2066
2067 case nir_intrinsic_load_workgroup_id:
2068 bi_collect_v3i32_to(b, dst, bi_preload(b, 57), bi_preload(b, 58),
2069 bi_preload(b, 59));
2070 break;
2071
2072 case nir_intrinsic_load_global_invocation_id:
2073 bi_collect_v3i32_to(b, dst, bi_preload(b, 60), bi_preload(b, 61),
2074 bi_preload(b, 62));
2075 break;
2076
2077 case nir_intrinsic_shader_clock:
2078 bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER);
2079 bi_split_def(b, &instr->def);
2080 break;
2081
2082 case nir_intrinsic_ddx:
2083 case nir_intrinsic_ddx_fine:
2084 bi_emit_derivative(b, dst, instr, 1, false);
2085 break;
2086 case nir_intrinsic_ddx_coarse:
2087 bi_emit_derivative(b, dst, instr, 1, true);
2088 break;
2089 case nir_intrinsic_ddy:
2090 case nir_intrinsic_ddy_fine:
2091 bi_emit_derivative(b, dst, instr, 2, false);
2092 break;
2093 case nir_intrinsic_ddy_coarse:
2094 bi_emit_derivative(b, dst, instr, 2, true);
2095 break;
2096
2097 case nir_intrinsic_load_view_index:
2098 case nir_intrinsic_load_layer_id:
2099 assert(b->shader->arch >= 9);
2100 bi_mov_i32_to(b, dst, bi_u8_to_u32(b, bi_byte(bi_preload(b, 62), 0)));
2101 break;
2102
2103 case nir_intrinsic_load_ssbo_address:
2104 assert(b->shader->arch >= 9);
2105 bi_lea_buffer_to(b, dst, bi_src_index(&instr->src[1]),
2106 bi_src_index(&instr->src[0]));
2107 bi_emit_cached_split(b, dst, 64);
2108 break;
2109
2110 case nir_intrinsic_load_ssbo: {
2111 assert(b->shader->arch >= 9);
2112 unsigned dst_bits = instr->num_components * instr->def.bit_size;
2113 bi_ld_buffer_to(b, dst_bits, dst, bi_src_index(&instr->src[1]),
2114 bi_src_index(&instr->src[0]));
2115 bi_emit_cached_split(b, dst, dst_bits);
2116 break;
2117 }
2118
2119 case nir_intrinsic_as_uniform:
2120 /*
2121 * We don't have uniform registers (registers shared by all threads in the
2122 * warp) like some other hardware does so this is just a simple mov for
2123 * us.
2124 */
2125 bi_mov_i32_to(b, dst, bi_src_index(&instr->src[0]));
2126 break;
2127
2128 default:
2129 fprintf(stderr, "Unhandled intrinsic %s\n",
2130 nir_intrinsic_infos[instr->intrinsic].name);
2131 assert(0);
2132 }
2133 }
2134
2135 static void
2136 bi_emit_load_const(bi_builder *b, nir_load_const_instr *instr)
2137 {
2138 /* Make sure we've been lowered */
2139 assert(instr->def.num_components <= (32 / instr->def.bit_size));
2140
2141 /* Accumulate all the channels of the constant, as if we did an
2142 * implicit SEL over them */
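   /* For example (illustrative only): a 16-bit vec2 constant (0x1234, 0xabcd)
    * accumulates to 0xabcd1234 and is emitted as a single 32-bit move. */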
2143 uint32_t acc = 0;
2144
2145 for (unsigned i = 0; i < instr->def.num_components; ++i) {
2146 unsigned v =
2147 nir_const_value_as_uint(instr->value[i], instr->def.bit_size);
2148 acc |= (v << (i * instr->def.bit_size));
2149 }
2150
2151 bi_mov_i32_to(b, bi_get_index(instr->def.index), bi_imm_u32(acc));
2152 }
2153
2154 static bi_index
2155 bi_alu_src_index(bi_builder *b, nir_alu_src src, unsigned comps)
2156 {
2157 unsigned bitsize = nir_src_bit_size(src.src);
2158
2159 /* The bi_index carries the 32-bit (word) offset separately from the
2160 * subword swizzle; handle the offset first */
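   /* For example (illustrative only): swizzle component 3 of a 16-bit source
    * selects word 1 (3 >> 1) and the upper half (3 & 1), so the word offset is
    * resolved here and the half via idx.swizzle below. */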
2161
2162 unsigned offset = 0;
2163
2164 assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
2165 unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
2166
2167 for (unsigned i = 0; i < comps; ++i) {
2168 unsigned new_offset = (src.swizzle[i] >> subword_shift);
2169
2170 if (i > 0)
2171 assert(offset == new_offset && "wrong vectorization");
2172
2173 offset = new_offset;
2174 }
2175
2176 bi_index idx = bi_extract(b, bi_src_index(&src.src), offset);
2177
2178 /* Compose the subword swizzle with existing (identity) swizzle */
2179 assert(idx.swizzle == BI_SWIZZLE_H01);
2180
2181 /* Bigger vectors should have been lowered */
2182 assert(comps <= (1 << subword_shift));
2183
2184 if (bitsize == 16) {
2185 unsigned c0 = src.swizzle[0] & 1;
2186 unsigned c1 = (comps > 1) ? src.swizzle[1] & 1 : c0;
2187 idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1);
2188 } else if (bitsize == 8 && comps == 1) {
2189 idx.swizzle = BI_SWIZZLE_B0000 + (src.swizzle[0] & 3);
2190 } else if (bitsize == 8) {
2191 /* XXX: Use optimized swizzle when possible */
2192 bi_index unoffset_srcs[NIR_MAX_VEC_COMPONENTS] = {bi_null()};
2193 unsigned channels[NIR_MAX_VEC_COMPONENTS] = {0};
2194
2195 for (unsigned i = 0; i < comps; ++i) {
2196 unoffset_srcs[i] = bi_src_index(&src.src);
2197 channels[i] = src.swizzle[i];
2198 }
2199
2200 bi_index temp = bi_temp(b->shader);
2201 bi_make_vec_to(b, temp, unoffset_srcs, channels, comps, bitsize);
2202
2203 static const enum bi_swizzle swizzle_lut[] = {
2204 BI_SWIZZLE_B0000, BI_SWIZZLE_B0011, BI_SWIZZLE_H01, BI_SWIZZLE_H01};
2205 assert(comps - 1 < ARRAY_SIZE(swizzle_lut));
2206
2207 /* Assign a coherent swizzle for the vector */
2208 temp.swizzle = swizzle_lut[comps - 1];
2209
2210 return temp;
2211 }
2212
2213 return idx;
2214 }
2215
2216 static enum bi_round
2217 bi_nir_round(nir_op op)
2218 {
2219 switch (op) {
2220 case nir_op_fround_even:
2221 return BI_ROUND_NONE;
2222 case nir_op_ftrunc:
2223 return BI_ROUND_RTZ;
2224 case nir_op_fceil:
2225 return BI_ROUND_RTP;
2226 case nir_op_ffloor:
2227 return BI_ROUND_RTN;
2228 default:
2229 unreachable("invalid nir round op");
2230 }
2231 }
2232
2233 /* Convenience for lowered transcendentals */
2234
2235 static bi_index
2236 bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1)
2237 {
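   /* Informal note (not in the original source): using -0.0 as the addend
    * makes FMA(s0, s1, -0.0) act as a plain multiply while preserving the
    * sign of a zero product, since +0.0 + -0.0 == +0.0 and
    * -0.0 + -0.0 == -0.0. */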
2238 return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f));
2239 }
2240
2241 /* Approximate with FRCP_APPROX.f32 and apply a single iteration of
2242 * Newton-Raphson to improve precision */
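/* As a rough sketch of the math (not from the original source): with
 * x1 ~= 1/s0 from FRCP_APPROX and s0 = m * 2^e, one Newton-Raphson step is
 *
 *    x2 = x1 * (2 - s0 * x1) = x1 + x1 * (1 - s0 * x1)
 *
 * The lowering below computes t1 = 1 - m * x1 and then x1 + t1 * x1, with
 * FMA_RSCALE folding the exponent adjustment back in. */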
2243
2244 static void
2245 bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0)
2246 {
2247 bi_index x1 = bi_frcp_approx_f32(b, s0);
2248 bi_index m = bi_frexpm_f32(b, s0, false, false);
2249 bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, false);
2250 bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0), bi_zero(),
2251 BI_SPECIAL_N);
2252 bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e, BI_SPECIAL_NONE);
2253 }
2254
2255 static void
2256 bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0)
2257 {
2258 bi_index x1 = bi_frsq_approx_f32(b, s0);
2259 bi_index m = bi_frexpm_f32(b, s0, false, true);
2260 bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, true);
2261 bi_index t1 = bi_fmul_f32(b, x1, x1);
2262 bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0),
2263 bi_imm_u32(-1), BI_SPECIAL_N);
2264 bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e, BI_SPECIAL_N);
2265 }
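/* Rough sketch of the refinement above (not from the original source): with
 * x1 ~= 1/sqrt(s0), one Newton-Raphson step is
 *
 *    x2 = x1 + x1 * (1 - s0 * x1^2) / 2
 *
 * t1 is x1^2, t2 is (1 - m * t1) scaled by 2^-1, and the final FMA_RSCALE
 * forms x1 + t2 * x1 with the exponent correction e. */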
2266
2267 /* More complex transcendentals, see
2268 * https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc
2269 * for documentation */
2270
2271 static void
2272 bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0)
2273 {
2274 bi_index t1 = bi_temp(b->shader);
2275 bi_instr *t1_instr = bi_fadd_f32_to(b, t1, s0, bi_imm_u32(0x49400000));
2276 t1_instr->clamp = BI_CLAMP_CLAMP_0_INF;
2277
2278 bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000));
2279
2280 bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader), s0, bi_neg(t2));
2281 a2->clamp = BI_CLAMP_CLAMP_M1_1;
2282
2283 bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE);
2284 bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false);
2285 bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4));
2286 bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635),
2287 bi_imm_u32(0x3e75fffa));
2288 bi_index p2 = bi_fma_f32(b, p1, a2->dest[0], bi_imm_u32(0x3f317218));
2289 bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2);
2290 bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader), p3, a1t, a1t, a1i,
2291 BI_SPECIAL_NONE);
2292 x->clamp = BI_CLAMP_CLAMP_0_INF;
2293
2294 bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0);
2295 max->sem = BI_SEM_NAN_PROPAGATE;
2296 }
2297
2298 static void
2299 bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base)
2300 {
2301 /* Scale by the base, multiply by 2^24, and convert to integer to get an 8:24
2302 * fixed-point input */
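   /* For example (illustrative only): s0 = 1.5 with log2_base = 1.0 scales to
    * 1.5 * 2^24 = 0x01800000, i.e. integer part 1 and fraction 0.5 in the
    * 8:24 fixed-point encoding. */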
2303 bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(),
2304 bi_imm_u32(24), BI_SPECIAL_NONE);
2305 bi_instr *fixed_pt = bi_f32_to_s32_to(b, bi_temp(b->shader), scale);
2306 fixed_pt->round = BI_ROUND_NONE; // XXX
2307
2308 /* Compute the result for the fixed-point input, but pass along
2309 * the floating-point scale for correct NaN propagation */
2310 bi_fexp_f32_to(b, dst, fixed_pt->dest[0], scale);
2311 }
2312
2313 static void
2314 bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
2315 {
2316 /* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */
2317 bi_index a1 = bi_frexpm_f32(b, s0, true, false);
2318 bi_index ei = bi_frexpe_f32(b, s0, true, false);
2319 bi_index ef = bi_s32_to_f32(b, ei);
2320
2321 /* xt estimates -log(r1), a coarse approximation of log(a1) */
2322 bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE);
2323 bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE);
2324
2325 /* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) -
2326 * log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1),
2327 * and then log(s0) = x1 + x2 */
2328 bi_index x1 = bi_fadd_f32(b, ef, xt);
2329
2330 /* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by
2331 * polynomial approximation around 1. The series is expressed around
2332 * 1, so set y = (a1 * r1) - 1.0 */
2333 bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0));
2334
2335 /* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate
2336 * log_e(1 + y) by the Taylor series (lower precision than the blob):
2337 * y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */
2338 bi_index loge =
2339 bi_fmul_f32(b, y, bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0)));
2340
2341 bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0)));
2342
2343 /* log(s0) = x1 + x2 */
2344 bi_fadd_f32_to(b, dst, x1, x2);
2345 }
2346
2347 static void
2348 bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
2349 {
2350 bi_index frexp = bi_frexpe_f32(b, s0, true, false);
2351 bi_index frexpi = bi_s32_to_f32(b, frexp);
2352 bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0);
2353 bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi);
2354 }
2355
2356 static void
2357 bi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
2358 {
2359 bi_index log2_base = bi_null();
2360
2361 if (base.type == BI_INDEX_CONSTANT) {
2362 log2_base = bi_imm_f32(log2f(uif(base.value)));
2363 } else {
2364 log2_base = bi_temp(b->shader);
2365 bi_lower_flog2_32(b, log2_base, base);
2366 }
2367
2368 return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base));
2369 }
2370
2371 static void
2372 bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
2373 {
2374 bi_index log2_base = bi_null();
2375
2376 if (base.type == BI_INDEX_CONSTANT) {
2377 log2_base = bi_imm_f32(log2f(uif(base.value)));
2378 } else {
2379 log2_base = bi_temp(b->shader);
2380 bi_flog2_32(b, log2_base, base);
2381 }
2382
2383 return bi_fexp_32(b, dst, exp, log2_base);
2384 }
2385
2386 /* Bifrost has extremely coarse tables for approximating sin/cos, accessible as
2387 * FSIN/COS_TABLE.u6, which multiplies the bottom 6-bits by pi/32 and
2388 * calculates the results. We use them to calculate sin/cos via a Taylor
2389 * approximation:
2390 *
2391 * f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x)
2392 * sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x)
2393 * cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x)
2394 */
2395
2396 #define TWO_OVER_PI bi_imm_f32(2.0f / 3.14159f)
2397 #define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0)
2398 #define SINCOS_BIAS bi_imm_u32(0x49400000)
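/* Informal note on the bias (not from the original source): 0x49400000 is
 * 1.5 * 2^19 as an f32, where one ulp is 2^-4. Adding it to s0 * (2/pi)
 * leaves the angle quantized to multiples of pi/32 in the low mantissa bits,
 * so the bottom 6 bits of x_u6 cover one full 2*pi period for the 64-entry
 * tables, and subtracting the bias back out recovers the quantized angle used
 * to form the error term e. */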
2399
2400 static void
2401 bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
2402 {
2403 /* The bottom 6 bits of the result, times pi/32, approximate s0 mod 2pi */
2404 bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS);
2405
2406 /* Approximate domain error (small) */
2407 bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS)),
2408 MPI_OVER_TWO, s0);
2409
2410 /* Lookup sin(x), cos(x) */
2411 bi_index sinx = bi_fsin_table_u6(b, x_u6, false);
2412 bi_index cosx = bi_fcos_table_u6(b, x_u6, false);
2413
2414 /* e^2 / 2 */
2415 bi_index e2_over_2 =
2416 bi_fma_rscale_f32(b, e, e, bi_negzero(), bi_imm_u32(-1), BI_SPECIAL_NONE);
2417
2418 /* (-e^2)/2 f''(x) */
2419 bi_index quadratic =
2420 bi_fma_f32(b, bi_neg(e2_over_2), cos ? cosx : sinx, bi_negzero());
2421
2422 /* e f'(x) - (e^2/2) f''(x) */
2423 bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e,
2424 cos ? bi_neg(sinx) : cosx, quadratic);
2425 I->clamp = BI_CLAMP_CLAMP_M1_1;
2426
2427 /* f(x) + e f'(x) - (e^2/2) f''(x) */
2428 bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx);
2429 }
2430
2431 static enum bi_cmpf
2432 bi_translate_cmpf(nir_op op)
2433 {
2434 switch (op) {
2435 case nir_op_ieq8:
2436 case nir_op_ieq16:
2437 case nir_op_ieq32:
2438 case nir_op_feq16:
2439 case nir_op_feq32:
2440 return BI_CMPF_EQ;
2441
2442 case nir_op_ine8:
2443 case nir_op_ine16:
2444 case nir_op_ine32:
2445 case nir_op_fneu16:
2446 case nir_op_fneu32:
2447 return BI_CMPF_NE;
2448
2449 case nir_op_ilt8:
2450 case nir_op_ilt16:
2451 case nir_op_ilt32:
2452 case nir_op_flt16:
2453 case nir_op_flt32:
2454 case nir_op_ult8:
2455 case nir_op_ult16:
2456 case nir_op_ult32:
2457 return BI_CMPF_LT;
2458
2459 case nir_op_ige8:
2460 case nir_op_ige16:
2461 case nir_op_ige32:
2462 case nir_op_fge16:
2463 case nir_op_fge32:
2464 case nir_op_uge8:
2465 case nir_op_uge16:
2466 case nir_op_uge32:
2467 return BI_CMPF_GE;
2468
2469 default:
2470 unreachable("invalid comparison");
2471 }
2472 }
2473
2474 static bool
2475 bi_nir_is_replicated(nir_alu_src *src)
2476 {
2477 for (unsigned i = 1; i < nir_src_num_components(src->src); ++i) {
2478 if (src->swizzle[0] != src->swizzle[i])
2479 return false;
2480 }
2481
2482 return true;
2483 }
2484
2485 static void
2486 bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
2487 {
2488 bi_index dst = bi_def_index(&instr->def);
2489 unsigned srcs = nir_op_infos[instr->op].num_inputs;
2490 unsigned sz = instr->def.bit_size;
2491 unsigned comps = instr->def.num_components;
2492 unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0;
2493
2494 /* Indicate scalarness */
2495 if (sz == 16 && comps == 1)
2496 dst.swizzle = BI_SWIZZLE_H00;
2497
2498 /* First, match against the various moves in NIR. These are
2499 * special-cased because they can operate on vectors even after
2500 * lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the
2501 * instruction is no "bigger" than SIMD-within-a-register. These moves
2502 * are the exceptions that need to handle swizzles specially. */
2503
2504 switch (instr->op) {
2505 case nir_op_vec2:
2506 case nir_op_vec3:
2507 case nir_op_vec4:
2508 case nir_op_vec8:
2509 case nir_op_vec16: {
2510 bi_index unoffset_srcs[16] = {bi_null()};
2511 unsigned channels[16] = {0};
2512
2513 for (unsigned i = 0; i < srcs; ++i) {
2514 unoffset_srcs[i] = bi_src_index(&instr->src[i].src);
2515 channels[i] = instr->src[i].swizzle[0];
2516 }
2517
2518 bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz);
2519 return;
2520 }
2521
2522 case nir_op_unpack_32_2x16: {
2523 /* Should have been scalarized */
2524 assert(comps == 2 && sz == 16);
2525
2526 bi_index vec = bi_src_index(&instr->src[0].src);
2527 unsigned chan = instr->src[0].swizzle[0];
2528
2529 bi_mov_i32_to(b, dst, bi_extract(b, vec, chan));
2530 return;
2531 }
2532
2533 case nir_op_unpack_64_2x32_split_x: {
2534 unsigned chan = (instr->src[0].swizzle[0] * 2) + 0;
2535 bi_mov_i32_to(b, dst,
2536 bi_extract(b, bi_src_index(&instr->src[0].src), chan));
2537 return;
2538 }
2539
2540 case nir_op_unpack_64_2x32_split_y: {
2541 unsigned chan = (instr->src[0].swizzle[0] * 2) + 1;
2542 bi_mov_i32_to(b, dst,
2543 bi_extract(b, bi_src_index(&instr->src[0].src), chan));
2544 return;
2545 }
2546
2547 case nir_op_pack_64_2x32_split:
2548 bi_collect_v2i32_to(b, dst,
2549 bi_extract(b, bi_src_index(&instr->src[0].src),
2550 instr->src[0].swizzle[0]),
2551 bi_extract(b, bi_src_index(&instr->src[1].src),
2552 instr->src[1].swizzle[0]));
2553 return;
2554
2555 case nir_op_pack_64_2x32:
2556 bi_collect_v2i32_to(b, dst,
2557 bi_extract(b, bi_src_index(&instr->src[0].src),
2558 instr->src[0].swizzle[0]),
2559 bi_extract(b, bi_src_index(&instr->src[0].src),
2560 instr->src[0].swizzle[1]));
2561 return;
2562
2563 case nir_op_pack_uvec2_to_uint: {
2564 bi_index src = bi_src_index(&instr->src[0].src);
2565
2566 assert(sz == 32 && src_sz == 32);
2567 bi_mkvec_v2i16_to(
2568 b, dst, bi_half(bi_extract(b, src, instr->src[0].swizzle[0]), false),
2569 bi_half(bi_extract(b, src, instr->src[0].swizzle[1]), false));
2570 return;
2571 }
2572
2573 case nir_op_pack_uvec4_to_uint: {
2574 bi_index src = bi_src_index(&instr->src[0].src);
2575
2576 assert(sz == 32 && src_sz == 32);
2577
2578 bi_index srcs[4] = {
2579 bi_extract(b, src, instr->src[0].swizzle[0]),
2580 bi_extract(b, src, instr->src[0].swizzle[1]),
2581 bi_extract(b, src, instr->src[0].swizzle[2]),
2582 bi_extract(b, src, instr->src[0].swizzle[3]),
2583 };
2584 unsigned channels[4] = {0};
2585 bi_make_vec_to(b, dst, srcs, channels, 4, 8);
2586 return;
2587 }
2588
2589 case nir_op_mov: {
2590 bi_index idx = bi_src_index(&instr->src[0].src);
2591 bi_index unoffset_srcs[4] = {idx, idx, idx, idx};
2592
2593 unsigned channels[4] = {
2594 comps > 0 ? instr->src[0].swizzle[0] : 0,
2595 comps > 1 ? instr->src[0].swizzle[1] : 0,
2596 comps > 2 ? instr->src[0].swizzle[2] : 0,
2597 comps > 3 ? instr->src[0].swizzle[3] : 0,
2598 };
2599
2600 bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, src_sz);
2601 return;
2602 }
2603
2604 case nir_op_pack_32_2x16: {
2605 assert(comps == 1);
2606
2607 bi_index idx = bi_src_index(&instr->src[0].src);
2608 bi_index unoffset_srcs[4] = {idx, idx, idx, idx};
2609
2610 unsigned channels[2] = {instr->src[0].swizzle[0],
2611 instr->src[0].swizzle[1]};
2612
2613 bi_make_vec_to(b, dst, unoffset_srcs, channels, 2, 16);
2614 return;
2615 }
2616
2617 case nir_op_f2f16:
2618 case nir_op_f2f16_rtz:
2619 case nir_op_f2f16_rtne: {
2620 assert(src_sz == 32);
2621 bi_index idx = bi_src_index(&instr->src[0].src);
2622 bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2623 bi_index s1 =
2624 comps > 1 ? bi_extract(b, idx, instr->src[0].swizzle[1]) : s0;
2625
2626 bi_instr *I = bi_v2f32_to_v2f16_to(b, dst, s0, s1);
2627
2628 /* Override rounding if explicitly requested. Otherwise, the
2629 * default rounding mode is selected by the builder. Depending
2630 * on the float controls required by the shader, the default
2631 * mode may not be nearest-even.
2632 */
2633 if (instr->op == nir_op_f2f16_rtz)
2634 I->round = BI_ROUND_RTZ;
2635 else if (instr->op == nir_op_f2f16_rtne)
2636 I->round = BI_ROUND_NONE; /* Nearest even */
2637
2638 return;
2639 }
2640
2641 /* Vectorized downcasts */
2642 case nir_op_u2u16:
2643 case nir_op_i2i16: {
2644 if (!(src_sz == 32 && comps == 2))
2645 break;
2646
2647 bi_index idx = bi_src_index(&instr->src[0].src);
2648 bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2649 bi_index s1 = bi_extract(b, idx, instr->src[0].swizzle[1]);
2650
2651 bi_mkvec_v2i16_to(b, dst, bi_half(s0, false), bi_half(s1, false));
2652 return;
2653 }
2654
2655 /* Pre-v11, we can get vector i2f32 by lowering 32-bit vector i2f16 to
2656 * i2f32 + f2f16 in bifrost_nir_lower_algebraic_late, which runs after
2657 * nir_opt_vectorize. We don't scalarize i2f32 earlier because we have
2658 * vector V2F16_TO_V2F32. */
2659 case nir_op_i2f32:
2660 case nir_op_u2f32: {
2661 if (!(src_sz == 32 && comps == 2))
2662 break;
2663
2664 nir_alu_src *src = &instr->src[0];
2665 bi_index idx = bi_src_index(&src->src);
2666 bi_index s0 = bi_extract(b, idx, src->swizzle[0]);
2667 bi_index s1 = bi_extract(b, idx, src->swizzle[1]);
2668
2669 bi_index d0, d1;
2670 if (instr->op == nir_op_i2f32) {
2671 d0 = bi_s32_to_f32(b, s0);
2672 d1 = bi_s32_to_f32(b, s1);
2673 } else {
2674 d0 = bi_u32_to_f32(b, s0);
2675 d1 = bi_u32_to_f32(b, s1);
2676 }
2677
2678 bi_collect_v2i32_to(b, dst, d0, d1);
2679
2680 return;
2681 }
2682
2683 case nir_op_i2i8:
2684 case nir_op_u2u8: {
2685 /* Acts like an 8-bit swizzle */
2686 bi_index idx = bi_src_index(&instr->src[0].src);
2687 unsigned factor = src_sz / 8;
2688 unsigned chan[4] = {0};
2689
2690 for (unsigned i = 0; i < comps; ++i)
2691 chan[i] = instr->src[0].swizzle[i] * factor;
2692
2693 bi_make_vec_to(b, dst, &idx, chan, comps, 8);
2694 return;
2695 }
2696
2697 case nir_op_b32csel: {
2698 if (sz != 16)
2699 break;
2700
2701 /* We allow vectorizing b32csel(cond, A, B) which can be
2702 * translated as MUX.v2i16, even though cond is a 32-bit vector.
2703 *
2704 * If the source condition vector is replicated, we can use
2705 * MUX.v2i16 directly, letting each component use the
2706 * corresponding half of the 32-bit source. NIR uses 0/~0
2707 * booleans so that's guaranteed to work (that is, 32-bit NIR
2708 * booleans are 16-bit replicated).
2709 *
2710 * If we're not replicated, we use the same trick but must
2711 * insert a MKVEC.v2i16 first to convert down to 16-bit.
2712 */
2713 bi_index idx = bi_src_index(&instr->src[0].src);
2714 bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2715 bi_index s1 = bi_alu_src_index(b, instr->src[1], comps);
2716 bi_index s2 = bi_alu_src_index(b, instr->src[2], comps);
2717
2718 if (!bi_nir_is_replicated(&instr->src[0])) {
2719 s0 = bi_mkvec_v2i16(
2720 b, bi_half(s0, false),
2721 bi_half(bi_extract(b, idx, instr->src[0].swizzle[1]), false));
2722 }
2723
2724 bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2725 return;
2726 }
2727
2728 default:
2729 break;
2730 }
2731
2732 bi_index s0 =
2733 srcs > 0 ? bi_alu_src_index(b, instr->src[0], comps) : bi_null();
2734 bi_index s1 =
2735 srcs > 1 ? bi_alu_src_index(b, instr->src[1], comps) : bi_null();
2736 bi_index s2 =
2737 srcs > 2 ? bi_alu_src_index(b, instr->src[2], comps) : bi_null();
2738
2739 switch (instr->op) {
2740 case nir_op_ffma:
2741 bi_fma_to(b, sz, dst, s0, s1, s2);
2742 break;
2743
2744 case nir_op_fmul:
2745 bi_fma_to(b, sz, dst, s0, s1, bi_negzero());
2746 break;
2747
2748 case nir_op_fadd:
2749 bi_fadd_to(b, sz, dst, s0, s1);
2750 break;
2751
2752 case nir_op_fsat: {
2753 bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2754 I->clamp = BI_CLAMP_CLAMP_0_1;
2755 break;
2756 }
2757
2758 case nir_op_fsat_signed: {
2759 bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2760 I->clamp = BI_CLAMP_CLAMP_M1_1;
2761 break;
2762 }
2763
2764 case nir_op_fclamp_pos: {
2765 bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2766 I->clamp = BI_CLAMP_CLAMP_0_INF;
2767 break;
2768 }
2769
2770 case nir_op_fneg:
2771 bi_fabsneg_to(b, sz, dst, bi_neg(s0));
2772 break;
2773
2774 case nir_op_fabs:
2775 bi_fabsneg_to(b, sz, dst, bi_abs(s0));
2776 break;
2777
2778 case nir_op_fsin:
2779 bi_lower_fsincos_32(b, dst, s0, false);
2780 break;
2781
2782 case nir_op_fcos:
2783 bi_lower_fsincos_32(b, dst, s0, true);
2784 break;
2785
2786 case nir_op_fexp2:
2787 assert(sz == 32); /* should've been lowered */
2788
2789 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2790 bi_lower_fexp2_32(b, dst, s0);
2791 else
2792 bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f));
2793
2794 break;
2795
2796 case nir_op_flog2:
2797 assert(sz == 32); /* should've been lowered */
2798
2799 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2800 bi_lower_flog2_32(b, dst, s0);
2801 else
2802 bi_flog2_32(b, dst, s0);
2803
2804 break;
2805
2806 case nir_op_fpow:
2807 assert(sz == 32); /* should've been lowered */
2808
2809 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2810 bi_lower_fpow_32(b, dst, s0, s1);
2811 else
2812 bi_fpow_32(b, dst, s0, s1);
2813
2814 break;
2815
2816 case nir_op_frexp_exp:
2817 bi_frexpe_to(b, sz, dst, s0, false, false);
2818 break;
2819
2820 case nir_op_frexp_sig:
2821 bi_frexpm_to(b, sz, dst, s0, false, false);
2822 break;
2823
2824 case nir_op_ldexp:
2825 bi_ldexp_to(b, sz, dst, s0, s1);
2826 break;
2827
2828 case nir_op_b8csel:
2829 bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2830 break;
2831
2832 case nir_op_b16csel:
2833 bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2834 break;
2835
2836 case nir_op_b32csel:
2837 bi_mux_i32_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2838 break;
2839
2840 case nir_op_extract_u8:
2841 case nir_op_extract_i8: {
2842 assert(comps == 1 && "should be scalarized");
2843 assert((src_sz == 16 || src_sz == 32) && "should be lowered");
2844 unsigned byte = nir_alu_src_as_uint(instr->src[1]);
2845
2846 if (s0.swizzle == BI_SWIZZLE_H11) {
2847 assert(byte < 2);
2848 byte += 2;
2849 } else if (s0.swizzle != BI_SWIZZLE_H01) {
2850 assert(s0.swizzle == BI_SWIZZLE_H00);
2851 }
2852
2853 assert(byte < 4);
2854
2855 s0.swizzle = BI_SWIZZLE_H01;
2856
2857 if (instr->op == nir_op_extract_i8)
2858 bi_s8_to_s32_to(b, dst, bi_byte(s0, byte));
2859 else
2860 bi_u8_to_u32_to(b, dst, bi_byte(s0, byte));
2861 break;
2862 }
2863
2864 case nir_op_extract_u16:
2865 case nir_op_extract_i16: {
2866 assert(comps == 1 && "should be scalarized");
2867 assert(src_sz == 32 && "should be lowered");
2868 unsigned half = nir_alu_src_as_uint(instr->src[1]);
2869 assert(half == 0 || half == 1);
2870
2871 if (instr->op == nir_op_extract_i16)
2872 bi_s16_to_s32_to(b, dst, bi_half(s0, half));
2873 else
2874 bi_u16_to_u32_to(b, dst, bi_half(s0, half));
2875 break;
2876 }
2877
2878 case nir_op_insert_u16: {
2879 assert(comps == 1 && "should be scalarized");
2880 unsigned half = nir_alu_src_as_uint(instr->src[1]);
2881 assert(half == 0 || half == 1);
2882
2883 if (half == 0)
2884 bi_u16_to_u32_to(b, dst, bi_half(s0, 0));
2885 else
2886 bi_mkvec_v2i16_to(b, dst, bi_imm_u16(0), bi_half(s0, 0));
2887 break;
2888 }
2889
2890 case nir_op_ishl:
2891 bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
2892 break;
2893 case nir_op_ushr:
2894 bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), false);
2895 break;
2896
2897 case nir_op_ishr:
2898 if (b->shader->arch >= 9)
2899 bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), true);
2900 else
2901 bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0));
2902 break;
2903
2904 case nir_op_imin:
2905 case nir_op_umin:
2906 bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, s0, s1, s0,
2907 s1, BI_CMPF_LT);
2908 break;
2909
2910 case nir_op_imax:
2911 case nir_op_umax:
2912 bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, s0, s1, s0,
2913 s1, BI_CMPF_GT);
2914 break;
2915
2916 case nir_op_f2f32:
2917 bi_f16_to_f32_to(b, dst, s0);
2918 break;
2919
2920 case nir_op_fquantize2f16: {
2921 bi_instr *f16 = bi_v2f32_to_v2f16_to(b, bi_temp(b->shader), s0, s0);
2922
2923 if (b->shader->arch < 9) {
2924 /* Bifrost has pseudo-ftz on conversions, which is lowered to an ftz
2925 * flag in the clause header */
2926 f16->ftz = true;
2927 } else {
2928 /* Valhall doesn't have clauses, and uses a separate flush
2929 * instruction */
2930 f16 = bi_flush_to(b, 16, bi_temp(b->shader), f16->dest[0]);
2931 f16->ftz = true;
2932 }
2933
2934 bi_instr *f32 = bi_f16_to_f32_to(b, dst, bi_half(f16->dest[0], false));
2935
2936 if (b->shader->arch < 9)
2937 f32->ftz = true;
2938
2939 break;
2940 }
2941
2942 case nir_op_f2i32:
2943 if (src_sz == 32)
2944 bi_f32_to_s32_to(b, dst, s0);
2945 else
2946 bi_f16_to_s32_to(b, dst, s0);
2947 break;
2948
2949 /* Note 32-bit sources => no vectorization, so 32-bit works */
2950 case nir_op_f2u16:
2951 if (src_sz == 32)
2952 bi_f32_to_u32_to(b, dst, s0);
2953 else
2954 bi_v2f16_to_v2u16_to(b, dst, s0);
2955 break;
2956
2957 case nir_op_f2i16:
2958 if (src_sz == 32)
2959 bi_f32_to_s32_to(b, dst, s0);
2960 else
2961 bi_v2f16_to_v2s16_to(b, dst, s0);
2962 break;
2963
2964 case nir_op_f2u32:
2965 if (src_sz == 32)
2966 bi_f32_to_u32_to(b, dst, s0);
2967 else
2968 bi_f16_to_u32_to(b, dst, s0);
2969 break;
2970
2971 case nir_op_u2f16:
2972 /* V2I32_TO_V2F16 does not exist */
2973 assert((src_sz == 16 || src_sz == 8) && "should be lowered");
2974
2975 if (src_sz == 16)
2976 bi_v2u16_to_v2f16_to(b, dst, s0);
2977 else if (src_sz == 8)
2978 bi_v2u8_to_v2f16_to(b, dst, s0);
2979 break;
2980
2981 case nir_op_u2f32:
2982 if (src_sz == 32)
2983 bi_u32_to_f32_to(b, dst, s0);
2984 else if (src_sz == 16)
2985 bi_u16_to_f32_to(b, dst, s0);
2986 else
2987 bi_u8_to_f32_to(b, dst, s0);
2988 break;
2989
2990 case nir_op_i2f16:
2991 /* V2I32_TO_V2F16 does not exist */
2992 assert((src_sz == 16 || src_sz == 8) && "should be lowered");
2993
2994 if (src_sz == 16)
2995 bi_v2s16_to_v2f16_to(b, dst, s0);
2996 else if (src_sz == 8)
2997 bi_v2s8_to_v2f16_to(b, dst, s0);
2998 break;
2999
3000 case nir_op_i2f32:
3001 assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
3002
3003 if (src_sz == 32)
3004 bi_s32_to_f32_to(b, dst, s0);
3005 else if (src_sz == 16)
3006 bi_s16_to_f32_to(b, dst, s0);
3007 else if (src_sz == 8)
3008 bi_s8_to_f32_to(b, dst, s0);
3009 break;
3010
3011 case nir_op_i2i32:
3012 assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
3013
3014 if (src_sz == 32)
3015 bi_mov_i32_to(b, dst, s0);
3016 else if (src_sz == 16)
3017 bi_s16_to_s32_to(b, dst, s0);
3018 else if (src_sz == 8)
3019 bi_s8_to_s32_to(b, dst, s0);
3020 break;
3021
3022 case nir_op_u2u32:
3023 assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
3024
3025 if (src_sz == 32)
3026 bi_mov_i32_to(b, dst, s0);
3027 else if (src_sz == 16)
3028 bi_u16_to_u32_to(b, dst, s0);
3029 else if (src_sz == 8)
3030 bi_u8_to_u32_to(b, dst, s0);
3031
3032 break;
3033
3034 case nir_op_i2i16:
3035 assert(src_sz == 8 || src_sz == 32);
3036
3037 if (src_sz == 8)
3038 bi_v2s8_to_v2s16_to(b, dst, s0);
3039 else
3040 bi_mov_i32_to(b, dst, s0);
3041 break;
3042
3043 case nir_op_u2u16:
3044 assert(src_sz == 8 || src_sz == 32);
3045
3046 if (src_sz == 8)
3047 bi_v2u8_to_v2u16_to(b, dst, s0);
3048 else
3049 bi_mov_i32_to(b, dst, s0);
3050 break;
3051
3052 case nir_op_b2i8:
3053 case nir_op_b2i16:
3054 case nir_op_b2i32:
3055 bi_mux_to(b, sz, dst, bi_imm_u8(0), bi_imm_uintN(1, sz), s0,
3056 BI_MUX_INT_ZERO);
3057 break;
3058
3059 case nir_op_ieq8:
3060 case nir_op_ine8:
3061 case nir_op_ilt8:
3062 case nir_op_ige8:
3063 case nir_op_ieq16:
3064 case nir_op_ine16:
3065 case nir_op_ilt16:
3066 case nir_op_ige16:
3067 case nir_op_ieq32:
3068 case nir_op_ine32:
3069 case nir_op_ilt32:
3070 case nir_op_ige32:
3071 bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, bi_translate_cmpf(instr->op),
3072 BI_RESULT_TYPE_M1);
3073 break;
3074
3075 case nir_op_ult8:
3076 case nir_op_uge8:
3077 case nir_op_ult16:
3078 case nir_op_uge16:
3079 case nir_op_ult32:
3080 case nir_op_uge32:
3081 bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1,
3082 bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1);
3083 break;
3084
3085 case nir_op_feq32:
3086 case nir_op_feq16:
3087 case nir_op_flt32:
3088 case nir_op_flt16:
3089 case nir_op_fge32:
3090 case nir_op_fge16:
3091 case nir_op_fneu32:
3092 case nir_op_fneu16:
3093 bi_fcmp_to(b, sz, dst, s0, s1, bi_translate_cmpf(instr->op),
3094 BI_RESULT_TYPE_M1);
3095 break;
3096
3097 case nir_op_fround_even:
3098 case nir_op_fceil:
3099 case nir_op_ffloor:
3100 case nir_op_ftrunc:
3101 bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op));
3102 break;
3103
3104 case nir_op_fmin:
3105 bi_fmin_to(b, sz, dst, s0, s1);
3106 break;
3107
3108 case nir_op_fmax:
3109 bi_fmax_to(b, sz, dst, s0, s1);
3110 break;
3111
3112 case nir_op_iadd:
3113 bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false);
3114 break;
3115
3116 case nir_op_iadd_sat:
3117 bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true);
3118 break;
3119
3120 case nir_op_uadd_sat:
3121 bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true);
3122 break;
3123
3124 case nir_op_ihadd:
3125 bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN);
3126 break;
3127
3128 case nir_op_irhadd:
3129 bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP);
3130 break;
3131
3132 case nir_op_uhadd:
3133 bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTN);
3134 break;
3135
3136 case nir_op_urhadd:
3137 bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTP);
3138 break;
3139
3140 case nir_op_ineg:
3141 bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false);
3142 break;
3143
3144 case nir_op_isub:
3145 bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false);
3146 break;
3147
3148 case nir_op_isub_sat:
3149 bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true);
3150 break;
3151
3152 case nir_op_usub_sat:
3153 bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true);
3154 break;
3155
3156 case nir_op_imul:
3157 bi_imul_to(b, sz, dst, s0, s1);
3158 break;
3159
3160 case nir_op_iabs:
3161 bi_iabs_to(b, sz, dst, s0);
3162 break;
3163
3164 case nir_op_iand:
3165 bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));
3166 break;
3167
3168 case nir_op_ior:
3169 bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));
3170 break;
3171
3172 case nir_op_ixor:
3173 bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));
3174 break;
3175
3176 case nir_op_inot:
3177 bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));
3178 break;
3179
3180 case nir_op_frsq:
3181 if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
3182 bi_lower_frsq_32(b, dst, s0);
3183 else
3184 bi_frsq_to(b, sz, dst, s0);
3185 break;
3186
3187 case nir_op_frcp:
3188 if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
3189 bi_lower_frcp_32(b, dst, s0);
3190 else
3191 bi_frcp_to(b, sz, dst, s0);
3192 break;
3193
3194 case nir_op_uclz:
3195 bi_clz_to(b, sz, dst, s0, false);
3196 break;
3197
3198 case nir_op_bit_count:
3199 assert(sz == 32 && src_sz == 32 && "should've been lowered");
3200 bi_popcount_i32_to(b, dst, s0);
3201 break;
3202
3203 case nir_op_bitfield_reverse:
3204 assert(sz == 32 && src_sz == 32 && "should've been lowered");
3205 bi_bitrev_i32_to(b, dst, s0);
3206 break;
3207
3208 case nir_op_ufind_msb: {
3209 bi_index clz = bi_clz(b, src_sz, s0, false);
3210
3211 if (sz == 8)
3212 clz = bi_byte(clz, 0);
3213 else if (sz == 16)
3214 clz = bi_half(clz, false);
3215
3216 bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false);
3217 break;
3218 }
3219
3220 default:
3221 fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
3222 unreachable("Unknown ALU op");
3223 }
3224 }
3225
3226 /* Returns the dimension, with 0 special-casing cubemaps. Shamelessly copied
3227 * from Midgard */
3228 static unsigned
3229 bifrost_tex_format(enum glsl_sampler_dim dim)
3230 {
3231 switch (dim) {
3232 case GLSL_SAMPLER_DIM_1D:
3233 case GLSL_SAMPLER_DIM_BUF:
3234 return 1;
3235
3236 case GLSL_SAMPLER_DIM_2D:
3237 case GLSL_SAMPLER_DIM_MS:
3238 case GLSL_SAMPLER_DIM_EXTERNAL:
3239 case GLSL_SAMPLER_DIM_RECT:
3240 case GLSL_SAMPLER_DIM_SUBPASS:
3241 case GLSL_SAMPLER_DIM_SUBPASS_MS:
3242 return 2;
3243
3244 case GLSL_SAMPLER_DIM_3D:
3245 return 3;
3246
3247 case GLSL_SAMPLER_DIM_CUBE:
3248 return 0;
3249
3250 default:
3251 DBG("Unknown sampler dim type\n");
3252 assert(0);
3253 return 0;
3254 }
3255 }
3256
3257 static enum bi_dimension
3258 valhall_tex_dimension(enum glsl_sampler_dim dim)
3259 {
3260 switch (dim) {
3261 case GLSL_SAMPLER_DIM_1D:
3262 case GLSL_SAMPLER_DIM_BUF:
3263 return BI_DIMENSION_1D;
3264
3265 case GLSL_SAMPLER_DIM_2D:
3266 case GLSL_SAMPLER_DIM_MS:
3267 case GLSL_SAMPLER_DIM_EXTERNAL:
3268 case GLSL_SAMPLER_DIM_RECT:
3269 case GLSL_SAMPLER_DIM_SUBPASS:
3270 case GLSL_SAMPLER_DIM_SUBPASS_MS:
3271 return BI_DIMENSION_2D;
3272
3273 case GLSL_SAMPLER_DIM_3D:
3274 return BI_DIMENSION_3D;
3275
3276 case GLSL_SAMPLER_DIM_CUBE:
3277 return BI_DIMENSION_CUBE;
3278
3279 default:
3280 unreachable("Unknown sampler dim type");
3281 }
3282 }
3283
3284 static enum bifrost_texture_format_full
3285 bi_texture_format(nir_alu_type T, enum bi_clamp clamp)
3286 {
3287 switch (T) {
3288 case nir_type_float16:
3289 return BIFROST_TEXTURE_FORMAT_F16 + clamp;
3290 case nir_type_float32:
3291 return BIFROST_TEXTURE_FORMAT_F32 + clamp;
3292 case nir_type_uint16:
3293 return BIFROST_TEXTURE_FORMAT_U16;
3294 case nir_type_int16:
3295 return BIFROST_TEXTURE_FORMAT_S16;
3296 case nir_type_uint32:
3297 return BIFROST_TEXTURE_FORMAT_U32;
3298 case nir_type_int32:
3299 return BIFROST_TEXTURE_FORMAT_S32;
3300 default:
3301 unreachable("Invalid type for texturing");
3302 }
3303 }
3304
3305 /* Array indices arrive from NIR as 32-bit uints in the .z component of the
3306 * coordinate and need converting */
3307 static bi_index
3308 bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T)
3309 {
3310 /* For (u)int we can just passthrough */
3311 nir_alu_type base = nir_alu_type_get_base_type(T);
3312 if (base == nir_type_int || base == nir_type_uint)
3313 return idx;
3314
3315 /* Otherwise we convert */
3316 assert(T == nir_type_float32);
3317
3318 /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and
3319 * Texel Selection") defines the layer to be taken from clamp(RNE(r),
3320 * 0, dt - 1). So we use round RTE, clamping is handled at the data
3321 * structure level */
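   /* For example (illustrative only): a layer coordinate of 2.5 rounds to
    * layer 2 and 3.5 rounds to layer 4 under round-to-nearest-even, with the
    * clamp to [0, dt - 1] handled at the data structure level as noted above. */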
3322
3323 bi_instr *I = bi_f32_to_u32_to(b, bi_temp(b->shader), idx);
3324 I->round = BI_ROUND_NONE;
3325 return I->dest[0];
3326 }
3327
3328 /* TEXC's explicit and bias LOD modes require the LOD to be transformed to a
3329 * 16-bit 8:8 fixed-point format. We lower as:
3330 *
3331 * F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF =
3332 * MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0)
3333 */
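/* Worked example (illustrative, not from the original source): lod = 2.5
 * becomes clamp(2.5, -16.0, 16.0) * 256.0 = 640 = 0x0280, i.e. 2 in the
 * integer byte and 0x80 (0.5) in the fraction byte of the 8:8 result. */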
3334
3335 static bi_index
3336 bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16)
3337 {
3338 /* Precompute for constant LODs to avoid general constant folding */
3339 if (lod.type == BI_INDEX_CONSTANT) {
3340 uint32_t raw = lod.value;
3341 float x = fp16 ? _mesa_half_to_float(raw) : uif(raw);
3342 int32_t s32 = CLAMP(x, -16.0f, 16.0f) * 256.0f;
3343 return bi_imm_u32(s32 & 0xFFFF);
3344 }
3345
3346 /* Sort of arbitrary. Must be less than 128.0, greater than or equal to
3347 * the max LOD (16 since we cap at 2^16 texture dimensions), and
3348 * preferably small to minimize precision loss */
3349 const float max_lod = 16.0;
3350
3351 bi_instr *fsat =
3352 bi_fma_f32_to(b, bi_temp(b->shader), fp16 ? bi_half(lod, false) : lod,
3353 bi_imm_f32(1.0f / max_lod), bi_negzero());
3354
3355 fsat->clamp = BI_CLAMP_CLAMP_M1_1;
3356
3357 bi_index fmul =
3358 bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f), bi_negzero());
3359
3360 return bi_mkvec_v2i16(b, bi_half(bi_f32_to_s32(b, fmul), false),
3361 bi_imm_u16(0));
3362 }
3363
3364 /* FETCH takes a 32-bit staging register containing the LOD as an integer in
3365 * the bottom 16-bits and (if present) the cube face index in the top 16-bits.
3366 * TODO: Cube face.
3367 */
3368
3369 static bi_index
3370 bi_emit_texc_lod_cube(bi_builder *b, bi_index lod)
3371 {
3372 return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8));
3373 }
3374
3375 /* The hardware specifies texel offsets and multisample indices together as a
3376  * u8vec4 <offset, ms index>. By default all are zero, so if we have either a
3377 * nonzero texel offset or a nonzero multisample index, we build a u8vec4 with
3378 * the bits we need and return that to be passed as a staging register. Else we
3379 * return 0 to avoid allocating a data register when everything is zero. */
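/* For example, a txf_ms with offset = (1, -2) and sample index 3 would pack
 * roughly as 0x0300FE01: byte 0 = offset.x, byte 1 = offset.y (-2 = 0xFE),
 * byte 2 = 0, and byte 3 = the multisample index OR'd in at bit 24. */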
3380
3381 static bi_index
3382 bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr)
3383 {
3384 bi_index dest = bi_zero();
3385
3386 int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
3387 if (offs_idx >= 0 && (!nir_src_is_const(instr->src[offs_idx].src) ||
3388 nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
3389 unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
3390 bi_index idx = bi_src_index(&instr->src[offs_idx].src);
3391 dest = bi_mkvec_v4i8(
3392 b, (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
3393 (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0),
3394 (nr > 2) ? bi_byte(bi_extract(b, idx, 2), 0) : bi_imm_u8(0),
3395 bi_imm_u8(0));
3396 }
3397
3398 int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
3399 if (ms_idx >= 0 && (!nir_src_is_const(instr->src[ms_idx].src) ||
3400 nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
3401 dest = bi_lshift_or_i32(b, bi_src_index(&instr->src[ms_idx].src), dest,
3402 bi_imm_u8(24));
3403 }
3404
3405 return dest;
3406 }
3407
3408 /*
3409  * Valhall specifies texel offsets, multisample indices, and (for
3410 * fetches) LOD together as a u8vec4 <offset.xyz, LOD>, where the third
3411 * component is either offset.z or multisample index depending on context. Build
3412 * this register.
3413 */
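/* For example, under the layout above, a fetch with offset = (1, 2) and an
 * integer LOD of 4 (no multisample index) would pack roughly as 0x04000201:
 * offsets in bytes 0-1 and the LOD shifted into byte 3. */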
3414 static bi_index
3415 bi_emit_valhall_offsets(bi_builder *b, nir_tex_instr *instr)
3416 {
3417 bi_index dest = bi_zero();
3418
3419 int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
3420 int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
3421 int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
3422
3423 /* Components 0-2: offsets */
3424 if (offs_idx >= 0 && (!nir_src_is_const(instr->src[offs_idx].src) ||
3425 nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
3426 unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
3427 bi_index idx = bi_src_index(&instr->src[offs_idx].src);
3428
3429 /* No multisample index with 3D */
3430 assert((nr <= 2) || (ms_idx < 0));
3431
3432 /* Zero extend the Z byte so we can use it with MKVEC.v2i8 */
3433 bi_index z = (nr > 2)
3434 ? bi_mkvec_v2i8(b, bi_byte(bi_extract(b, idx, 2), 0),
3435 bi_imm_u8(0), bi_zero())
3436 : bi_zero();
3437
3438 dest = bi_mkvec_v2i8(
3439 b, (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
3440 (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0), z);
3441 }
3442
3443 /* Component 2: multisample index */
3444 if (ms_idx >= 0 && (!nir_src_is_const(instr->src[ms_idx].src) ||
3445 nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
3446 dest = bi_mkvec_v2i16(b, dest, bi_src_index(&instr->src[ms_idx].src));
3447 }
3448
3449 /* Component 3: 8-bit LOD */
3450 if (lod_idx >= 0 &&
3451 (!nir_src_is_const(instr->src[lod_idx].src) ||
3452 nir_src_as_uint(instr->src[lod_idx].src) != 0) &&
3453 nir_tex_instr_src_type(instr, lod_idx) != nir_type_float) {
3454 dest = bi_lshift_or_i32(b, bi_src_index(&instr->src[lod_idx].src), dest,
3455 bi_imm_u8(24));
3456 }
3457
3458 return dest;
3459 }
3460
3461 static void
3462 bi_emit_cube_coord(bi_builder *b, bi_index coord, bi_index *face, bi_index *s,
3463 bi_index *t)
3464 {
3465 /* Compute max { |x|, |y|, |z| } */
3466 bi_index maxxyz = bi_temp(b->shader);
3467 *face = bi_temp(b->shader);
3468
3469 bi_index cx = bi_extract(b, coord, 0), cy = bi_extract(b, coord, 1),
3470 cz = bi_extract(b, coord, 2);
3471
3472 /* Use a pseudo op on Bifrost due to tuple restrictions */
3473 if (b->shader->arch <= 8) {
3474 bi_cubeface_to(b, maxxyz, *face, cx, cy, cz);
3475 } else {
3476 bi_cubeface1_to(b, maxxyz, cx, cy, cz);
3477 bi_cubeface2_v9_to(b, *face, cx, cy, cz);
3478 }
3479
3480 /* Select coordinates */
3481 bi_index ssel =
3482 bi_cube_ssel(b, bi_extract(b, coord, 2), bi_extract(b, coord, 0), *face);
3483 bi_index tsel =
3484 bi_cube_tsel(b, bi_extract(b, coord, 1), bi_extract(b, coord, 2), *face);
3485
3486 /* The OpenGL ES specification requires us to transform an input vector
3487 * (x, y, z) to the coordinate, given the selected S/T:
3488 *
3489 * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1))
3490 *
3491 * We implement (s shown, t similar) in a form friendlier to FMA
3492 * instructions, and clamp coordinates at the end for correct
3493 * NaN/infinity handling:
3494 *
3495 * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5)
3496 *
3497 * Take the reciprocal of max{x, y, z}
3498 */
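/* Expanding the FMA form recovers the spec formula (up to rounding of the
 * reciprocal): s * (0.5 * rcp) + 0.5 = 0.5 * (s * rcp + 1)
 *                                    = 0.5 * ((s / max{x, y, z}) + 1) */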
3499 bi_index rcp = bi_frcp_f32(b, maxxyz);
3500
3501 /* Calculate 0.5 * (1.0 / max{x, y, z}) */
3502 bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero());
3503
3504 /* Transform the coordinates */
3505 *s = bi_temp(b->shader);
3506 *t = bi_temp(b->shader);
3507
3508 bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f));
3509 bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f));
3510
3511 S->clamp = BI_CLAMP_CLAMP_0_1;
3512 T->clamp = BI_CLAMP_CLAMP_0_1;
3513 }
3514
3515 /* Emits a cube map descriptor, returning the lower 32 bits and storing the
3516  * upper 32 bits in the passed pointer t. Packing the face with the S
3517  * coordinate exploits the redundancy of the floating-point encoding given the
3518  * range restriction of the CUBEFACE output.
3519 *
3520 * struct cube_map_descriptor {
3521 * float s : 29;
3522 * unsigned face : 3;
3523 * float t : 32;
3524 * }
3525 *
3526 * Since the cube face index is preshifted, this is easy to pack with a bitwise
3527  * MUX.i32 and a fixed mask, selecting the lower 29 bits from s and the upper 3
3528 * bits from face.
3529 */
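/* Concretely, with mask = 0x1FFFFFFF the MUX below keeps bits 28:0 from the
 * range-restricted S coordinate and bits 31:29 from the preshifted face, so
 * e.g. face 5 would contribute 5 << 29 = 0xA0000000 to the descriptor word. */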
3530
3531 static bi_index
3532 bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t)
3533 {
3534 bi_index face, s;
3535 bi_emit_cube_coord(b, coord, &face, &s, t);
3536 bi_index mask = bi_imm_u32(BITFIELD_MASK(29));
3537 return bi_mux_i32(b, s, face, mask, BI_MUX_BIT);
3538 }
3539
3540 /* Map to the main texture op used. Some of these (txd in particular) will
3541 * lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in
3542 * sequence). We assume that lowering is handled elsewhere.
3543 */
3544
3545 static enum bifrost_tex_op
3546 bi_tex_op(nir_texop op)
3547 {
3548 switch (op) {
3549 case nir_texop_tex:
3550 case nir_texop_txb:
3551 case nir_texop_txl:
3552 case nir_texop_txd:
3553 return BIFROST_TEX_OP_TEX;
3554 case nir_texop_txf:
3555 case nir_texop_txf_ms:
3556 case nir_texop_tg4:
3557 return BIFROST_TEX_OP_FETCH;
3558 case nir_texop_lod:
3559 return BIFROST_TEX_OP_GRDESC;
3560 case nir_texop_txs:
3561 case nir_texop_query_levels:
3562 case nir_texop_texture_samples:
3563 case nir_texop_samples_identical:
3564 unreachable("should've been lowered");
3565 default:
3566 unreachable("unsupported tex op");
3567 }
3568 }
3569
3570 /* Data registers required by texturing in the order they appear. All are
3571 * optional, the texture operation descriptor determines which are present.
3572  * Note that since 3D arrays are not permitted at the API level, Z_COORD and
3573  * ARRAY/SHADOW are mutually exclusive, so in practice TEXC reads at most 8 registers */
3574
3575 enum bifrost_tex_dreg {
3576 BIFROST_TEX_DREG_Z_COORD = 0,
3577 BIFROST_TEX_DREG_Y_DELTAS = 1,
3578 BIFROST_TEX_DREG_LOD = 2,
3579 BIFROST_TEX_DREG_GRDESC_HI = 3,
3580 BIFROST_TEX_DREG_SHADOW = 4,
3581 BIFROST_TEX_DREG_ARRAY = 5,
3582 BIFROST_TEX_DREG_OFFSETMS = 6,
3583 BIFROST_TEX_DREG_SAMPLER = 7,
3584 BIFROST_TEX_DREG_TEXTURE = 8,
3585 BIFROST_TEX_DREG_COUNT,
3586 };
3587
3588 static void
3589 bi_emit_texc(bi_builder *b, nir_tex_instr *instr)
3590 {
3591 struct bifrost_texture_operation desc = {
3592 .op = bi_tex_op(instr->op),
3593 .offset_or_bias_disable = false, /* TODO */
3594 .shadow_or_clamp_disable = instr->is_shadow,
3595 .array = instr->is_array && instr->op != nir_texop_lod,
3596 .dimension = bifrost_tex_format(instr->sampler_dim),
3597 .format = bi_texture_format(instr->dest_type | instr->def.bit_size,
3598 BI_CLAMP_NONE), /* TODO */
3599 .mask = 0xF,
3600 };
3601
3602 switch (desc.op) {
3603 case BIFROST_TEX_OP_TEX:
3604 desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE;
3605 break;
3606 case BIFROST_TEX_OP_FETCH:
3607 desc.lod_or_fetch = (enum bifrost_lod_mode)(
3608 instr->op == nir_texop_tg4
3609 ? BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component
3610 : BIFROST_TEXTURE_FETCH_TEXEL);
3611 break;
3612 case BIFROST_TEX_OP_GRDESC:
3613 break;
3614 default:
3615 unreachable("texture op unsupported");
3616 }
3617
3618 /* 32-bit indices to be allocated as consecutive staging registers */
3619 bi_index dregs[BIFROST_TEX_DREG_COUNT] = {};
3620 bi_index cx = bi_null(), cy = bi_null();
3621 bi_index ddx = bi_null();
3622 bi_index ddy = bi_null();
3623
3624 for (unsigned i = 0; i < instr->num_srcs; ++i) {
3625 bi_index index = bi_src_index(&instr->src[i].src);
3626 unsigned sz = nir_src_bit_size(instr->src[i].src);
3627 unsigned components = nir_src_num_components(instr->src[i].src);
3628 ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i);
3629 nir_alu_type T = base | sz;
3630
3631 switch (instr->src[i].src_type) {
3632 case nir_tex_src_coord:
3633 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3634 cx = bi_emit_texc_cube_coord(b, index, &cy);
3635 } else {
3636 /* Copy XY (for 2D+) or XX (for 1D) */
3637 cx = bi_extract(b, index, 0);
3638 cy = bi_extract(b, index, MIN2(1, components - 1));
3639
3640 assert(components >= 1 && components <= 3);
3641
3642 if (components == 3 && !desc.array) {
3643 /* 3D */
3644 dregs[BIFROST_TEX_DREG_Z_COORD] = bi_extract(b, index, 2);
3645 }
3646 }
3647
3648 if (desc.array) {
3649 dregs[BIFROST_TEX_DREG_ARRAY] = bi_emit_texc_array_index(
3650 b, bi_extract(b, index, components - 1), T);
3651 }
3652
3653 break;
3654
3655 case nir_tex_src_lod:
3656 if (desc.op == BIFROST_TEX_OP_TEX &&
3657 nir_src_is_const(instr->src[i].src) &&
3658 nir_src_as_uint(instr->src[i].src) == 0) {
3659 desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO;
3660 } else if (desc.op == BIFROST_TEX_OP_TEX) {
3661 assert(base == nir_type_float);
3662
3663 assert(sz == 16 || sz == 32);
3664 dregs[BIFROST_TEX_DREG_LOD] =
3665 bi_emit_texc_lod_88(b, index, sz == 16);
3666 desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT;
3667 } else {
3668 assert(desc.op == BIFROST_TEX_OP_FETCH);
3669 assert(base == nir_type_uint || base == nir_type_int);
3670 assert(sz == 16 || sz == 32);
3671
3672 dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_cube(b, index);
3673 }
3674
3675 break;
3676
3677 case nir_tex_src_ddx:
3678 ddx = index;
3679 break;
3680
3681 case nir_tex_src_ddy:
3682 ddy = index;
3683 break;
3684
3685 case nir_tex_src_bias:
3686 /* Upper 16-bits interpreted as a clamp, leave zero */
3687 assert(desc.op == BIFROST_TEX_OP_TEX);
3688 assert(base == nir_type_float);
3689 assert(sz == 16 || sz == 32);
3690 dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16);
3691 desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS;
3692 break;
3693
3694 case nir_tex_src_ms_index:
3695 case nir_tex_src_offset:
3696 if (desc.offset_or_bias_disable)
3697 break;
3698
3699 dregs[BIFROST_TEX_DREG_OFFSETMS] =
3700 bi_emit_texc_offset_ms_index(b, instr);
3701 if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero()))
3702 desc.offset_or_bias_disable = true;
3703 break;
3704
3705 case nir_tex_src_comparator:
3706 dregs[BIFROST_TEX_DREG_SHADOW] = index;
3707 break;
3708
3709 case nir_tex_src_texture_offset:
3710 dregs[BIFROST_TEX_DREG_TEXTURE] = index;
3711 break;
3712
3713 case nir_tex_src_sampler_offset:
3714 dregs[BIFROST_TEX_DREG_SAMPLER] = index;
3715 break;
3716
3717 default:
3718 unreachable("Unhandled src type in texc emit");
3719 }
3720 }
3721
3722 if (desc.op == BIFROST_TEX_OP_FETCH &&
3723 bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) {
3724 dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_cube(b, bi_zero());
3725 }
3726
3727 /* Choose an index mode */
3728
3729 bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]);
3730 bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]);
3731 bool direct = direct_tex && direct_samp;
3732
3733 desc.immediate_indices =
3734 direct && (instr->sampler_index < 16 && instr->texture_index < 128);
3735
3736 if (desc.immediate_indices) {
3737 desc.sampler_index_or_mode = instr->sampler_index;
3738 desc.index = instr->texture_index;
3739 } else {
3740 unsigned mode = 0;
3741
3742 if (direct && instr->sampler_index == instr->texture_index &&
3743 instr->sampler_index < 128) {
3744 mode = BIFROST_INDEX_IMMEDIATE_SHARED;
3745 desc.index = instr->texture_index;
3746 } else if (direct && instr->sampler_index < 128) {
3747 mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
3748 desc.index = instr->sampler_index;
3749 dregs[BIFROST_TEX_DREG_TEXTURE] =
3750 bi_mov_i32(b, bi_imm_u32(instr->texture_index));
3751 } else if (direct_tex && instr->texture_index < 128) {
3752 mode = BIFROST_INDEX_IMMEDIATE_TEXTURE;
3753 desc.index = instr->texture_index;
3754
3755 if (direct_samp) {
3756 dregs[BIFROST_TEX_DREG_SAMPLER] =
3757 bi_mov_i32(b, bi_imm_u32(instr->sampler_index));
3758 }
3759 } else if (direct_samp && instr->sampler_index < 128) {
3760 mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
3761 desc.index = instr->sampler_index;
3762
3763 if (direct_tex) {
3764 dregs[BIFROST_TEX_DREG_TEXTURE] =
3765 bi_mov_i32(b, bi_imm_u32(instr->texture_index));
3766 }
3767 } else {
3768 mode = BIFROST_INDEX_REGISTER;
3769
3770 if (direct_tex) {
3771 dregs[BIFROST_TEX_DREG_TEXTURE] =
3772 bi_mov_i32(b, bi_imm_u32(instr->texture_index));
3773 }
3774
3775 if (direct_samp) {
3776 dregs[BIFROST_TEX_DREG_SAMPLER] =
3777 bi_mov_i32(b, bi_imm_u32(instr->sampler_index));
3778 }
3779 }
3780
3781 mode |= (BIFROST_TEXTURE_OPERATION_SINGLE << 2);
3782 desc.sampler_index_or_mode = mode;
3783 }
3784
3785 if (!bi_is_null(ddx) || !bi_is_null(ddy)) {
3786 assert(!bi_is_null(ddx) && !bi_is_null(ddy));
3787 struct bifrost_texture_operation gropdesc = {
3788 .sampler_index_or_mode = desc.sampler_index_or_mode,
3789 .index = desc.index,
3790 .immediate_indices = desc.immediate_indices,
3791 .op = BIFROST_TEX_OP_GRDESC_DER,
3792 .offset_or_bias_disable = true,
3793 .shadow_or_clamp_disable = true,
3794 .array = false,
3795 .dimension = desc.dimension,
3796 .format = desc.format,
3797 .mask = desc.mask,
3798 };
3799
3800 unsigned coords_comp_count =
3801 instr->coord_components -
3802 (instr->is_array || instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE);
3803 bi_index derivs[4];
3804 unsigned sr_count = 0;
3805
3806 if (coords_comp_count > 2)
3807 derivs[sr_count++] = bi_extract(b, ddx, 2);
3808 derivs[sr_count++] = bi_extract(b, ddy, 0);
3809 if (coords_comp_count > 1)
3810 derivs[sr_count++] = bi_extract(b, ddy, 1);
3811 if (coords_comp_count > 2)
3812 derivs[sr_count++] = bi_extract(b, ddy, 2);
3813
3814 bi_index derivs_packed = bi_temp(b->shader);
3815 bi_make_vec_to(b, derivs_packed, derivs, NULL, sr_count, 32);
3816 bi_index grdesc = bi_temp(b->shader);
3817 bi_instr *I =
3818 bi_texc_to(b, grdesc, derivs_packed, bi_extract(b, ddx, 0),
3819 coords_comp_count > 1 ? bi_extract(b, ddx, 1) : bi_zero(),
3820 bi_imm_u32(gropdesc.packed), true, sr_count, 0);
3821 I->register_format = BI_REGISTER_FORMAT_U32;
3822
3823 bi_emit_cached_split_i32(b, grdesc, 4);
3824
3825 dregs[BIFROST_TEX_DREG_LOD] = bi_extract(b, grdesc, 0);
3826 desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT;
3827 }
3828
3829 /* Allocate staging registers contiguously by compacting the array. */
3830 unsigned sr_count = 0;
3831
3832 for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) {
3833 if (!bi_is_null(dregs[i]))
3834 dregs[sr_count++] = dregs[i];
3835 }
3836
3837 unsigned res_size = instr->def.bit_size == 16 ? 2 : 4;
3838
3839 bi_index sr = sr_count ? bi_temp(b->shader) : bi_null();
3840
3841 if (sr_count)
3842 bi_emit_collect_to(b, sr, dregs, sr_count);
3843
3844 if (instr->op == nir_texop_lod) {
3845 assert(instr->def.num_components == 2 && instr->def.bit_size == 32);
3846
3847 bi_index res[2];
3848 for (unsigned i = 0; i < 2; i++) {
3849 desc.shadow_or_clamp_disable = i != 0;
3850
3851 bi_index grdesc = bi_temp(b->shader);
3852 bi_instr *I = bi_texc_to(b, grdesc, sr, cx, cy,
3853 bi_imm_u32(desc.packed), false, sr_count, 0);
3854 I->register_format = BI_REGISTER_FORMAT_U32;
3855
3856 bi_emit_cached_split_i32(b, grdesc, 4);
3857
3858 bi_index lod = bi_s16_to_f32(b, bi_half(bi_extract(b, grdesc, 0), 0));
3859
3860 lod = bi_fmul_f32(b, lod, bi_imm_f32(1.0f / 256));
3861
3862 if (i == 0)
3863 lod = bi_fround_f32(b, lod, BI_ROUND_NONE);
3864
3865 res[i] = lod;
3866 }
3867
3868 bi_make_vec_to(b, bi_def_index(&instr->def), res, NULL, 2, 32);
3869 return;
3870 }
3871
3872 bi_index dst = bi_temp(b->shader);
3873
3874 bi_instr *I =
3875 bi_texc_to(b, dst, sr, cx, cy, bi_imm_u32(desc.packed),
3876 !nir_tex_instr_has_implicit_derivative(instr), sr_count, 0);
3877 I->register_format = bi_reg_fmt_for_nir(instr->dest_type);
3878
3879 bi_index w[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
3880 bi_emit_split_i32(b, w, dst, res_size);
3881 bi_emit_collect_to(b, bi_def_index(&instr->def), w,
3882 DIV_ROUND_UP(instr->def.num_components * res_size, 4));
3883 }
3884
3885 /* Staging registers required by texturing in the order they appear (Valhall) */
3886
3887 enum valhall_tex_sreg {
3888 VALHALL_TEX_SREG_X_COORD = 0,
3889 VALHALL_TEX_SREG_Y_COORD = 1,
3890 VALHALL_TEX_SREG_Z_COORD = 2,
3891 VALHALL_TEX_SREG_Y_DELTAS = 3,
3892 VALHALL_TEX_SREG_ARRAY = 4,
3893 VALHALL_TEX_SREG_SHADOW = 5,
3894 VALHALL_TEX_SREG_OFFSETMS = 6,
3895 VALHALL_TEX_SREG_LOD = 7,
3896 VALHALL_TEX_SREG_GRDESC0 = 8,
3897 VALHALL_TEX_SREG_GRDESC1 = 9,
3898 VALHALL_TEX_SREG_COUNT,
3899 };
3900
3901 static void
3902 bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr)
3903 {
3904 bool explicit_offset = false;
3905 enum bi_va_lod_mode lod_mode = BI_VA_LOD_MODE_COMPUTED_LOD;
3906
3907 bool has_lod_mode = (instr->op == nir_texop_tex) ||
3908 (instr->op == nir_texop_txl) ||
3909 (instr->op == nir_texop_txd) ||
3910 (instr->op == nir_texop_txb);
3911
3912 /* 32-bit indices to be allocated as consecutive staging registers */
3913 bi_index sregs[VALHALL_TEX_SREG_COUNT] = {};
3914 bi_index sampler = bi_imm_u32(instr->sampler_index);
3915 bi_index texture = bi_imm_u32(instr->texture_index);
3916 bi_index ddx = bi_null();
3917 bi_index ddy = bi_null();
3918
3919 for (unsigned i = 0; i < instr->num_srcs; ++i) {
3920 bi_index index = bi_src_index(&instr->src[i].src);
3921 unsigned sz = nir_src_bit_size(instr->src[i].src);
3922
3923 switch (instr->src[i].src_type) {
3924 case nir_tex_src_coord: {
3925 bool is_array = instr->is_array && instr->op != nir_texop_lod;
3926 unsigned components = nir_tex_instr_src_size(instr, i) - is_array;
3927
3928 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3929 sregs[VALHALL_TEX_SREG_X_COORD] = bi_emit_texc_cube_coord(
3930 b, index, &sregs[VALHALL_TEX_SREG_Y_COORD]);
3931 } else {
3932 assert(components >= 1 && components <= 3);
3933
3934 /* Copy XY (for 2D+) or XX (for 1D) */
3935 sregs[VALHALL_TEX_SREG_X_COORD] = index;
3936
3937 if (components >= 2)
3938 sregs[VALHALL_TEX_SREG_Y_COORD] = bi_extract(b, index, 1);
3939
3940 if (components == 3)
3941 sregs[VALHALL_TEX_SREG_Z_COORD] = bi_extract(b, index, 2);
3942 }
3943
3944 if (is_array)
3945 sregs[VALHALL_TEX_SREG_ARRAY] = bi_extract(b, index, components);
3946
3947 break;
3948 }
3949
3950 case nir_tex_src_lod:
3951 if (nir_src_is_const(instr->src[i].src) &&
3952 nir_src_as_uint(instr->src[i].src) == 0) {
3953 lod_mode = BI_VA_LOD_MODE_ZERO_LOD;
3954 } else if (has_lod_mode) {
3955 lod_mode = BI_VA_LOD_MODE_EXPLICIT;
3956
3957 assert(sz == 16 || sz == 32);
3958 sregs[VALHALL_TEX_SREG_LOD] =
3959 bi_emit_texc_lod_88(b, index, sz == 16);
3960 }
3961 break;
3962
3963 case nir_tex_src_ddx:
3964 ddx = index;
3965 break;
3966
3967 case nir_tex_src_ddy:
3968 ddy = index;
3969 break;
3970
3971 case nir_tex_src_bias:
3972 /* Upper 16-bits interpreted as a clamp, leave zero */
3973 assert(sz == 16 || sz == 32);
3974 sregs[VALHALL_TEX_SREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16);
3975
3976 lod_mode = BI_VA_LOD_MODE_COMPUTED_BIAS;
3977 break;
3978 case nir_tex_src_ms_index:
3979 case nir_tex_src_offset:
3980 /* Handled below */
3981 break;
3982
3983 case nir_tex_src_comparator:
3984 sregs[VALHALL_TEX_SREG_SHADOW] = index;
3985 break;
3986
3987 case nir_tex_src_texture_offset:
3988 /* This should always be 0 as lower_index_to_offset is expected to be
3989 * set */
3990 assert(instr->texture_index == 0);
3991 texture = index;
3992 break;
3993
3994 case nir_tex_src_sampler_offset:
3995 /* This should always be 0 as lower_index_to_offset is expected to be
3996 * set */
3997 assert(instr->sampler_index == 0);
3998 sampler = index;
3999 break;
4000
4001 default:
4002 unreachable("Unhandled src type in tex emit");
4003 }
4004 }
4005
4006 /* Generate packed offset + ms index + LOD register. These default to
4007 * zero so we only need to encode if these features are actually in use.
4008 */
4009 bi_index offsets = bi_emit_valhall_offsets(b, instr);
4010
4011 if (!bi_is_equiv(offsets, bi_zero())) {
4012 sregs[VALHALL_TEX_SREG_OFFSETMS] = offsets;
4013 explicit_offset = true;
4014 }
4015
4016 bool narrow_indices = va_is_valid_const_narrow_index(texture) &&
4017 va_is_valid_const_narrow_index(sampler);
4018
4019 bi_index src0;
4020 bi_index src1;
4021
4022 if (narrow_indices) {
4023 unsigned tex_set =
4024 va_res_fold_table_idx(pan_res_handle_get_table(texture.value));
4025 unsigned sampler_set =
4026 va_res_fold_table_idx(pan_res_handle_get_table(sampler.value));
4027 unsigned texture_index = pan_res_handle_get_index(texture.value);
4028 unsigned sampler_index = pan_res_handle_get_index(sampler.value);
4029
4030 unsigned packed_handle = (tex_set << 27) | (texture_index << 16) |
4031 (sampler_set << 11) | sampler_index;
4032
4033 src0 = bi_imm_u32(packed_handle);
4034
4035 /* TODO: narrow offsetms */
4036 src1 = bi_zero();
4037 } else {
4038 src0 = sampler;
4039 src1 = texture;
4040 }
4041
4042 enum bi_dimension dim = valhall_tex_dimension(instr->sampler_dim);
4043
4044 if (!bi_is_null(ddx) || !bi_is_null(ddy)) {
4045 unsigned coords_comp_count =
4046 instr->coord_components -
4047 (instr->is_array || instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE);
4048 assert(!bi_is_null(ddx) && !bi_is_null(ddy));
4049
4050 lod_mode = BI_VA_LOD_MODE_GRDESC;
4051
4052 bi_index derivs[6] = {
4053 bi_extract(b, ddx, 0),
4054 bi_extract(b, ddy, 0),
4055 coords_comp_count > 1 ? bi_extract(b, ddx, 1) : bi_null(),
4056 coords_comp_count > 1 ? bi_extract(b, ddy, 1) : bi_null(),
4057 coords_comp_count > 2 ? bi_extract(b, ddx, 2) : bi_null(),
4058 coords_comp_count > 2 ? bi_extract(b, ddy, 2) : bi_null(),
4059 };
4060 bi_index derivs_packed = bi_temp(b->shader);
4061 bi_make_vec_to(b, derivs_packed, derivs, NULL, coords_comp_count * 2, 32);
4062 bi_index grdesc = bi_temp(b->shader);
4063 bi_instr *I = bi_tex_gradient_to(b, grdesc, derivs_packed, src0, src1, dim,
4064 !narrow_indices, 3, coords_comp_count * 2);
4065 I->derivative_enable = true;
4066 I->force_delta_enable = false;
4067 I->lod_clamp_disable = true;
4068 I->lod_bias_disable = true;
4069 I->register_format = BI_REGISTER_FORMAT_U32;
4070
4071 bi_emit_cached_split_i32(b, grdesc, 2);
4072 sregs[VALHALL_TEX_SREG_GRDESC0] = bi_extract(b, grdesc, 0);
4073 sregs[VALHALL_TEX_SREG_GRDESC1] = bi_extract(b, grdesc, 1);
4074 }
4075
4076 /* Allocate staging registers contiguously by compacting the array. */
4077 unsigned sr_count = 0;
4078 for (unsigned i = 0; i < ARRAY_SIZE(sregs); ++i) {
4079 if (!bi_is_null(sregs[i]))
4080 sregs[sr_count++] = sregs[i];
4081 }
4082
4083 bi_index idx = sr_count ? bi_temp(b->shader) : bi_null();
4084
4085 if (sr_count)
4086 bi_make_vec_to(b, idx, sregs, NULL, sr_count, 32);
4087
4088 if (instr->op == nir_texop_lod) {
4089 assert(instr->def.num_components == 2 && instr->def.bit_size == 32);
4090
4091 bi_index res[2];
4092
4093 for (unsigned i = 0; i < 2; i++) {
4094 bi_index grdesc = bi_temp(b->shader);
4095 bi_instr *I = bi_tex_gradient_to(b, grdesc, idx, src0, src1, dim,
4096 !narrow_indices, 1, sr_count);
4097 I->derivative_enable = false;
4098 I->force_delta_enable = true;
4099 I->lod_clamp_disable = i != 0;
4100 I->register_format = BI_REGISTER_FORMAT_U32;
4101 bi_index lod = bi_s16_to_f32(b, bi_half(grdesc, 0));
4102
4103 lod = bi_fmul_f32(b, lod, bi_imm_f32(1.0f / 256));
4104
4105 if (i == 0)
4106 lod = bi_fround_f32(b, lod, BI_ROUND_NONE);
4107
4108 res[i] = lod;
4109 }
4110
4111 bi_make_vec_to(b, bi_def_index(&instr->def), res, NULL, 2, 32);
4112 return;
4113 }
4114
4115 /* Only write the components that we actually read */
4116 unsigned mask = nir_def_components_read(&instr->def);
4117 unsigned comps_per_reg = instr->def.bit_size == 16 ? 2 : 1;
4118 unsigned res_size = DIV_ROUND_UP(util_bitcount(mask), comps_per_reg);
4119
4120 enum bi_register_format regfmt = bi_reg_fmt_for_nir(instr->dest_type);
4121 bi_index dest = bi_temp(b->shader);
4122
4123 switch (instr->op) {
4124 case nir_texop_tex:
4125 case nir_texop_txb:
4126 case nir_texop_txl:
4127 case nir_texop_txd:
4128 bi_tex_single_to(b, dest, idx, src0, src1, instr->is_array, dim, regfmt,
4129 instr->is_shadow, explicit_offset, lod_mode,
4130 !narrow_indices, mask, sr_count);
4131 break;
4132 case nir_texop_txf:
4133 case nir_texop_txf_ms: {
4134 /* On Valhall, TEX_FETCH doesn't have CUBE support. This is not a problem
4135      * as a cube is just a 2D array in any case. */
4136 if (dim == BI_DIMENSION_CUBE)
4137 dim = BI_DIMENSION_2D;
4138
4139 bi_tex_fetch_to(b, dest, idx, src0, src1, instr->is_array, dim, regfmt,
4140 explicit_offset, !narrow_indices, mask, sr_count);
4141 break;
4142 }
4143 case nir_texop_tg4:
4144 bi_tex_gather_to(b, dest, idx, src0, src1, instr->is_array, dim,
4145 instr->component, false, regfmt, instr->is_shadow,
4146 explicit_offset, !narrow_indices, mask, sr_count);
4147 break;
4148 default:
4149 unreachable("Unhandled Valhall texture op");
4150 }
4151
4152 /* The hardware will write only what we read, and it will write into
4153  * contiguous registers without gaps (unlike Bifrost). NIR
4154 * expects the gaps, so fill in the holes (they'll be copypropped and
4155 * DCE'd away later).
4156 */
4157 bi_index unpacked[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
4158
4159 bi_emit_cached_split_i32(b, dest, res_size);
4160
4161 /* Index into the packed component array */
4162 unsigned j = 0;
4163 unsigned comps[4] = {0};
4164 unsigned nr_components = instr->def.num_components;
4165
4166 for (unsigned i = 0; i < nr_components; ++i) {
4167 if (mask & BITFIELD_BIT(i)) {
4168 unpacked[i] = dest;
4169 comps[i] = j++;
4170 } else {
4171 unpacked[i] = bi_zero();
4172 }
4173 }
4174
4175 bi_make_vec_to(b, bi_def_index(&instr->def), unpacked, comps,
4176 instr->def.num_components, instr->def.bit_size);
4177 }
4178
4179 /* Simple texture ops correspond to NIR tex or txl with LOD = 0 on 2D/cube
4180 * textures with sufficiently small immediate indices. Anything else
4181 * needs a complete texture op. */
4182
4183 static void
4184 bi_emit_texs(bi_builder *b, nir_tex_instr *instr)
4185 {
4186 int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
4187 assert(coord_idx >= 0);
4188 bi_index coords = bi_src_index(&instr->src[coord_idx].src);
4189
4190 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
4191 bi_index face, s, t;
4192 bi_emit_cube_coord(b, coords, &face, &s, &t);
4193
4194 bi_texs_cube_to(b, instr->def.bit_size, bi_def_index(&instr->def), s, t,
4195 face, instr->sampler_index, instr->texture_index);
4196 } else {
4197 bi_texs_2d_to(b, instr->def.bit_size, bi_def_index(&instr->def),
4198 bi_extract(b, coords, 0), bi_extract(b, coords, 1),
4199 instr->op != nir_texop_tex, /* zero LOD */
4200 instr->sampler_index, instr->texture_index);
4201 }
4202
4203 bi_split_def(b, &instr->def);
4204 }
4205
4206 static bool
4207 bi_is_simple_tex(nir_tex_instr *instr)
4208 {
4209 if (instr->op != nir_texop_tex && instr->op != nir_texop_txl)
4210 return false;
4211
4212 if (instr->dest_type != nir_type_float32 &&
4213 instr->dest_type != nir_type_float16)
4214 return false;
4215
4216 if (instr->is_shadow || instr->is_array)
4217 return false;
4218
4219 switch (instr->sampler_dim) {
4220 case GLSL_SAMPLER_DIM_2D:
4221 case GLSL_SAMPLER_DIM_EXTERNAL:
4222 case GLSL_SAMPLER_DIM_RECT:
4223 break;
4224
4225 case GLSL_SAMPLER_DIM_CUBE:
4226 /* LOD can't be specified with TEXS_CUBE */
4227 if (instr->op == nir_texop_txl)
4228 return false;
4229 break;
4230
4231 default:
4232 return false;
4233 }
4234
4235 for (unsigned i = 0; i < instr->num_srcs; ++i) {
4236 if (instr->src[i].src_type != nir_tex_src_lod &&
4237 instr->src[i].src_type != nir_tex_src_coord)
4238 return false;
4239 }
4240
4241 /* Indices need to fit in provided bits */
4242 unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 2 : 3;
4243 if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits))
4244 return false;
4245
4246 int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
4247 if (lod_idx < 0)
4248 return true;
4249
4250 nir_src lod = instr->src[lod_idx].src;
4251 return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0;
4252 }
4253
4254 static void
4255 bi_emit_tex(bi_builder *b, nir_tex_instr *instr)
4256 {
4257 /* If txf is used, we assume there is a valid sampler bound at index 0. Use
4258 * it for txf operations, since there may be no other valid samplers. This is
4259 * a workaround: txf does not require a sampler in NIR (so sampler_index is
4260 * undefined) but we need one in the hardware. This is ABI with the driver.
4261 *
4262 * On Valhall, as the descriptor table is encoded in the index, this should
4263 * be handled by the driver.
4264 */
4265 if (!nir_tex_instr_need_sampler(instr) && b->shader->arch < 9)
4266 instr->sampler_index = 0;
4267
4268 if (b->shader->arch >= 9)
4269 bi_emit_tex_valhall(b, instr);
4270 else if (bi_is_simple_tex(instr))
4271 bi_emit_texs(b, instr);
4272 else
4273 bi_emit_texc(b, instr);
4274 }
4275
4276 static void
4277 bi_emit_phi(bi_builder *b, nir_phi_instr *instr)
4278 {
4279 unsigned nr_srcs = exec_list_length(&instr->srcs);
4280 bi_instr *I = bi_phi_to(b, bi_def_index(&instr->def), nr_srcs);
4281
4282 /* Deferred */
4283 I->phi = instr;
4284 }
4285
4286 /* Look up the bi_block corresponding to a given NIR block. Used when
4287 * translating phi nodes after emitting all blocks.
4288 */
4289 static bi_block *
4290 bi_from_nir_block(bi_context *ctx, nir_block *block)
4291 {
4292 return ctx->indexed_nir_blocks[block->index];
4293 }
4294
4295 static void
4296 bi_emit_phi_deferred(bi_context *ctx, bi_block *block, bi_instr *I)
4297 {
4298 nir_phi_instr *phi = I->phi;
4299
4300 /* Guaranteed by lower_phis_to_scalar */
4301 assert(phi->def.num_components == 1);
4302
4303 nir_foreach_phi_src(src, phi) {
4304 bi_block *pred = bi_from_nir_block(ctx, src->pred);
4305 unsigned i = bi_predecessor_index(block, pred);
4306 assert(i < I->nr_srcs);
4307
4308 I->src[i] = bi_src_index(&src->src);
4309 }
4310
4311 I->phi = NULL;
4312 }
4313
4314 static void
4315 bi_emit_phis_deferred(bi_context *ctx)
4316 {
4317 bi_foreach_block(ctx, block) {
4318 bi_foreach_instr_in_block(block, I) {
4319 if (I->op == BI_OPCODE_PHI)
4320 bi_emit_phi_deferred(ctx, block, I);
4321 }
4322 }
4323 }
4324
4325 static void
4326 bi_emit_instr(bi_builder *b, struct nir_instr *instr)
4327 {
4328 switch (instr->type) {
4329 case nir_instr_type_load_const:
4330 bi_emit_load_const(b, nir_instr_as_load_const(instr));
4331 break;
4332
4333 case nir_instr_type_intrinsic:
4334 bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
4335 break;
4336
4337 case nir_instr_type_alu:
4338 bi_emit_alu(b, nir_instr_as_alu(instr));
4339 break;
4340
4341 case nir_instr_type_tex:
4342 bi_emit_tex(b, nir_instr_as_tex(instr));
4343 break;
4344
4345 case nir_instr_type_jump:
4346 bi_emit_jump(b, nir_instr_as_jump(instr));
4347 break;
4348
4349 case nir_instr_type_phi:
4350 bi_emit_phi(b, nir_instr_as_phi(instr));
4351 break;
4352
4353 default:
4354 unreachable("should've been lowered");
4355 }
4356 }
4357
4358 static bi_block *
4359 create_empty_block(bi_context *ctx)
4360 {
4361 bi_block *blk = rzalloc(ctx, bi_block);
4362
4363 util_dynarray_init(&blk->predecessors, blk);
4364
4365 return blk;
4366 }
4367
4368 static bi_block *
4369 emit_block(bi_context *ctx, nir_block *block)
4370 {
4371 if (ctx->after_block) {
4372 ctx->current_block = ctx->after_block;
4373 ctx->after_block = NULL;
4374 } else {
4375 ctx->current_block = create_empty_block(ctx);
4376 }
4377
4378 list_addtail(&ctx->current_block->link, &ctx->blocks);
4379 list_inithead(&ctx->current_block->instructions);
4380
4381 bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
4382
4383 ctx->indexed_nir_blocks[block->index] = ctx->current_block;
4384
4385 nir_foreach_instr(instr, block) {
4386 bi_emit_instr(&_b, instr);
4387 }
4388
4389 return ctx->current_block;
4390 }
4391
4392 static void
4393 emit_if(bi_context *ctx, nir_if *nif)
4394 {
4395 bi_block *before_block = ctx->current_block;
4396
4397 /* Speculatively emit the branch, but we can't fill it in until later */
4398 bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
4399 bi_instr *then_branch =
4400 bi_branchz_i16(&_b, bi_half(bi_src_index(&nif->condition), false),
4401 bi_zero(), BI_CMPF_EQ);
4402
4403 /* Emit the two subblocks. */
4404 bi_block *then_block = emit_cf_list(ctx, &nif->then_list);
4405 bi_block *end_then_block = ctx->current_block;
4406
4407 /* Emit second block */
4408
4409 bi_block *else_block = emit_cf_list(ctx, &nif->else_list);
4410 bi_block *end_else_block = ctx->current_block;
4411 ctx->after_block = create_empty_block(ctx);
4412
4413 /* Now that we have the subblocks emitted, fix up the branches */
4414
4415 assert(then_block);
4416 assert(else_block);
4417
4418 then_branch->branch_target = else_block;
4419
4420 /* Emit a jump from the end of the then block to the end of the else */
4421 _b.cursor = bi_after_block(end_then_block);
4422 bi_instr *then_exit = bi_jump(&_b, bi_zero());
4423 then_exit->branch_target = ctx->after_block;
4424
4425 bi_block_add_successor(end_then_block, then_exit->branch_target);
4426 bi_block_add_successor(end_else_block, ctx->after_block); /* fallthrough */
4427
4428 bi_block_add_successor(before_block,
4429 then_branch->branch_target); /* then_branch */
4430 bi_block_add_successor(before_block, then_block); /* fallthrough */
4431 }
4432
4433 static void
4434 emit_loop(bi_context *ctx, nir_loop *nloop)
4435 {
4436 assert(!nir_loop_has_continue_construct(nloop));
4437
4438 /* Remember where we are */
4439 bi_block *start_block = ctx->current_block;
4440
4441 bi_block *saved_break = ctx->break_block;
4442 bi_block *saved_continue = ctx->continue_block;
4443
4444 ctx->continue_block = create_empty_block(ctx);
4445 ctx->break_block = create_empty_block(ctx);
4446 ctx->after_block = ctx->continue_block;
4447 ctx->after_block->loop_header = true;
4448
4449 /* Emit the body itself */
4450 emit_cf_list(ctx, &nloop->body);
4451
4452    /* Branch back to the loop header */
4453 bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
4454 bi_instr *I = bi_jump(&_b, bi_zero());
4455 I->branch_target = ctx->continue_block;
4456 bi_block_add_successor(start_block, ctx->continue_block);
4457 bi_block_add_successor(ctx->current_block, ctx->continue_block);
4458
4459 ctx->after_block = ctx->break_block;
4460
4461 /* Pop off */
4462 ctx->break_block = saved_break;
4463 ctx->continue_block = saved_continue;
4464 ++ctx->loop_count;
4465 }
4466
4467 static bi_block *
4468 emit_cf_list(bi_context *ctx, struct exec_list *list)
4469 {
4470 bi_block *start_block = NULL;
4471
4472 foreach_list_typed(nir_cf_node, node, node, list) {
4473 switch (node->type) {
4474 case nir_cf_node_block: {
4475 bi_block *block = emit_block(ctx, nir_cf_node_as_block(node));
4476
4477 if (!start_block)
4478 start_block = block;
4479
4480 break;
4481 }
4482
4483 case nir_cf_node_if:
4484 emit_if(ctx, nir_cf_node_as_if(node));
4485 break;
4486
4487 case nir_cf_node_loop:
4488 emit_loop(ctx, nir_cf_node_as_loop(node));
4489 break;
4490
4491 default:
4492 unreachable("Unknown control flow");
4493 }
4494 }
4495
4496 return start_block;
4497 }
4498
4499 /* shader-db stuff */
4500
4501 struct bi_stats {
4502 unsigned nr_clauses, nr_tuples, nr_ins;
4503 unsigned nr_arith, nr_texture, nr_varying, nr_ldst;
4504 };
4505
4506 static void
4507 bi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats)
4508 {
4509 /* Count instructions */
4510 stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 1 : 0);
4511
4512 /* Non-message passing tuples are always arithmetic */
4513 if (tuple->add != clause->message) {
4514 stats->nr_arith++;
4515 return;
4516 }
4517
4518 /* Message + FMA we'll count as arithmetic _and_ message */
4519 if (tuple->fma)
4520 stats->nr_arith++;
4521
4522 switch (clause->message_type) {
4523 case BIFROST_MESSAGE_VARYING:
4524 /* Check components interpolated */
4525 stats->nr_varying +=
4526 (clause->message->vecsize + 1) *
4527 (bi_is_regfmt_16(clause->message->register_format) ? 1 : 2);
4528 break;
4529
4530 case BIFROST_MESSAGE_VARTEX:
4531 /* 2 coordinates, fp32 each */
4532 stats->nr_varying += (2 * 2);
4533 FALLTHROUGH;
4534 case BIFROST_MESSAGE_TEX:
4535 stats->nr_texture++;
4536 break;
4537
4538 case BIFROST_MESSAGE_ATTRIBUTE:
4539 case BIFROST_MESSAGE_LOAD:
4540 case BIFROST_MESSAGE_STORE:
4541 case BIFROST_MESSAGE_ATOMIC:
4542 stats->nr_ldst++;
4543 break;
4544
4545 case BIFROST_MESSAGE_NONE:
4546 case BIFROST_MESSAGE_BARRIER:
4547 case BIFROST_MESSAGE_BLEND:
4548 case BIFROST_MESSAGE_TILE:
4549 case BIFROST_MESSAGE_Z_STENCIL:
4550 case BIFROST_MESSAGE_ATEST:
4551 case BIFROST_MESSAGE_JOB:
4552 case BIFROST_MESSAGE_64BIT:
4553 /* Nothing to do */
4554 break;
4555 };
4556 }
4557
4558 /*
4559 * v7 allows preloading LD_VAR or VAR_TEX messages that must complete before the
4560 * shader completes. These costs are not accounted for in the general cycle
4561 * counts, so this function calculates the effective cost of these messages, as
4562 * if they were executed by shader code.
4563 */
4564 static unsigned
4565 bi_count_preload_cost(bi_context *ctx)
4566 {
4567 /* Units: 1/16 of a normalized cycle, assuming that we may interpolate
4568 * 16 fp16 varying components per cycle or fetch two texels per cycle.
4569 */
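/* Roughly: preloading a vec4 fp32 varying costs 4 * 2 = 8 units (half a
 * cycle), while a VAR_TEX preload costs 12 units (4 for the fp32
 * coordinate pair plus 8 for the texel fetch at two texels per cycle). */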
4570 unsigned cost = 0;
4571
4572 for (unsigned i = 0; i < ARRAY_SIZE(ctx->info.bifrost->messages); ++i) {
4573 struct bifrost_message_preload msg = ctx->info.bifrost->messages[i];
4574
4575 if (msg.enabled && msg.texture) {
4576 /* 2 coordinate, 2 half-words each, plus texture */
4577 cost += 12;
4578 } else if (msg.enabled) {
4579 cost += (msg.num_components * (msg.fp16 ? 1 : 2));
4580 }
4581 }
4582
4583 return cost;
4584 }
4585
4586 static const char *
4587 bi_shader_stage_name(bi_context *ctx)
4588 {
4589 if (ctx->idvs == BI_IDVS_VARYING)
4590 return "MESA_SHADER_VARYING";
4591 else if (ctx->idvs == BI_IDVS_POSITION)
4592 return "MESA_SHADER_POSITION";
4593 else if (ctx->inputs->is_blend)
4594 return "MESA_SHADER_BLEND";
4595 else
4596 return gl_shader_stage_name(ctx->stage);
4597 }
4598
4599 static char *
4600 bi_print_stats(bi_context *ctx, unsigned size)
4601 {
4602 struct bi_stats stats = {0};
4603
4604 /* Count instructions, clauses, and tuples. Also attempt to construct
4605 * normalized execution engine cycle counts, using the following ratio:
4606 *
4607 * 24 arith tuples/cycle
4608 * 2 texture messages/cycle
4609 * 16 x 16-bit varying channels interpolated/cycle
4610 * 1 load store message/cycle
4611 *
4612 * These numbers seem to match Arm Mobile Studio's heuristic. The real
4613 * cycle counts are surely more complicated.
4614 */
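/* As a worked example: 48 arithmetic tuples, 2 texture messages, 16 varying
 * channels and 3 load/store messages estimate 2.0, 1.0, 1.0 and 3.0 cycles
 * respectively, so the shader would be reported as bound at 3.0 cycles by
 * the load/store unit. */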
4615
4616 bi_foreach_block(ctx, block) {
4617 bi_foreach_clause_in_block(block, clause) {
4618 stats.nr_clauses++;
4619 stats.nr_tuples += clause->tuple_count;
4620
4621 for (unsigned i = 0; i < clause->tuple_count; ++i)
4622 bi_count_tuple_stats(clause, &clause->tuples[i], &stats);
4623 }
4624 }
4625
4626 float cycles_arith = ((float)stats.nr_arith) / 24.0;
4627 float cycles_texture = ((float)stats.nr_texture) / 2.0;
4628 float cycles_varying = ((float)stats.nr_varying) / 16.0;
4629 float cycles_ldst = ((float)stats.nr_ldst) / 1.0;
4630
4631 float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst);
4632 float cycles_bound = MAX2(cycles_arith, cycles_message);
4633
4634 /* Thread count and register pressure are traded off only on v7 */
4635 bool full_threads = (ctx->arch == 7 && ctx->info.work_reg_count <= 32);
4636 unsigned nr_threads = full_threads ? 2 : 1;
4637
4638 /* Dump stats */
4639 char *str = ralloc_asprintf(
4640 NULL,
4641 "%s shader: "
4642 "%u inst, %u tuples, %u clauses, "
4643 "%f cycles, %f arith, %f texture, %f vary, %f ldst, "
4644 "%u quadwords, %u threads",
4645 bi_shader_stage_name(ctx), stats.nr_ins, stats.nr_tuples,
4646 stats.nr_clauses, cycles_bound, cycles_arith, cycles_texture,
4647 cycles_varying, cycles_ldst, size / 16, nr_threads);
4648
4649 if (ctx->arch == 7) {
4650 ralloc_asprintf_append(&str, ", %u preloads", bi_count_preload_cost(ctx));
4651 }
4652
4653 ralloc_asprintf_append(&str, ", %u loops, %u:%u spills:fills",
4654 ctx->loop_count, ctx->spills, ctx->fills);
4655
4656 return str;
4657 }
4658
4659 static char *
4660 va_print_stats(bi_context *ctx, unsigned size)
4661 {
4662 unsigned nr_ins = 0;
4663 struct va_stats stats = {0};
4664
4665 /* Count instructions */
4666 bi_foreach_instr_global(ctx, I) {
4667 nr_ins++;
4668 va_count_instr_stats(I, &stats);
4669 }
4670
4671 /* Mali G78 peak performance:
4672 *
4673 * 64 FMA instructions per cycle
4674 * 64 CVT instructions per cycle
4675 * 16 SFU instructions per cycle
4676 * 8 x 32-bit varying channels interpolated per cycle
4677 * 4 texture instructions per cycle
4678 * 1 load/store operation per cycle
4679 */
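/* As a worked example: 128 FMA, 32 SFU and 4 texture instructions estimate
 * 2.0, 2.0 and 1.0 cycles respectively, so (assuming the other counters are
 * lower) the shader would be bound at 2.0 cycles by FMA/SFU. */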
4680
4681 float cycles_fma = ((float)stats.fma) / 64.0;
4682 float cycles_cvt = ((float)stats.cvt) / 64.0;
4683 float cycles_sfu = ((float)stats.sfu) / 16.0;
4684 float cycles_v = ((float)stats.v) / 16.0;
4685 float cycles_t = ((float)stats.t) / 4.0;
4686 float cycles_ls = ((float)stats.ls) / 1.0;
4687
4688 /* Calculate the bound */
4689 float cycles = MAX2(MAX3(cycles_fma, cycles_cvt, cycles_sfu),
4690 MAX3(cycles_v, cycles_t, cycles_ls));
4691
4692 /* Thread count and register pressure are traded off */
4693 unsigned nr_threads = (ctx->info.work_reg_count <= 32) ? 2 : 1;
4694
4695 /* Dump stats */
4696 return ralloc_asprintf(NULL,
4697 "%s shader: "
4698 "%u inst, %f cycles, %f fma, %f cvt, %f sfu, %f v, "
4699 "%f t, %f ls, %u quadwords, %u threads, %u loops, "
4700 "%u:%u spills:fills",
4701 bi_shader_stage_name(ctx), nr_ins, cycles, cycles_fma,
4702 cycles_cvt, cycles_sfu, cycles_v, cycles_t, cycles_ls,
4703 size / 16, nr_threads, ctx->loop_count, ctx->spills,
4704 ctx->fills);
4705 }
4706
4707 static int
4708 glsl_type_size(const struct glsl_type *type, bool bindless)
4709 {
4710 return glsl_count_attribute_slots(type, false);
4711 }
4712
4713 /* Split stores to memory. We don't split stores to vertex outputs, since
4714 * nir_lower_io_to_temporaries will ensure there's only a single write.
4715 */
4716
4717 static bool
4718 should_split_wrmask(const nir_instr *instr, UNUSED const void *data)
4719 {
4720 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
4721
4722 switch (intr->intrinsic) {
4723 case nir_intrinsic_store_ssbo:
4724 case nir_intrinsic_store_shared:
4725 case nir_intrinsic_store_global:
4726 case nir_intrinsic_store_scratch:
4727 return true;
4728 default:
4729 return false;
4730 }
4731 }
4732
4733 /*
4734 * Some operations are only available as 32-bit instructions. 64-bit floats are
4735 * unsupported and ints are lowered with nir_lower_int64. Certain 8-bit and
4736 * 16-bit instructions, however, are lowered here.
4737 */
4738 static unsigned
4739 bi_lower_bit_size(const nir_instr *instr, UNUSED void *data)
4740 {
4741 if (instr->type != nir_instr_type_alu)
4742 return 0;
4743
4744 nir_alu_instr *alu = nir_instr_as_alu(instr);
4745
4746 switch (alu->op) {
4747 case nir_op_fexp2:
4748 case nir_op_flog2:
4749 case nir_op_fpow:
4750 case nir_op_fsin:
4751 case nir_op_fcos:
4752 case nir_op_bit_count:
4753 case nir_op_bitfield_reverse:
4754 return (nir_src_bit_size(alu->src[0].src) == 32) ? 0 : 32;
4755 default:
4756 return 0;
4757 }
4758 }
4759
4760 /* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4,
4761  * transcendentals are an exception, as are shifts, due to the lane size
4762  * mismatch (8-bit in Bifrost vs. 32-bit in NIR; TODO: work around this). Some
4763  * conversions also need to be scalarized due to type size. */
4764
4765 static uint8_t
4766 bi_vectorize_filter(const nir_instr *instr, const void *data)
4767 {
4768 /* Defaults work for everything else */
4769 if (instr->type != nir_instr_type_alu)
4770 return 0;
4771
4772 const nir_alu_instr *alu = nir_instr_as_alu(instr);
4773
4774 switch (alu->op) {
4775 case nir_op_frcp:
4776 case nir_op_frsq:
4777 case nir_op_ishl:
4778 case nir_op_ishr:
4779 case nir_op_ushr:
4780 case nir_op_f2i16:
4781 case nir_op_f2u16:
4782 case nir_op_extract_u8:
4783 case nir_op_extract_i8:
4784 case nir_op_extract_u16:
4785 case nir_op_extract_i16:
4786 case nir_op_insert_u16:
4787 return 1;
4788 default:
4789 break;
4790 }
4791
4792 /* Vectorized instructions cannot write more than 32-bit */
4793 int dst_bit_size = alu->def.bit_size;
4794 if (dst_bit_size == 16)
4795 return 2;
4796 else
4797 return 1;
4798 }
4799
4800 static bool
4801 bi_scalarize_filter(const nir_instr *instr, const void *data)
4802 {
4803 if (instr->type != nir_instr_type_alu)
4804 return false;
4805
4806 const nir_alu_instr *alu = nir_instr_as_alu(instr);
4807
4808 switch (alu->op) {
4809 case nir_op_pack_uvec2_to_uint:
4810 case nir_op_pack_uvec4_to_uint:
4811 return false;
4812 default:
4813 return true;
4814 }
4815 }
4816
4817 /* Ensure we write exactly 4 components */
4818 static nir_def *
4819 bifrost_nir_valid_channel(nir_builder *b, nir_def *in, unsigned channel,
4820 unsigned first, unsigned mask)
4821 {
4822 if (!(mask & BITFIELD_BIT(channel)))
4823 channel = first;
4824
4825 return nir_channel(b, in, channel);
4826 }
4827
4828 /* Lower fragment store_output instructions to always write 4 components,
4829 * matching the hardware semantic. This may require additional moves. Skipping
4830 * these moves is possible in theory, but invokes undefined behaviour in the
4831 * compiler. The DDK inserts these moves, so we will as well. */
4832
4833 static bool
4834 bifrost_nir_lower_blend_components(struct nir_builder *b,
4835 nir_intrinsic_instr *intr, void *data)
4836 {
4837 if (intr->intrinsic != nir_intrinsic_store_output)
4838 return false;
4839
4840 nir_def *in = intr->src[0].ssa;
4841 unsigned first = nir_intrinsic_component(intr);
4842 unsigned mask = nir_intrinsic_write_mask(intr);
4843
4844 assert(first == 0 && "shouldn't get nonzero components");
4845
4846 /* Nothing to do */
4847 if (mask == BITFIELD_MASK(4))
4848 return false;
4849
4850 b->cursor = nir_before_instr(&intr->instr);
4851
4852 /* Replicate the first valid component instead */
4853 nir_def *replicated =
4854 nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask),
4855 bifrost_nir_valid_channel(b, in, 1, first, mask),
4856 bifrost_nir_valid_channel(b, in, 2, first, mask),
4857 bifrost_nir_valid_channel(b, in, 3, first, mask));
4858
4859 /* Rewrite to use our replicated version */
4860 nir_src_rewrite(&intr->src[0], replicated);
4861 nir_intrinsic_set_component(intr, 0);
4862 nir_intrinsic_set_write_mask(intr, 0xF);
4863 intr->num_components = 4;
4864
4865 return true;
4866 }
4867
4868 static nir_mem_access_size_align
4869 mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
4870 uint8_t bit_size, uint32_t align_mul,
4871 uint32_t align_offset, bool offset_is_const,
4872 enum gl_access_qualifier access, const void *cb_data)
4873 {
4874 uint32_t align = nir_combined_align(align_mul, align_offset);
4875 assert(util_is_power_of_two_nonzero(align));
4876
4877 /* No more than 16 bytes at a time. */
4878 bytes = MIN2(bytes, 16);
4879
4880 /* If the number of bytes is a multiple of 4, use 32-bit loads. Else if it's
4881 * a multiple of 2, use 16-bit loads. Else use 8-bit loads.
4882 *
4883 * But if we're only aligned to 1 byte, use 8-bit loads. If we're only
4884 * aligned to 2 bytes, use 16-bit loads, unless we needed 8-bit loads due to
4885 * the size.
4886 */
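/* For example, a 12-byte access aligned to 4 (with a 32-bit source type)
 * becomes 3 x 32-bit components, while a 6-byte access aligned to 2 becomes
 * 3 x 16-bit components. */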
4887 if ((bytes & 1) || (align == 1))
4888 bit_size = 8;
4889 else if ((bytes & 2) || (align == 2))
4890 bit_size = 16;
4891 else if (bit_size >= 32)
4892 bit_size = 32;
4893
4894 unsigned num_comps = MIN2(bytes / (bit_size / 8), 4);
4895
4896 /* Push constants require 32-bit loads. */
4897 if (intrin == nir_intrinsic_load_push_constant) {
4898 if (align_mul >= 4) {
4899 /* If align_mul is bigger than 4 we can use align_offset to find
4900 * the exact number of words we need to read.
4901 */
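/* e.g. 8 bytes at align_offset 6 straddle three words:
 * DIV_ROUND_UP((6 % 4) + 8, 4) = 3. */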
4902 num_comps = DIV_ROUND_UP((align_offset % 4) + bytes, 4);
4903 } else {
4904 /* If bytes is aligned on 32-bit, the access might still cross one
4905 * word at the beginning, and one word at the end. If bytes is not
4906 * aligned on 32-bit, the extra two words should cover for both the
4907 * size and offset mis-alignment.
4908 */
4909 num_comps = (bytes / 4) + 2;
4910 }
4911
4912 bit_size = MAX2(bit_size, 32);
4913 align = 4;
4914 } else {
4915 align = bit_size / 8;
4916 }
4917
4918 return (nir_mem_access_size_align){
4919 .num_components = num_comps,
4920 .bit_size = bit_size,
4921 .align = align,
4922 .shift = nir_mem_access_shift_method_scalar,
4923 };
4924 }
4925
4926 static bool
4927 mem_vectorize_cb(unsigned align_mul, unsigned align_offset, unsigned bit_size,
4928 unsigned num_components, int64_t hole_size,
4929 nir_intrinsic_instr *low, nir_intrinsic_instr *high,
4930 void *data)
4931 {
4932 if (hole_size > 0)
4933 return false;
4934
4935 /* Must be aligned to the size of the load */
4936 unsigned align = nir_combined_align(align_mul, align_offset);
4937 if ((bit_size / 8) > align)
4938 return false;
4939
4940 if (num_components > 4)
4941 return false;
4942
4943 if (bit_size > 32)
4944 return false;
4945
4946 return true;
4947 }
4948
4949 static void
4950 bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
4951 {
4952 NIR_PASS(_, nir, nir_opt_shrink_stores, true);
4953
4954 bool progress;
4955
4956 do {
4957 progress = false;
4958
4959 NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
4960 NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL);
4961
4962 NIR_PASS(progress, nir, nir_copy_prop);
4963 NIR_PASS(progress, nir, nir_opt_remove_phis);
4964 NIR_PASS(progress, nir, nir_opt_dce);
4965 NIR_PASS(progress, nir, nir_opt_dead_cf);
4966 NIR_PASS(progress, nir, nir_opt_cse);
4967 NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
4968 NIR_PASS(progress, nir, nir_opt_algebraic);
4969 NIR_PASS(progress, nir, nir_opt_constant_folding);
4970
4971 NIR_PASS(progress, nir, nir_opt_undef);
4972 NIR_PASS(progress, nir, nir_lower_undef_to_zero);
4973
4974 NIR_PASS(progress, nir, nir_opt_shrink_vectors, false);
4975 NIR_PASS(progress, nir, nir_opt_loop_unroll);
4976 } while (progress);
4977
4978 NIR_PASS(
4979 progress, nir, nir_opt_load_store_vectorize,
4980 &(const nir_load_store_vectorize_options){
4981 .modes = nir_var_mem_global | nir_var_mem_shared | nir_var_shader_temp,
4982 .callback = mem_vectorize_cb,
4983 });
4984 NIR_PASS(progress, nir, nir_lower_pack);
4985
4986 /* nir_lower_pack can generate split operations, execute algebraic again to
4987 * handle them */
4988 NIR_PASS(progress, nir, nir_opt_algebraic);
4989
4990 /* TODO: Why is 64-bit getting rematerialized?
4991 * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */
4992 NIR_PASS(progress, nir, nir_lower_int64);
4993
4994 /* We need to clean up after each iteration of late algebraic
4995 * optimizations, since otherwise NIR can produce weird edge cases
4996 * (like fneg of a constant) which we don't handle */
4997 bool late_algebraic = true;
4998 while (late_algebraic) {
4999 late_algebraic = false;
5000 NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
5001 NIR_PASS(progress, nir, nir_opt_constant_folding);
5002 NIR_PASS(progress, nir, nir_copy_prop);
5003 NIR_PASS(progress, nir, nir_opt_dce);
5004 NIR_PASS(progress, nir, nir_opt_cse);
5005 }
5006
5007 /* This opt currently helps on Bifrost but not Valhall */
5008 if (gpu_id < 0x9000)
5009 NIR_PASS(progress, nir, bifrost_nir_opt_boolean_bitwise);
5010
5011 NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
5012 NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL);
5013 NIR_PASS(progress, nir, nir_lower_bool_to_bitsize);
5014
5015 /* Prepass to simplify instruction selection */
5016 late_algebraic = false;
5017 NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late);
5018
5019 while (late_algebraic) {
5020 late_algebraic = false;
5021 NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
5022 NIR_PASS(progress, nir, nir_opt_constant_folding);
5023 NIR_PASS(progress, nir, nir_copy_prop);
5024 NIR_PASS(progress, nir, nir_opt_dce);
5025 NIR_PASS(progress, nir, nir_opt_cse);
5026 }
5027
5028 NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
5029 NIR_PASS(progress, nir, nir_opt_dce);
5030
5031 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
5032 NIR_PASS(_, nir, nir_shader_intrinsics_pass,
5033 bifrost_nir_lower_blend_components, nir_metadata_control_flow,
5034 NULL);
5035 }
5036
5037 /* Backend scheduler is purely local, so do some global optimizations
5038 * to reduce register pressure. */
5039 nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
5040 nir_move_load_input | nir_move_comparisons |
5041 nir_move_copies | nir_move_load_ssbo;
5042
5043 NIR_PASS(_, nir, nir_opt_sink, move_all);
5044 NIR_PASS(_, nir, nir_opt_move, move_all);
5045
5046 /* We might lower attribute, varying, and image indirects. Use the
5047 * gathered info to skip the extra analysis in the happy path. */
5048 bool any_indirects = nir->info.inputs_read_indirectly ||
5049 nir->info.outputs_accessed_indirectly ||
5050 nir->info.patch_inputs_read_indirectly ||
5051 nir->info.patch_outputs_accessed_indirectly ||
5052 nir->info.images_used[0];
5053
5054 if (any_indirects) {
5055 nir_divergence_analysis(nir);
5056 NIR_PASS(_, nir, bi_lower_divergent_indirects,
5057 pan_subgroup_size(pan_arch(gpu_id)));
5058 }
5059 }
5060
5061 static void
5062 bi_opt_post_ra(bi_context *ctx)
5063 {
5064 bi_foreach_instr_global_safe(ctx, ins) {
5065 if (ins->op == BI_OPCODE_MOV_I32 &&
5066 bi_is_equiv(ins->dest[0], ins->src[0]))
5067 bi_remove_instruction(ins);
5068 }
5069 }
5070
5071 /* Dead code elimination for branches at the end of a block - only one branch
5072 * per block is legal semantically, but unreachable jumps can be generated.
5073 * Likewise on Bifrost we can generate jumps to the terminal block which need
5074 * to be lowered away to a jump to #0x0, which induces successful termination.
5075 * That trick doesn't work on Valhall, which needs a NOP inserted in the
5076 * terminal block instead.
5077 */
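/*
 * Illustrative sketch (hypothetical block contents, not actual compiler
 * output): for a block ending in
 *
 *    branch_a -> block_2    first branch, kept
 *    branch_b -> block_3    unreachable, removed below
 *
 * only branch_a survives. If the surviving branch targets the terminal block,
 * Bifrost clears its target (yielding the jump to #0x0 described above) while
 * Valhall marks the terminal block with needs_nop instead.
 */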
5078 static void
5079 bi_lower_branch(bi_context *ctx, bi_block *block)
5080 {
5081 bool cull_terminal = (ctx->arch <= 8);
5082 bool branched = false;
5083
5084 bi_foreach_instr_in_block_safe(block, ins) {
5085 if (!ins->branch_target)
5086 continue;
5087
5088 if (branched) {
5089 bi_remove_instruction(ins);
5090 continue;
5091 }
5092
5093 branched = true;
5094
5095 if (!bi_is_terminal_block(ins->branch_target))
5096 continue;
5097
5098 if (cull_terminal)
5099 ins->branch_target = NULL;
5100 else if (ins->branch_target)
5101 ins->branch_target->needs_nop = true;
5102 }
5103 }
5104
5105 static void
5106 bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary, unsigned offset)
5107 {
5108 unsigned final_clause = bi_pack(ctx, binary);
5109
5110 /* If we need to wait for ATEST or BLEND in the first clause, pass the
5111 * corresponding bits through to the renderer state descriptor */
5112 bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
5113 bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL);
5114
5115 unsigned first_deps = first_clause ? first_clause->dependencies : 0;
5116 ctx->info.bifrost->wait_6 = (first_deps & (1 << 6));
5117 ctx->info.bifrost->wait_7 = (first_deps & (1 << 7));
5118
5119 /* Pad the shader with enough zero bytes to trick the prefetcher,
5120 * unless we're compiling an empty shader (in which case we don't pad
5121 * so the size remains 0) */
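/* Worked example (illustrative): if BIFROST_SHADER_PREFETCH is 128 and the
 * final clause packs to 48 bytes, 128 - 48 = 80 zero bytes are appended, so
 * the prefetch window past the final clause only ever reads valid
 * instructions or zeros. */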
5122 unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause;
5123
5124 if (binary->size - offset) {
5125 memset(util_dynarray_grow(binary, uint8_t, prefetch_size), 0,
5126 prefetch_size);
5127 }
5128 }
5129
5130 /*
5131 * Build a bit mask of varyings (by location) that are flatshaded. This
5132 * information is needed by lower_mediump_io, as we don't yet support 16-bit
5133 * flat varyings.
5134 *
5135 * Also, varyings that are used as texture coordinates should be kept at fp32
5136 * so the texture instruction may be promoted to VAR_TEX. In general this is a
5137 * good idea, as fp16 texture coordinates are not supported by the hardware and
5138 * usually lack the required precision. (There are relevant CTS bugs for both cases.)
5139 *
5140 * TODO: If we compacted the varyings with some fixup code in the vertex shader,
5141 * we could implement 16-bit flat varyings. Consider if this case matters.
5142 *
5143 * TODO: The texture coordinate handling could be less heavy-handed.
5144 */
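/*
 * For instance (hypothetical shader): a flat input at VARYING_SLOT_VAR0 plus a
 * smooth input at VARYING_SLOT_VAR1 that is only used as a texture coordinate
 * yields
 *
 *    mask = BITFIELD64_BIT(VARYING_SLOT_VAR0) | BITFIELD64_BIT(VARYING_SLOT_VAR1);
 *
 * and the complement (~mask) is what gets passed to nir_lower_mediump_io, so
 * both locations stay at fp32.
 */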
5145 static bool
5146 bi_gather_texcoords(nir_builder *b, nir_instr *instr, void *data)
5147 {
5148 uint64_t *mask = data;
5149
5150 if (instr->type != nir_instr_type_tex)
5151 return false;
5152
5153 nir_tex_instr *tex = nir_instr_as_tex(instr);
5154
5155 int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
5156 if (coord_idx < 0)
5157 return false;
5158
5159 nir_src src = tex->src[coord_idx].src;
5160 nir_scalar x = nir_scalar_resolved(src.ssa, 0);
5161 nir_scalar y = nir_scalar_resolved(src.ssa, 1);
5162
5163 if (x.def != y.def)
5164 return false;
5165
5166 nir_instr *parent = x.def->parent_instr;
5167
5168 if (parent->type != nir_instr_type_intrinsic)
5169 return false;
5170
5171 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
5172
5173 if (intr->intrinsic != nir_intrinsic_load_interpolated_input)
5174 return false;
5175
5176 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
5177 *mask |= BITFIELD64_BIT(sem.location);
5178 return false;
5179 }
5180
5181 static uint64_t
5182 bi_fp32_varying_mask(nir_shader *nir)
5183 {
5184 uint64_t mask = 0;
5185
5186 assert(nir->info.stage == MESA_SHADER_FRAGMENT);
5187
5188 nir_foreach_shader_in_variable(var, nir) {
5189 if (var->data.interpolation == INTERP_MODE_FLAT)
5190 mask |= BITFIELD64_BIT(var->data.location);
5191 }
5192
5193 nir_shader_instructions_pass(nir, bi_gather_texcoords, nir_metadata_all,
5194 &mask);
5195
5196 return mask;
5197 }
5198
5199 static bool
5200 bi_lower_sample_mask_writes(nir_builder *b, nir_intrinsic_instr *intr,
5201 void *data)
5202 {
5203 if (intr->intrinsic != nir_intrinsic_store_output)
5204 return false;
5205
5206 assert(b->shader->info.stage == MESA_SHADER_FRAGMENT);
5207 if (nir_intrinsic_io_semantics(intr).location != FRAG_RESULT_SAMPLE_MASK)
5208 return false;
5209
5210 b->cursor = nir_before_instr(&intr->instr);
5211
5212 nir_def *orig = nir_load_sample_mask(b);
5213
5214 nir_src_rewrite(&intr->src[0], nir_iand(b, orig, intr->src[0].ssa));
5215 return true;
5216 }
5217
5218 static bool
5219 bi_lower_load_output(nir_builder *b, nir_intrinsic_instr *intr,
5220 UNUSED void *data)
5221 {
5222 if (intr->intrinsic != nir_intrinsic_load_output)
5223 return false;
5224
5225 unsigned loc = nir_intrinsic_io_semantics(intr).location;
5226 assert(loc >= FRAG_RESULT_DATA0);
5227 unsigned rt = loc - FRAG_RESULT_DATA0;
5228
5229 b->cursor = nir_before_instr(&intr->instr);
5230
5231 nir_def *conversion = nir_load_rt_conversion_pan(
5232 b, .base = rt, .src_type = nir_intrinsic_dest_type(intr));
5233
5234 nir_def *lowered = nir_load_converted_output_pan(
5235 b, intr->def.num_components, intr->def.bit_size, conversion,
5236 .dest_type = nir_intrinsic_dest_type(intr),
5237 .io_semantics = nir_intrinsic_io_semantics(intr));
5238
5239 nir_def_rewrite_uses(&intr->def, lowered);
5240 return true;
5241 }
5242
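/*
 * Worked example for the lowering below (illustrative, assuming a 16-wide
 * subgroup and a fixed 8x4x2 workgroup):
 *
 *    flat_local_id = x + 8 * (y + 4 * z);              ranges over 0..63
 *    subgroup_id   = flat_local_id / 16;               lowered to >> 4
 *    subgroup_size = 16;
 *    num_subgroups = DIV_ROUND_UP(8 * 4 * 2, 16);      = 4
 */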
5243 static bool
5244 bi_lower_subgroups(nir_builder *b, nir_intrinsic_instr *intr, void *data)
5245 {
5246 unsigned int gpu_id = *(unsigned int *)data;
5247 unsigned int arch = pan_arch(gpu_id);
5248
5249 b->cursor = nir_before_instr(&intr->instr);
5250
5251 nir_def *val = NULL;
5252 switch (intr->intrinsic) {
5253 case nir_intrinsic_vote_any:
5254 val = nir_ine_imm(b, nir_ballot(b, 1, 32, intr->src[0].ssa), 0);
5255 break;
5256
5257 case nir_intrinsic_vote_all:
5258 val = nir_ieq_imm(b, nir_ballot(b, 1, 32, nir_inot(b, intr->src[0].ssa)), 0);
5259 break;
5260
5261 case nir_intrinsic_load_subgroup_id: {
5262 nir_def *local_id = nir_load_local_invocation_id(b);
5263 nir_def *local_size = nir_load_workgroup_size(b);
5264 /* local_id.x + local_size.x * (local_id.y + local_size.y * local_id.z) */
5265 nir_def *flat_local_id =
5266 nir_iadd(b,
5267 nir_channel(b, local_id, 0),
5268 nir_imul(b,
5269 nir_channel(b, local_size, 0),
5270 nir_iadd(b,
5271 nir_channel(b, local_id, 1),
5272 nir_imul(b,
5273 nir_channel(b, local_size, 1),
5274 nir_channel(b, local_id, 2)))));
5275 /*
5276 * Since pan_subgroup_size() is a power of two, nir_udiv_imm will construct
5277 * a right shift instead of a udiv (e.g. >> 4 for a 16-wide subgroup).
5278 */
5279 val = nir_udiv_imm(b, flat_local_id, pan_subgroup_size(arch));
5280 break;
5281 }
5282
5283 case nir_intrinsic_load_subgroup_size:
5284 val = nir_imm_int(b, pan_subgroup_size(arch));
5285 break;
5286
5287 case nir_intrinsic_load_num_subgroups: {
5288 uint32_t subgroup_size = pan_subgroup_size(arch);
5289 assert(!b->shader->info.workgroup_size_variable);
5290 uint32_t workgroup_size =
5291 b->shader->info.workgroup_size[0] *
5292 b->shader->info.workgroup_size[1] *
5293 b->shader->info.workgroup_size[2];
5294 uint32_t num_subgroups = DIV_ROUND_UP(workgroup_size, subgroup_size);
5295 val = nir_imm_int(b, num_subgroups);
5296 break;
5297 }
5298
5299 default:
5300 return false;
5301 }
5302
5303 nir_def_rewrite_uses(&intr->def, val);
5304 return true;
5305 }
5306
5307 bool
5308 bifrost_nir_lower_load_output(nir_shader *nir)
5309 {
5310 assert(nir->info.stage == MESA_SHADER_FRAGMENT);
5311
5312 return nir_shader_intrinsics_pass(
5313 nir, bi_lower_load_output,
5314 nir_metadata_control_flow, NULL);
5315 }
5316
5317 void
5318 bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id)
5319 {
5320 /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
5321 * (so we don't accidentally duplicate the epilogue since mesa/st has
5322 * messed with our I/O quite a bit already) */
5323
5324 NIR_PASS(_, nir, nir_lower_vars_to_ssa);
5325
5326 if (nir->info.stage == MESA_SHADER_VERTEX) {
5327 if (pan_arch(gpu_id) <= 7)
5328 NIR_PASS(_, nir, pan_nir_lower_vertex_id);
5329
5330 NIR_PASS(_, nir, nir_lower_viewport_transform);
5331 NIR_PASS(_, nir, nir_lower_point_size, 1.0, 0.0);
5332
5333 nir_variable *psiz = nir_find_variable_with_location(
5334 nir, nir_var_shader_out, VARYING_SLOT_PSIZ);
5335 if (psiz != NULL)
5336 psiz->data.precision = GLSL_PRECISION_MEDIUM;
5337 }
5338
5339 /* Get rid of any global vars before we lower to scratch. */
5340 NIR_PASS(_, nir, nir_lower_global_vars_to_local);
5341
5342 /* Valhall introduces packed thread local storage, which improves cache
5343 * locality of TLS access. However, access to packed TLS cannot
5344 * straddle 16-byte boundaries. As such, when packed TLS is in use
5345 * (currently unconditional for Valhall), we force vec4 alignment for
5346 * scratch access.
5347 */
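/* For example (assuming vec4 size/align semantics for scratch): a float[3]
 * function temporary spilled to scratch occupies three 16-byte slots (48
 * bytes) rather than 12 tightly packed bytes, so no element access straddles
 * a 16-byte boundary. */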
5348 glsl_type_size_align_func vars_to_scratch_size_align_func =
5349 (gpu_id >= 0x9000) ? glsl_get_vec4_size_align_bytes
5350 : glsl_get_natural_size_align_bytes;
5351 /* Lower large arrays to scratch and small arrays to bcsel */
5352 NIR_PASS(_, nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
5353 vars_to_scratch_size_align_func, vars_to_scratch_size_align_func);
5354 NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
5355
5356 NIR_PASS(_, nir, nir_split_var_copies);
5357 NIR_PASS(_, nir, nir_lower_var_copies);
5358 NIR_PASS(_, nir, nir_lower_vars_to_ssa);
5359 NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
5360 glsl_type_size, nir_lower_io_use_interpolated_input_intrinsics);
5361
5362 if (nir->info.stage == MESA_SHADER_VERTEX)
5363 NIR_PASS(_, nir, pan_nir_lower_noperspective_vs);
5364 if (nir->info.stage == MESA_SHADER_FRAGMENT)
5365 NIR_PASS(_, nir, pan_nir_lower_noperspective_fs);
5366
5367 /* nir_lower[_explicit]_io is lazy and emits mul+add chains even for
5368 * offsets it could figure out are constant. Do some constant folding
5369 * before pan_nir_lower_store_component below.
5370 */
5371 NIR_PASS(_, nir, nir_opt_constant_folding);
5372
5373 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
5374 NIR_PASS(_, nir, nir_lower_mediump_io,
5375 nir_var_shader_in | nir_var_shader_out,
5376 ~bi_fp32_varying_mask(nir), false);
5377
5378 NIR_PASS(_, nir, nir_shader_intrinsics_pass, bi_lower_sample_mask_writes,
5379 nir_metadata_control_flow, NULL);
5380
5381 NIR_PASS(_, nir, bifrost_nir_lower_load_output);
5382 } else if (nir->info.stage == MESA_SHADER_VERTEX) {
5383 if (gpu_id >= 0x9000) {
5384 NIR_PASS(_, nir, nir_lower_mediump_io, nir_var_shader_out,
5385 BITFIELD64_BIT(VARYING_SLOT_PSIZ), false);
5386 }
5387
5388 NIR_PASS(_, nir, pan_nir_lower_store_component);
5389 }
5390
5391 nir_lower_mem_access_bit_sizes_options mem_size_options = {
5392 .modes = nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_ssbo |
5393 nir_var_mem_constant | nir_var_mem_task_payload |
5394 nir_var_shader_temp | nir_var_function_temp |
5395 nir_var_mem_global | nir_var_mem_shared,
5396 .callback = mem_access_size_align_cb,
5397 };
5398 NIR_PASS(_, nir, nir_lower_mem_access_bit_sizes, &mem_size_options);
5399
5400 nir_lower_ssbo_options ssbo_opts = {
5401 .native_loads = pan_arch(gpu_id) >= 9,
5402 .native_offset = pan_arch(gpu_id) >= 9,
5403 };
5404 NIR_PASS(_, nir, nir_lower_ssbo, &ssbo_opts);
5405
5406 NIR_PASS(_, nir, pan_lower_sample_pos);
5407 NIR_PASS(_, nir, nir_lower_bit_size, bi_lower_bit_size, NULL);
5408 NIR_PASS(_, nir, nir_lower_64bit_phis);
5409 NIR_PASS(_, nir, pan_lower_helper_invocation);
5410 NIR_PASS(_, nir, nir_lower_int64);
5411
5412 NIR_PASS(_, nir, nir_opt_idiv_const, 8);
5413 NIR_PASS(_, nir, nir_lower_idiv,
5414 &(nir_lower_idiv_options){.allow_fp16 = true});
5415
5416 NIR_PASS(_, nir, nir_lower_tex,
5417 &(nir_lower_tex_options){
5418 .lower_txs_lod = true,
5419 .lower_txp = ~0,
5420 .lower_tg4_broadcom_swizzle = true,
5421 .lower_txd_cube_map = true,
5422 .lower_invalid_implicit_lod = true,
5423 .lower_index_to_offset = true,
5424 });
5425
5426 NIR_PASS(_, nir, nir_lower_image_atomics_to_global);
5427
5428 /* On Bifrost, lower MSAA image load/stores to 3D load/stores */
5429 if (pan_arch(gpu_id) < 9)
5430 NIR_PASS(_, nir, pan_nir_lower_image_ms);
5431
5432 /*
5433 * TODO: we can implement certain operations (notably reductions, scans,
5434 * certain shuffles, etc) more efficiently than nir_lower_subgroups. Moreover
5435 * we can implement reductions and scans on f16vec2 values without splitting
5436 * to scalar first.
5437 */
5438 bool lower_subgroups_progress = false;
5439 NIR_PASS(lower_subgroups_progress, nir, nir_lower_subgroups,
5440 &(nir_lower_subgroups_options) {
5441 .subgroup_size = pan_subgroup_size(pan_arch(gpu_id)),
5442 .ballot_bit_size = 32,
5443 .ballot_components = 1,
5444 .lower_to_scalar = true,
5445 .lower_vote_eq = true,
5446 .lower_vote_bool_eq = true,
5447 .lower_first_invocation_to_ballot = true,
5448 .lower_read_first_invocation = true,
5449 .lower_subgroup_masks = true,
5450 .lower_relative_shuffle = true,
5451 .lower_shuffle = true,
5452 .lower_quad = true,
5453 .lower_quad_broadcast_dynamic = true,
5454 .lower_quad_vote = true,
5455 .lower_elect = true,
5456 .lower_rotate_to_shuffle = true,
5457 .lower_rotate_clustered_to_shuffle = true,
5458 .lower_inverse_ballot = true,
5459 .lower_reduce = true,
5460 .lower_boolean_reduce = true,
5461 .lower_boolean_shuffle = true,
5462 });
5463 /* nir_lower_subgroups creates new vars, clean them up. */
5464 if (lower_subgroups_progress)
5465 NIR_PASS(_, nir, nir_lower_vars_to_ssa);
5466
5467 NIR_PASS(_, nir, nir_shader_intrinsics_pass, bi_lower_subgroups,
5468 nir_metadata_control_flow, &gpu_id);
5469
5470 NIR_PASS(_, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
5471 NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
5472 NIR_PASS(_, nir, nir_lower_phis_to_scalar, true);
5473 NIR_PASS(_, nir, nir_lower_flrp, 16 | 32 | 64, false /* always_precise */);
5474 NIR_PASS(_, nir, nir_lower_var_copies);
5475 NIR_PASS(_, nir, nir_lower_alu);
5476 NIR_PASS(_, nir, nir_lower_frag_coord_to_pixel_coord);
5477 NIR_PASS(_, nir, pan_nir_lower_frag_coord_zw);
5478 }
5479
5480 static bi_context *
5481 bi_compile_variant_nir(nir_shader *nir,
5482 const struct panfrost_compile_inputs *inputs,
5483 struct util_dynarray *binary, struct bi_shader_info info,
5484 enum bi_idvs_mode idvs)
5485 {
5486 bi_context *ctx = rzalloc(NULL, bi_context);
5487
5488 /* There may be another program in the dynarray; start at the end */
5489 unsigned offset = binary->size;
5490
5491 ctx->inputs = inputs;
5492 ctx->nir = nir;
5493 ctx->stage = nir->info.stage;
5494 ctx->quirks = bifrost_get_quirks(inputs->gpu_id);
5495 ctx->arch = pan_arch(inputs->gpu_id);
5496 ctx->info = info;
5497 ctx->idvs = idvs;
5498 ctx->malloc_idvs = (ctx->arch >= 9) && !inputs->no_idvs;
5499
5500 if (idvs != BI_IDVS_NONE) {
5501 /* Specializing shaders for IDVS is destructive, so we need to
5502 * clone. However, the last (second) IDVS shader does not need
5503 * to be preserved so we can skip cloning that one.
5504 */
5505 if (offset == 0)
5506 ctx->nir = nir = nir_shader_clone(ctx, nir);
5507
5508 NIR_PASS(_, nir, nir_shader_instructions_pass,
5509 bifrost_nir_specialize_idvs, nir_metadata_control_flow, &idvs);
5510
5511 /* After specializing, clean up the mess */
5512 bool progress = true;
5513
5514 while (progress) {
5515 progress = false;
5516
5517 NIR_PASS(progress, nir, nir_opt_dce);
5518 NIR_PASS(progress, nir, nir_opt_dead_cf);
5519 }
5520 }
5521
5522 /* If nothing is pushed, all UBOs need to be uploaded */
5523 ctx->ubo_mask = ~0;
5524
5525 list_inithead(&ctx->blocks);
5526
5527 bool skip_internal = nir->info.internal;
5528 skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL);
5529
5530 if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
5531 nir_print_shader(nir, stdout);
5532 }
5533
5534 ctx->allocated_vec = _mesa_hash_table_u64_create(ctx);
5535
5536 nir_foreach_function_impl(impl, nir) {
5537 nir_index_blocks(impl);
5538
5539 ctx->indexed_nir_blocks =
5540 rzalloc_array(ctx, bi_block *, impl->num_blocks);
5541
5542 ctx->ssa_alloc += impl->ssa_alloc;
5543
5544 emit_cf_list(ctx, &impl->body);
5545 bi_emit_phis_deferred(ctx);
5546 break; /* TODO: Multi-function shaders */
5547 }
5548
5549 /* Index blocks now that we're done emitting */
5550 bi_foreach_block(ctx, block) {
5551 block->index = ctx->num_blocks++;
5552 }
5553
5554 bi_validate(ctx, "NIR -> BIR");
5555
5556 /* If the shader doesn't write any colour or depth outputs, it may
5557 * still need an ATEST at the very end! */
5558 bool need_dummy_atest = (ctx->stage == MESA_SHADER_FRAGMENT) &&
5559 !ctx->emitted_atest && !bi_skip_atest(ctx, false);
5560
5561 if (need_dummy_atest) {
5562 bi_block *end = list_last_entry(&ctx->blocks, bi_block, link);
5563 bi_builder b = bi_init_builder(ctx, bi_after_block(end));
5564 bi_emit_atest(&b, bi_zero());
5565 }
5566
5567 bool optimize = !(bifrost_debug & BIFROST_DBG_NOOPT);
5568
5569 /* Runs before constant folding */
5570 bi_lower_swizzle(ctx);
5571 bi_validate(ctx, "Early lowering");
5572
5573 /* Runs before copy prop */
5574 if (optimize && !ctx->inputs->no_ubo_to_push) {
5575 bi_opt_push_ubo(ctx);
5576 }
5577
5578 if (likely(optimize)) {
5579 bi_opt_copy_prop(ctx);
5580
5581 while (bi_opt_constant_fold(ctx))
5582 bi_opt_copy_prop(ctx);
5583
5584 bi_opt_mod_prop_forward(ctx);
5585 bi_opt_mod_prop_backward(ctx);
5586
5587 /* Push LD_VAR_IMM/VAR_TEX instructions. Must run after
5588 * mod_prop_backward to fuse VAR_TEX */
5589 if (ctx->arch == 7 && ctx->stage == MESA_SHADER_FRAGMENT &&
5590 !(bifrost_debug & BIFROST_DBG_NOPRELOAD)) {
5591 bi_opt_dce(ctx, false);
5592 bi_opt_message_preload(ctx);
5593 bi_opt_copy_prop(ctx);
5594 }
5595
5596 bi_opt_dce(ctx, false);
5597 bi_opt_cse(ctx);
5598 bi_opt_dce(ctx, false);
5599 if (!ctx->inputs->no_ubo_to_push)
5600 bi_opt_reorder_push(ctx);
5601 bi_validate(ctx, "Optimization passes");
5602 }
5603
5604 bi_lower_opt_instructions(ctx);
5605
5606 if (ctx->arch >= 9) {
5607 va_optimize(ctx);
5608 va_lower_isel(ctx);
5609
5610 bi_foreach_instr_global_safe(ctx, I) {
5611 /* Phis become single moves so shouldn't be affected */
5612 if (I->op == BI_OPCODE_PHI)
5613 continue;
5614
5615 va_lower_constants(ctx, I);
5616
5617 bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
5618 va_repair_fau(&b, I);
5619 }
5620
5621 /* We need to clean up after constant lowering */
5622 if (likely(optimize)) {
5623 bi_opt_cse(ctx);
5624 bi_opt_dce(ctx, false);
5625 }
5626
5627 bi_validate(ctx, "Valhall passes");
5628 }
5629
5630 bi_foreach_block(ctx, block) {
5631 bi_lower_branch(ctx, block);
5632 }
5633
5634 if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
5635 bi_print_shader(ctx, stdout);
5636
5637 /* Analyze before register allocation to avoid false dependencies. The
5638 * skip bit is a function of only the data flow graph and is invariant
5639 * under valid scheduling. Helpers are only defined for fragment
5640 * shaders, so this analysis is only required in fragment shaders.
5641 */
5642 if (ctx->stage == MESA_SHADER_FRAGMENT) {
5643 bi_opt_dce(ctx, false);
5644 bi_analyze_helper_requirements(ctx);
5645 }
5646
5647 /* Fuse TEXC after analyzing helper requirements so the analysis
5648 * doesn't have to know about dual textures */
5649 if (likely(optimize)) {
5650 bi_opt_fuse_dual_texture(ctx);
5651 }
5652
5653 /* Lower FAU after fusing dual texture, because fusing dual texture
5654 * creates new immediates that themselves may need lowering.
5655 */
5656 if (ctx->arch <= 8) {
5657 bi_lower_fau(ctx);
5658 }
5659
5660 /* Lowering FAU can create redundant moves. Run CSE+DCE to clean up. */
5661 if (likely(optimize)) {
5662 bi_opt_cse(ctx);
5663 bi_opt_dce(ctx, false);
5664 }
5665
5666 bi_validate(ctx, "Late lowering");
5667
5668 if (likely(!(bifrost_debug & BIFROST_DBG_NOPSCHED))) {
5669 bi_pressure_schedule(ctx);
5670 bi_validate(ctx, "Pre-RA scheduling");
5671 }
5672
5673 bi_register_allocate(ctx);
5674
5675 if (likely(optimize))
5676 bi_opt_post_ra(ctx);
5677
5678 if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
5679 bi_print_shader(ctx, stdout);
5680
5681 if (ctx->arch >= 9) {
5682 va_assign_slots(ctx);
5683 va_insert_flow_control_nops(ctx);
5684 va_merge_flow(ctx);
5685 va_mark_last(ctx);
5686 } else {
5687 bi_schedule(ctx);
5688 bi_assign_scoreboard(ctx);
5689
5690 /* Analyze after scheduling since we depend on instruction
5691 * order. Valhall does this as part of va_insert_flow_control_nops,
5692 * since clauses are handled differently from individual instructions.
5693 */
5694 bi_analyze_helper_terminate(ctx);
5695 bi_mark_clauses_td(ctx);
5696 }
5697
5698 if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
5699 bi_print_shader(ctx, stdout);
5700
5701 if (ctx->arch <= 8) {
5702 bi_pack_clauses(ctx, binary, offset);
5703 } else {
5704 bi_pack_valhall(ctx, binary);
5705 }
5706
5707 if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
5708 if (ctx->arch <= 8) {
5709 disassemble_bifrost(stdout, binary->data + offset,
5710 binary->size - offset,
5711 bifrost_debug & BIFROST_DBG_VERBOSE);
5712 } else {
5713 disassemble_valhall(stdout, binary->data + offset,
5714 binary->size - offset,
5715 bifrost_debug & BIFROST_DBG_VERBOSE);
5716 }
5717
5718 fflush(stdout);
5719 }
5720
5721 if (!skip_internal &&
5722 ((bifrost_debug & BIFROST_DBG_SHADERDB) || inputs->debug)) {
5723 char *shaderdb;
5724
5725 if (ctx->arch >= 9) {
5726 shaderdb = va_print_stats(ctx, binary->size - offset);
5727 } else {
5728 shaderdb = bi_print_stats(ctx, binary->size - offset);
5729 }
5730
5731 if (bifrost_debug & BIFROST_DBG_SHADERDB)
5732 fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
5733
5734 if (inputs->debug)
5735 util_debug_message(inputs->debug, SHADER_INFO, "%s", shaderdb);
5736
5737 ralloc_free(shaderdb);
5738 }
5739
5740 return ctx;
5741 }
5742
5743 static void
5744 bi_compile_variant(nir_shader *nir,
5745 const struct panfrost_compile_inputs *inputs,
5746 struct util_dynarray *binary, struct pan_shader_info *info,
5747 enum bi_idvs_mode idvs)
5748 {
5749 struct bi_shader_info local_info = {
5750 .push = &info->push,
5751 .bifrost = &info->bifrost,
5752 .tls_size = info->tls_size,
5753 .push_offset = info->push.count,
5754 };
5755
5756 unsigned offset = binary->size;
5757
5758 /* If there is no position shader (gl_Position is not written), then
5759 * there is no need to build a varying shader either. This case is hit
5760 * for transform-feedback-only vertex shaders, which only make sense with
5761 * rasterizer discard.
5762 */
5763 if ((offset == 0) && (idvs == BI_IDVS_VARYING))
5764 return;
5765
5766 /* Software invariant: Only a secondary shader can appear at a nonzero
5767 * offset, to keep the ABI simple. */
5768 assert((offset == 0) ^ (idvs == BI_IDVS_VARYING));
5769
5770 bi_context *ctx =
5771 bi_compile_variant_nir(nir, inputs, binary, local_info, idvs);
5772
5773 /* A register is preloaded <==> it is live before the first block */
5774 bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
5775 uint64_t preload = first_block->reg_live_in;
5776
5777 /* If multisampling is used with a blend shader, the blend shader needs
5778 * to access the sample coverage mask in r60 and the sample ID in r61.
5779 * Blend shaders run in the same context as fragment shaders, so if a
5780 * blend shader could run, we need to preload these registers
5781 * conservatively. The cost of doing so is believed to be small, so we
5782 * always preload them to avoid variants of the preload descriptor.
5783 *
5784 * We only do this on Valhall, as Bifrost has to update the RSD for
5785 * multisampling w/ blend shader anyway, so this is handled in the
5786 * driver. We could unify the paths if the cost is acceptable.
5787 */
5788 if (nir->info.stage == MESA_SHADER_FRAGMENT && ctx->arch >= 9)
5789 preload |= BITFIELD64_BIT(60) | BITFIELD64_BIT(61);
5790
5791 info->ubo_mask |= ctx->ubo_mask;
5792 info->tls_size = MAX2(info->tls_size, ctx->info.tls_size);
5793
5794 if (idvs == BI_IDVS_VARYING) {
5795 info->vs.secondary_enable = (binary->size > offset);
5796 info->vs.secondary_offset = offset;
5797 info->vs.secondary_preload = preload;
5798 info->vs.secondary_work_reg_count = ctx->info.work_reg_count;
5799 } else {
5800 info->preload = preload;
5801 info->work_reg_count = ctx->info.work_reg_count;
5802 }
5803
5804 if (idvs == BI_IDVS_POSITION && !nir->info.internal &&
5805 nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ)) {
5806 /* Find the psiz write */
5807 bi_instr *write = NULL;
5808
5809 bi_foreach_instr_global(ctx, I) {
5810 if (I->op == BI_OPCODE_STORE_I16 && I->seg == BI_SEG_POS) {
5811 write = I;
5812 break;
5813 }
5814 }
5815
5816 assert(write != NULL);
5817
5818 /* NOP it out, preserving its flow control. TODO: maybe DCE */
5819 if (write->flow) {
5820 bi_builder b = bi_init_builder(ctx, bi_before_instr(write));
5821 bi_instr *nop = bi_nop(&b);
5822 nop->flow = write->flow;
5823 }
5824
5825 bi_remove_instruction(write);
5826
5827 info->vs.no_psiz_offset = binary->size;
5828 bi_pack_valhall(ctx, binary);
5829 }
5830
5831 ralloc_free(ctx);
5832 }
5833
5834 /* Decide if Index-Driven Vertex Shading should be used for a given shader */
5835 static bool
5836 bi_should_idvs(nir_shader *nir, const struct panfrost_compile_inputs *inputs)
5837 {
5838 /* Opt-out */
5839 if (inputs->no_idvs || bifrost_debug & BIFROST_DBG_NOIDVS)
5840 return false;
5841
5842 /* IDVS splits up vertex shaders; it is not defined for other shader stages */
5843 if (nir->info.stage != MESA_SHADER_VERTEX)
5844 return false;
5845
5846 /* Bifrost cannot write gl_PointSize during IDVS */
5847 if ((inputs->gpu_id < 0x9000) &&
5848 nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ))
5849 return false;
5850
5851 /* Otherwise, IDVS is usually better */
5852 return true;
5853 }
5854
5855 void
5856 bifrost_compile_shader_nir(nir_shader *nir,
5857 const struct panfrost_compile_inputs *inputs,
5858 struct util_dynarray *binary,
5859 struct pan_shader_info *info)
5860 {
5861 bifrost_debug = debug_get_option_bifrost_debug();
5862
5863 /* Combine stores late, to give the driver a chance to lower dual-source
5864 * blending as regular store_output intrinsics.
5865 */
5866 NIR_PASS(_, nir, pan_nir_lower_zs_store);
5867
5868 bi_optimize_nir(nir, inputs->gpu_id, inputs->is_blend);
5869
5870 info->tls_size = nir->scratch_size;
5871 info->vs.idvs = bi_should_idvs(nir, inputs);
5872
5873 pan_nir_collect_varyings(nir, info);
5874
5875 if (info->vs.idvs) {
5876 bi_compile_variant(nir, inputs, binary, info, BI_IDVS_POSITION);
5877 bi_compile_variant(nir, inputs, binary, info, BI_IDVS_VARYING);
5878 } else {
5879 bi_compile_variant(nir, inputs, binary, info, BI_IDVS_NONE);
5880 }
5881
5882 if (gl_shader_stage_is_compute(nir->info.stage)) {
5883 /* Workgroups may be merged if the structure of the workgroup is
5884 * not software visible. This is true if neither shared memory
5885 * nor barriers are used. The hardware may be able to optimize
5886 * compute shaders that set this flag.
5887 */
5888 info->cs.allow_merging_workgroups = (nir->info.shared_size == 0) &&
5889 !nir->info.uses_control_barrier &&
5890 !nir->info.uses_memory_barrier;
5891 }
5892
5893 info->ubo_mask &= (1 << nir->info.num_ubos) - 1;
5894 }
5895