/* * Copyright (C) 2020 Collabora Ltd. * Copyright (C) 2022 Alyssa Rosenzweig * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * Authors (Collabora): * Alyssa Rosenzweig */ #include "compiler/glsl/glsl_to_nir.h" #include "compiler/nir_types.h" #include "compiler/nir/nir_builder.h" #include "compiler/nir/nir_schedule.h" #include "util/u_debug.h" #include "disassemble.h" #include "valhall/va_compiler.h" #include "valhall/disassemble.h" #include "bifrost_compile.h" #include "compiler.h" #include "valhall/va_compiler.h" #include "bi_quirks.h" #include "bi_builder.h" #include "bifrost_nir.h" static const struct debug_named_value bifrost_debug_options[] = { {"msgs", BIFROST_DBG_MSGS, "Print debug messages"}, {"shaders", BIFROST_DBG_SHADERS, "Dump shaders in NIR and MIR"}, {"shaderdb", BIFROST_DBG_SHADERDB, "Print statistics"}, {"verbose", BIFROST_DBG_VERBOSE, "Disassemble verbosely"}, {"internal", BIFROST_DBG_INTERNAL, "Dump even internal shaders"}, {"nosched", BIFROST_DBG_NOSCHED, "Force trivial bundling"}, {"nopsched", BIFROST_DBG_NOPSCHED, "Disable scheduling for pressure"}, {"inorder", BIFROST_DBG_INORDER, "Force in-order bundling"}, {"novalidate",BIFROST_DBG_NOVALIDATE, "Skip IR validation"}, {"noopt", BIFROST_DBG_NOOPT, "Skip optimization passes"}, {"noidvs", BIFROST_DBG_NOIDVS, "Disable IDVS"}, {"nosb", BIFROST_DBG_NOSB, "Disable scoreboarding"}, {"nopreload", BIFROST_DBG_NOPRELOAD, "Disable message preloading"}, {"spill", BIFROST_DBG_SPILL, "Test register spilling"}, DEBUG_NAMED_VALUE_END }; DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG", bifrost_debug_options, 0) /* How many bytes are prefetched by the Bifrost shader core. From the final * clause of the shader, this range must be valid instructions or zero. */ #define BIFROST_SHADER_PREFETCH 128 int bifrost_debug = 0; #define DBG(fmt, ...) \ do { if (bifrost_debug & BIFROST_DBG_MSGS) \ fprintf(stderr, "%s:%d: "fmt, \ __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0) static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list); static bi_index bi_preload(bi_builder *b, unsigned reg) { if (bi_is_null(b->shader->preloaded[reg])) { /* Insert at the beginning of the shader */ bi_builder b_ = *b; b_.cursor = bi_before_block(bi_start_block(&b->shader->blocks)); /* Cache the result */ b->shader->preloaded[reg] = bi_mov_i32(&b_, bi_register(reg)); } return b->shader->preloaded[reg]; } static bi_index bi_coverage(bi_builder *b) { if (bi_is_null(b->shader->coverage)) b->shader->coverage = bi_preload(b, 60); return b->shader->coverage; } /* * Vertex ID and Instance ID are preloaded registers. Where they are preloaded * changed from Bifrost to Valhall. Provide helpers that smooth over the * architectural difference. */ static inline bi_index bi_vertex_id(bi_builder *b) { return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61); } static inline bi_index bi_instance_id(bi_builder *b) { return bi_preload(b, (b->shader->arch >= 9) ? 61 : 62); } static void bi_emit_jump(bi_builder *b, nir_jump_instr *instr) { bi_instr *branch = bi_jump(b, bi_zero()); switch (instr->type) { case nir_jump_break: branch->branch_target = b->shader->break_block; break; case nir_jump_continue: branch->branch_target = b->shader->continue_block; break; default: unreachable("Unhandled jump type"); } bi_block_add_successor(b->shader->current_block, branch->branch_target); b->shader->current_block->unconditional_jumps = true; } /* Builds a 64-bit hash table key for an index */ static uint64_t bi_index_to_key(bi_index idx) { static_assert(sizeof(idx) <= sizeof(uint64_t), "too much padding"); uint64_t key = 0; memcpy(&key, &idx, sizeof(idx)); return key; } /* * Extract a single channel out of a vector source. We split vectors with SPLIT * so we can use the split components directly, without emitting an extract. * This has advantages of RA, as the split can usually be optimized away. */ static bi_index bi_extract(bi_builder *b, bi_index vec, unsigned channel) { /* Extract caching relies on SSA form. It is incorrect for nir_register. * Bypass the cache and emit an explicit split for registers. */ if (vec.reg) { bi_instr *I = bi_split_i32_to(b, bi_null(), vec); I->nr_dests = channel + 1; I->dest[channel] = bi_temp(b->shader); return I->dest[channel]; } bi_index *components = _mesa_hash_table_u64_search(b->shader->allocated_vec, bi_index_to_key(vec)); /* No extract needed for scalars. * * This is a bit imprecise, but actual bugs (missing splits for vectors) * should be caught by the following assertion. It is too difficult to * ensure bi_extract is only called for real vectors. */ if (components == NULL && channel == 0) return vec; assert(components != NULL && "missing bi_cache_collect()"); return components[channel]; } static void bi_cache_collect(bi_builder *b, bi_index dst, bi_index *s, unsigned n) { /* Lifetime of a hash table entry has to be at least as long as the table */ bi_index *channels = ralloc_array(b->shader, bi_index, n); memcpy(channels, s, sizeof(bi_index) * n); _mesa_hash_table_u64_insert(b->shader->allocated_vec, bi_index_to_key(dst), channels); } /* * Splits an n-component vector (vec) into n scalar destinations (dests) using a * split pseudo-instruction. * * Pre-condition: dests is filled with bi_null(). */ static void bi_emit_split_i32(bi_builder *b, bi_index dests[4], bi_index vec, unsigned n) { /* Setup the destinations */ for (unsigned i = 0; i < n; ++i) { dests[i] = bi_temp(b->shader); } /* Emit the split */ if (n == 1) { bi_mov_i32_to(b, dests[0], vec); } else { bi_instr *I = bi_split_i32_to(b, dests[0], vec); I->nr_dests = n; for (unsigned j = 1; j < n; ++j) I->dest[j] = dests[j]; } } static void bi_emit_cached_split_i32(bi_builder *b, bi_index vec, unsigned n) { bi_index dests[4] = { bi_null(), bi_null(), bi_null(), bi_null() }; bi_emit_split_i32(b, dests, vec, n); bi_cache_collect(b, vec, dests, n); } /* * Emit and cache a split for a vector of a given bitsize. The vector may not be * composed of 32-bit words, but it will be split at 32-bit word boundaries. */ static void bi_emit_cached_split(bi_builder *b, bi_index vec, unsigned bits) { bi_emit_cached_split_i32(b, vec, DIV_ROUND_UP(bits, 32)); } static void bi_split_dest(bi_builder *b, nir_dest dest) { bi_emit_cached_split(b, bi_dest_index(&dest), nir_dest_bit_size(dest) * nir_dest_num_components(dest)); } static bi_instr * bi_emit_collect_to(bi_builder *b, bi_index dst, bi_index *chan, unsigned n) { /* Special case: COLLECT of a single value is a scalar move */ if (n == 1) return bi_mov_i32_to(b, dst, chan[0]); bi_instr *I = bi_collect_i32_to(b, dst); I->nr_srcs = n; for (unsigned i = 0; i < n; ++i) I->src[i] = chan[i]; bi_cache_collect(b, dst, chan, n); return I; } static bi_instr * bi_collect_v2i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1) { return bi_emit_collect_to(b, dst, (bi_index[]) { s0, s1 }, 2); } static bi_instr * bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1, bi_index s2) { return bi_emit_collect_to(b, dst, (bi_index[]) { s0, s1, s2 }, 3); } static bi_index bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1) { bi_index dst = bi_temp(b->shader); bi_collect_v2i32_to(b, dst, s0, s1); return dst; } static bi_index bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr) { switch (intr->intrinsic) { case nir_intrinsic_load_barycentric_centroid: case nir_intrinsic_load_barycentric_sample: return bi_preload(b, 61); /* Need to put the sample ID in the top 16-bits */ case nir_intrinsic_load_barycentric_at_sample: return bi_mkvec_v2i16(b, bi_half(bi_dontcare(b), false), bi_half(bi_src_index(&intr->src[0]), false)); /* Interpret as 8:8 signed fixed point positions in pixels along X and * Y axes respectively, relative to top-left of pixel. In NIR, (0, 0) * is the center of the pixel so we first fixup and then convert. For * fp16 input: * * f2i16(((x, y) + (0.5, 0.5)) * 2**8) = * f2i16((256 * (x, y)) + (128, 128)) = * V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128)) * * For fp32 input, that lacks enough precision for MSAA 16x, but the * idea is the same. FIXME: still doesn't pass */ case nir_intrinsic_load_barycentric_at_offset: { bi_index offset = bi_src_index(&intr->src[0]); bi_index f16 = bi_null(); unsigned sz = nir_src_bit_size(intr->src[0]); if (sz == 16) { f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0), bi_imm_f16(128.0)); } else { assert(sz == 32); bi_index f[2]; for (unsigned i = 0; i < 2; ++i) { f[i] = bi_fadd_rscale_f32(b, bi_extract(b, offset, i), bi_imm_f32(0.5), bi_imm_u32(8), BI_SPECIAL_NONE); } f16 = bi_v2f32_to_v2f16(b, f[0], f[1]); } return bi_v2f16_to_v2s16(b, f16); } case nir_intrinsic_load_barycentric_pixel: default: return b->shader->arch >= 9 ? bi_preload(b, 61) : bi_dontcare(b); } } static enum bi_sample bi_interp_for_intrinsic(nir_intrinsic_op op) { switch (op) { case nir_intrinsic_load_barycentric_centroid: return BI_SAMPLE_CENTROID; case nir_intrinsic_load_barycentric_sample: case nir_intrinsic_load_barycentric_at_sample: return BI_SAMPLE_SAMPLE; case nir_intrinsic_load_barycentric_at_offset: return BI_SAMPLE_EXPLICIT; case nir_intrinsic_load_barycentric_pixel: default: return BI_SAMPLE_CENTER; } } /* auto, 64-bit omitted */ static enum bi_register_format bi_reg_fmt_for_nir(nir_alu_type T) { switch (T) { case nir_type_float16: return BI_REGISTER_FORMAT_F16; case nir_type_float32: return BI_REGISTER_FORMAT_F32; case nir_type_int16: return BI_REGISTER_FORMAT_S16; case nir_type_uint16: return BI_REGISTER_FORMAT_U16; case nir_type_int32: return BI_REGISTER_FORMAT_S32; case nir_type_uint32: return BI_REGISTER_FORMAT_U32; default: unreachable("Invalid type for register format"); } } /* Checks if the _IMM variant of an intrinsic can be used, returning in imm the * immediate to be used (which applies even if _IMM can't be used) */ static bool bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate, unsigned max) { nir_src *offset = nir_get_io_offset_src(instr); if (!nir_src_is_const(*offset)) return false; *immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset); return (*immediate) < max; } static void bi_make_vec_to(bi_builder *b, bi_index final_dst, bi_index *src, unsigned *channel, unsigned count, unsigned bitsize); /* Bifrost's load instructions lack a component offset despite operating in * terms of vec4 slots. Usually I/O vectorization avoids nonzero components, * but they may be unavoidable with separate shaders in use. To solve this, we * lower to a larger load and an explicit copy of the desired components. */ static void bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp) { unsigned component = nir_intrinsic_component(instr); unsigned nr = instr->num_components; unsigned total = nr + component; unsigned bitsize = nir_dest_bit_size(instr->dest); assert(total <= 4 && "should be vec4"); bi_emit_cached_split(b, tmp, total * bitsize); if (component == 0) return; bi_index srcs[] = { tmp, tmp, tmp }; unsigned channels[] = { component, component + 1, component + 2 }; bi_make_vec_to(b, bi_dest_index(&instr->dest), srcs, channels, nr, nir_dest_bit_size(instr->dest)); } static void bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr) { nir_alu_type T = nir_intrinsic_dest_type(instr); enum bi_register_format regfmt = bi_reg_fmt_for_nir(T); nir_src *offset = nir_get_io_offset_src(instr); unsigned component = nir_intrinsic_component(instr); enum bi_vecsize vecsize = (instr->num_components + component - 1); unsigned imm_index = 0; unsigned base = nir_intrinsic_base(instr); bool constant = nir_src_is_const(*offset); bool immediate = bi_is_intr_immediate(instr, &imm_index, 16); bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader); bi_instr *I; if (immediate) { I = bi_ld_attr_imm_to(b, dest, bi_vertex_id(b), bi_instance_id(b), regfmt, vecsize, imm_index); } else { bi_index idx = bi_src_index(&instr->src[0]); if (constant) idx = bi_imm_u32(imm_index); else if (base != 0) idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false); I = bi_ld_attr_to(b, dest, bi_vertex_id(b), bi_instance_id(b), idx, regfmt, vecsize); } if (b->shader->arch >= 9) I->table = PAN_TABLE_ATTRIBUTE; bi_copy_component(b, instr, dest); } /* * ABI: Special (desktop GL) slots come first, tightly packed. General varyings * come later, sparsely packed. This handles both linked and separable shaders * with a common code path, with minimal keying only for desktop GL. Each slot * consumes 16 bytes (TODO: fp16, partial vectors). */ static unsigned bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr) { nir_io_semantics sem = nir_intrinsic_io_semantics(intr); uint32_t mask = ctx->inputs->fixed_varying_mask; if (sem.location >= VARYING_SLOT_VAR0) { unsigned nr_special = util_bitcount(mask); unsigned general_index = (sem.location - VARYING_SLOT_VAR0); return 16 * (nr_special + general_index); } else { return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location))); } } /* * Compute the offset in bytes of a varying with an immediate offset, adding the * offset to the base computed above. Convenience method. */ static unsigned bi_varying_offset(bi_context *ctx, nir_intrinsic_instr *intr) { nir_src *src = nir_get_io_offset_src(intr); assert(nir_src_is_const(*src) && "assumes immediate offset"); return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16); } static void bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr) { enum bi_sample sample = BI_SAMPLE_CENTER; enum bi_update update = BI_UPDATE_STORE; enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO; bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input; bi_index src0 = bi_null(); unsigned component = nir_intrinsic_component(instr); enum bi_vecsize vecsize = (instr->num_components + component - 1); bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader); unsigned sz = nir_dest_bit_size(instr->dest); if (smooth) { nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]); assert(parent); sample = bi_interp_for_intrinsic(parent->intrinsic); src0 = bi_varying_src0_for_barycentric(b, parent); assert(sz == 16 || sz == 32); regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16 : BI_REGISTER_FORMAT_F32; } else { assert(sz == 32); regfmt = BI_REGISTER_FORMAT_U32; /* Valhall can't have bi_null() here, although the source is * logically unused for flat varyings */ if (b->shader->arch >= 9) src0 = bi_preload(b, 61); /* Gather info as we go */ b->shader->info.bifrost->uses_flat_shading = true; } enum bi_source_format source_format = smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32; nir_src *offset = nir_get_io_offset_src(instr); unsigned imm_index = 0; bool immediate = bi_is_intr_immediate(instr, &imm_index, 20); bi_instr *I = NULL; if (b->shader->malloc_idvs && immediate) { /* Immediate index given in bytes. */ bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format, update, vecsize, bi_varying_offset(b->shader, instr)); } else if (immediate && smooth) { I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, vecsize, imm_index); } else if (immediate && !smooth) { I = bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, vecsize, imm_index); } else { bi_index idx = bi_src_index(offset); unsigned base = nir_intrinsic_base(instr); if (b->shader->malloc_idvs) { /* Index needs to be in bytes, but NIR gives the index * in slots. For now assume 16 bytes per element. */ bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4)); unsigned vbase = bi_varying_base_bytes(b->shader, instr); if (vbase != 0) idx_bytes = bi_iadd_u32(b, idx, bi_imm_u32(vbase), false); bi_ld_var_buf_to(b, sz, dest, src0, idx_bytes, regfmt, sample, source_format, update, vecsize); } else if (smooth) { if (base != 0) idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false); I = bi_ld_var_to(b, dest, src0, idx, regfmt, sample, update, vecsize); } else { if (base != 0) idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false); I = bi_ld_var_flat_to(b, dest, idx, BI_FUNCTION_NONE, regfmt, vecsize); } } /* Valhall usually uses machine-allocated IDVS. If this is disabled, use * a simple Midgard-style ABI. */ if (b->shader->arch >= 9 && I != NULL) I->table = PAN_TABLE_ATTRIBUTE; bi_copy_component(b, instr, dest); } static void bi_make_vec16_to(bi_builder *b, bi_index dst, bi_index *src, unsigned *channel, unsigned count) { bi_index srcs[BI_MAX_VEC]; for (unsigned i = 0; i < count; i += 2) { bool next = (i + 1) < count; unsigned chan = channel ? channel[i] : 0; unsigned nextc = next && channel ? channel[i + 1] : 0; bi_index w0 = bi_extract(b, src[i], chan >> 1); bi_index w1 = next ? bi_extract(b, src[i + 1], nextc >> 1) : bi_zero(); bi_index h0 = bi_half(w0, chan & 1); bi_index h1 = bi_half(w1, nextc & 1); if (bi_is_word_equiv(w0, w1) && (chan & 1) == 0 && ((nextc & 1) == 1)) srcs[i >> 1] = bi_mov_i32(b, w0); else if (bi_is_word_equiv(w0, w1)) srcs[i >> 1] = bi_swz_v2i16(b, bi_swz_16(w0, chan & 1, nextc & 1)); else srcs[i >> 1] = bi_mkvec_v2i16(b, h0, h1); } bi_emit_collect_to(b, dst, srcs, DIV_ROUND_UP(count, 2)); } static void bi_make_vec_to(bi_builder *b, bi_index dst, bi_index *src, unsigned *channel, unsigned count, unsigned bitsize) { if (bitsize == 32) { bi_index srcs[BI_MAX_VEC]; for (unsigned i = 0; i < count; ++i) srcs[i] = bi_extract(b, src[i], channel ? channel[i] : 0); bi_emit_collect_to(b, dst, srcs, count); } else if (bitsize == 16) { bi_make_vec16_to(b, dst, src, channel, count); } else if (bitsize == 8 && count == 1) { bi_swz_v4i8_to(b, dst, bi_byte( bi_extract(b, src[0], channel[0] >> 2), channel[0] & 3)); } else { unreachable("8-bit mkvec not yet supported"); } } static inline bi_instr * bi_load_ubo_to(bi_builder *b, unsigned bitsize, bi_index dest0, bi_index src0, bi_index src1) { bi_instr *I; if (b->shader->arch >= 9) { I = bi_ld_buffer_to(b, bitsize, dest0, src0, src1); I->seg = BI_SEG_UBO; } else { I = bi_load_to(b, bitsize, dest0, src0, src1, BI_SEG_UBO, 0); } bi_emit_cached_split(b, dest0, bitsize); return I; } static bi_instr * bi_load_sysval_to(bi_builder *b, bi_index dest, int sysval, unsigned nr_components, unsigned offset) { unsigned sysval_ubo = b->shader->inputs->fixed_sysval_ubo >= 0 ? b->shader->inputs->fixed_sysval_ubo : b->shader->nir->info.num_ubos; unsigned uniform = pan_lookup_sysval(b->shader->sysval_to_id, b->shader->info.sysvals, sysval); unsigned idx = (uniform * 16) + offset; return bi_load_ubo_to(b, nr_components * 32, dest, bi_imm_u32(idx), bi_imm_u32(sysval_ubo)); } static void bi_load_sysval_nir(bi_builder *b, nir_intrinsic_instr *intr, unsigned nr_components, unsigned offset) { bi_load_sysval_to(b, bi_dest_index(&intr->dest), panfrost_sysval_for_instr(&intr->instr, NULL), nr_components, offset); } static bi_index bi_load_sysval(bi_builder *b, int sysval, unsigned nr_components, unsigned offset) { bi_index tmp = bi_temp(b->shader); bi_load_sysval_to(b, tmp, sysval, nr_components, offset); return tmp; } static void bi_load_sample_id_to(bi_builder *b, bi_index dst) { /* r61[16:23] contains the sampleID, mask it out. Upper bits * seem to read garbage (despite being architecturally defined * as zero), so use a 5-bit mask instead of 8-bits */ bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f), bi_imm_u8(16), false); } static bi_index bi_load_sample_id(bi_builder *b) { bi_index sample_id = bi_temp(b->shader); bi_load_sample_id_to(b, sample_id); return sample_id; } static bi_index bi_pixel_indices(bi_builder *b, unsigned rt) { /* We want to load the current pixel. */ struct bifrost_pixel_indices pix = { .y = BIFROST_CURRENT_PIXEL, .rt = rt }; uint32_t indices_u32 = 0; memcpy(&indices_u32, &pix, sizeof(indices_u32)); bi_index indices = bi_imm_u32(indices_u32); /* Sample index above is left as zero. For multisampling, we need to * fill in the actual sample ID in the lower byte */ if (b->shader->inputs->blend.nr_samples > 1) indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false); return indices; } /* Source color is passed through r0-r3, or r4-r7 for the second source when * dual-source blending. Preload the corresponding vector. */ static void bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr) { nir_io_semantics sem = nir_intrinsic_io_semantics(instr); unsigned base = (sem.location == VARYING_SLOT_VAR0) ? 4 : 0; unsigned size = nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr)); assert(size == 16 || size == 32); bi_index srcs[] = { bi_preload(b, base + 0), bi_preload(b, base + 1), bi_preload(b, base + 2), bi_preload(b, base + 3) }; bi_emit_collect_to(b, bi_dest_index(&instr->dest), srcs, size == 32 ? 4 : 2); } static void bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, bi_index rgba2, nir_alu_type T2, unsigned rt) { /* On Valhall, BLEND does not encode the return address */ bool bifrost = b->shader->arch <= 8; /* Reads 2 or 4 staging registers to cover the input */ unsigned size = nir_alu_type_get_type_size(T); unsigned size_2 = nir_alu_type_get_type_size(T2); unsigned sr_count = (size <= 16) ? 2 : 4; unsigned sr_count_2 = (size_2 <= 16) ? 2 : 4; const struct panfrost_compile_inputs *inputs = b->shader->inputs; uint64_t blend_desc = inputs->blend.bifrost_blend_desc; enum bi_register_format regfmt = bi_reg_fmt_for_nir(T); if (inputs->is_blend && inputs->blend.nr_samples > 1) { /* Conversion descriptor comes from the compile inputs, pixel * indices derived at run time based on sample ID */ bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_coverage(b), bi_imm_u32(blend_desc >> 32), regfmt, BI_VECSIZE_V4); } else if (b->shader->inputs->is_blend) { uint64_t blend_desc = b->shader->inputs->blend.bifrost_blend_desc; /* Blend descriptor comes from the compile inputs */ /* Put the result in r0 */ bi_blend_to(b, bifrost ? bi_temp(b->shader) : bi_null(), rgba, bi_coverage(b), bi_imm_u32(blend_desc), bi_imm_u32(blend_desc >> 32), bi_null(), regfmt, sr_count, 0); } else { /* Blend descriptor comes from the FAU RAM. By convention, the * return address on Bifrost is stored in r48 and will be used * by the blend shader to jump back to the fragment shader */ bi_blend_to(b, bifrost ? bi_temp(b->shader) : bi_null(), rgba, bi_coverage(b), bi_fau(BIR_FAU_BLEND_0 + rt, false), bi_fau(BIR_FAU_BLEND_0 + rt, true), rgba2, regfmt, sr_count, sr_count_2); } assert(rt < 8); b->shader->info.bifrost->blend[rt].type = T; if (T2) b->shader->info.bifrost->blend_src1_type = T2; } /* Blend shaders do not need to run ATEST since they are dependent on a * fragment shader that runs it. Blit shaders may not need to run ATEST, since * ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and * there are no writes to the coverage mask. The latter two are satisfied for * all blit shaders, so we just care about early-z, which blit shaders force * iff they do not write depth or stencil */ static bool bi_skip_atest(bi_context *ctx, bool emit_zs) { return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend; } static void bi_emit_atest(bi_builder *b, bi_index alpha) { bi_instr *atest = bi_atest_to(b, bi_temp(b->shader), bi_coverage(b), alpha); b->shader->emitted_atest = true; b->shader->coverage = atest->dest[0]; /* Pseudo-source to encode in the tuple */ atest->src[2] = bi_fau(BIR_FAU_ATEST_PARAM, false); } static void bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr) { bool combined = instr->intrinsic == nir_intrinsic_store_combined_output_pan; unsigned writeout = combined ? nir_intrinsic_component(instr) : PAN_WRITEOUT_C; bool emit_blend = writeout & (PAN_WRITEOUT_C); bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S); const nir_variable *var = nir_find_variable_with_driver_location(b->shader->nir, nir_var_shader_out, nir_intrinsic_base(instr)); unsigned loc = var ? var->data.location : 0; bi_index src0 = bi_src_index(&instr->src[0]); /* By ISA convention, the coverage mask is stored in R60. The store * itself will be handled by a subsequent ATEST instruction */ if (loc == FRAG_RESULT_SAMPLE_MASK) { bi_index orig = bi_coverage(b); bi_index msaa = bi_load_sysval(b, PAN_SYSVAL_MULTISAMPLED, 1, 0); bi_index new = bi_lshift_and_i32(b, orig, bi_extract(b, src0, 0), bi_imm_u8(0)); b->shader->coverage = bi_mux_i32(b, orig, new, msaa, BI_MUX_INT_ZERO); return; } /* Emit ATEST if we have to, note ATEST requires a floating-point alpha * value, but render target #0 might not be floating point. However the * alpha value is only used for alpha-to-coverage, a stage which is * skipped for pure integer framebuffers, so the issue is moot. */ if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) { nir_alu_type T = nir_intrinsic_src_type(instr); bi_index rgba = bi_src_index(&instr->src[0]); bi_index alpha = (T == nir_type_float16) ? bi_half(bi_extract(b, rgba, 1), true) : (T == nir_type_float32) ? bi_extract(b, rgba, 3) : bi_dontcare(b); /* Don't read out-of-bounds */ if (nir_src_num_components(instr->src[0]) < 4) alpha = bi_imm_f32(1.0); bi_emit_atest(b, alpha); } if (emit_zs) { bi_index z = bi_dontcare(b), s = bi_dontcare(b); if (writeout & PAN_WRITEOUT_Z) z = bi_src_index(&instr->src[2]); if (writeout & PAN_WRITEOUT_S) s = bi_src_index(&instr->src[3]); b->shader->coverage = bi_zs_emit(b, z, s, bi_coverage(b), writeout & PAN_WRITEOUT_S, writeout & PAN_WRITEOUT_Z); } if (emit_blend) { unsigned rt = loc ? (loc - FRAG_RESULT_DATA0) : 0; bool dual = (writeout & PAN_WRITEOUT_2); bi_index color = bi_src_index(&instr->src[0]); bi_index color2 = dual ? bi_src_index(&instr->src[4]) : bi_null(); nir_alu_type T2 = dual ? nir_intrinsic_dest_type(instr) : 0; /* Explicit copy since BLEND inputs are precoloured to R0-R3, * TODO: maybe schedule around this or implement in RA as a * spill */ bool has_mrt = false; nir_foreach_shader_out_variable(var, b->shader->nir) has_mrt |= (var->data.location > FRAG_RESULT_DATA0); if (has_mrt) { bi_index srcs[4] = { color, color, color, color }; unsigned channels[4] = { 0, 1, 2, 3 }; color = bi_temp(b->shader); bi_make_vec_to(b, color, srcs, channels, nir_src_num_components(instr->src[0]), nir_alu_type_get_type_size(nir_intrinsic_src_type(instr))); } bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr), color2, T2, rt); } if (b->shader->inputs->is_blend) { /* Jump back to the fragment shader, return address is stored * in r48 (see above). On Valhall, only jump if the address is * nonzero. The check is free there and it implements the "jump * to 0 terminates the blend shader" that's automatic on * Bifrost. */ if (b->shader->arch >= 8) bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE); else bi_jump(b, bi_preload(b, 48)); } } /** * In a vertex shader, is the specified variable a position output? These kinds * of outputs are written from position shaders when IDVS is enabled. All other * outputs are written from the varying shader. */ static bool bi_should_remove_store(nir_intrinsic_instr *intr, enum bi_idvs_mode idvs) { nir_io_semantics sem = nir_intrinsic_io_semantics(intr); switch (sem.location) { case VARYING_SLOT_POS: case VARYING_SLOT_PSIZ: return idvs == BI_IDVS_VARYING; default: return idvs == BI_IDVS_POSITION; } } static bool bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data) { enum bi_idvs_mode *idvs = data; if (instr->type != nir_instr_type_intrinsic) return false; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); if (intr->intrinsic != nir_intrinsic_store_output) return false; if (bi_should_remove_store(intr, *idvs)) { nir_instr_remove(instr); return true; } return false; } static void bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr) { /* In principle we can do better for 16-bit. At the moment we require * 32-bit to permit the use of .auto, in order to force .u32 for flat * varyings, to handle internal TGSI shaders that set flat in the VS * but smooth in the FS */ ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr); ASSERTED unsigned T_size = nir_alu_type_get_type_size(T); assert(T_size == 32 || (b->shader->arch >= 9 && T_size == 16)); enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO; unsigned imm_index = 0; bool immediate = bi_is_intr_immediate(instr, &imm_index, 16); /* Only look at the total components needed. In effect, we fill in all * the intermediate "holes" in the write mask, since we can't mask off * stores. Since nir_lower_io_to_temporaries ensures each varying is * written at most once, anything that's masked out is undefined, so it * doesn't matter what we write there. So we may as well do the * simplest thing possible. */ unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr)); assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0)); bi_index data = bi_src_index(&instr->src[0]); /* To keep the vector dimensions consistent, we need to drop some * components. This should be coalesced. * * TODO: This is ugly and maybe inefficient. Would we rather * introduce a TRIM.i32 pseudoinstruction? */ if (nr < nir_intrinsic_src_components(instr, 0)) { assert(T_size == 32 && "todo: 16-bit trim"); bi_instr *split = bi_split_i32_to(b, bi_null(), data); split->nr_dests = nir_intrinsic_src_components(instr, 0); bi_index tmp = bi_temp(b->shader); bi_instr *collect = bi_collect_i32_to(b, tmp); collect->nr_srcs = nr; for (unsigned w = 0; w < nr; ++w) { split->dest[w] = bi_temp(b->shader); collect->src[w] = split->dest[w]; } data = tmp; } bool psiz = (nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PSIZ); bi_index a[4] = { bi_null() }; if (b->shader->arch <= 8 && b->shader->idvs == BI_IDVS_POSITION) { /* Bifrost position shaders have a fast path */ assert(T == nir_type_float16 || T == nir_type_float32); unsigned regfmt = (T == nir_type_float16) ? 0 : 1; unsigned identity = (b->shader->arch == 6) ? 0x688 : 0; unsigned snap4 = 0x5E; uint32_t format = identity | (snap4 << 12) | (regfmt << 24); bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59), bi_imm_u32(format), regfmt, nr - 1); } else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) { bi_index index = bi_preload(b, 59); if (psiz) { assert(T_size == 16 && "should've been lowered"); index = bi_iadd_imm_i32(b, index, 4); } bi_index address = bi_lea_buf_imm(b, index); bi_emit_split_i32(b, a, address, 2); bool varying = (b->shader->idvs == BI_IDVS_VARYING); bi_store(b, nr * nir_src_bit_size(instr->src[0]), data, a[0], a[1], varying ? BI_SEG_VARY : BI_SEG_POS, varying ? bi_varying_offset(b->shader, instr) : 0); } else if (immediate) { bi_index address = bi_lea_attr_imm(b, bi_vertex_id(b), bi_instance_id(b), regfmt, imm_index); bi_emit_split_i32(b, a, address, 3); bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1); } else { bi_index idx = bi_iadd_u32(b, bi_src_index(nir_get_io_offset_src(instr)), bi_imm_u32(nir_intrinsic_base(instr)), false); bi_index address = bi_lea_attr(b, bi_vertex_id(b), bi_instance_id(b), idx, regfmt); bi_emit_split_i32(b, a, address, 3); bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1); } } static void bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr) { nir_src *offset = nir_get_io_offset_src(instr); bool offset_is_const = nir_src_is_const(*offset); bi_index dyn_offset = bi_src_index(offset); uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0; bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input); bi_load_ubo_to(b, instr->num_components * nir_dest_bit_size(instr->dest), bi_dest_index(&instr->dest), offset_is_const ? bi_imm_u32(const_offset) : dyn_offset, kernel_input ? bi_zero() : bi_src_index(&instr->src[0])); } static void bi_emit_load_push_constant(bi_builder *b, nir_intrinsic_instr *instr) { assert(b->shader->inputs->no_ubo_to_push && "can't mix push constant forms"); nir_src *offset = &instr->src[0]; assert(nir_src_is_const(*offset) && "no indirect push constants"); uint32_t base = nir_intrinsic_base(instr) + nir_src_as_uint(*offset); assert((base & 3) == 0 && "unaligned push constants"); unsigned bits = nir_dest_bit_size(instr->dest) * nir_dest_num_components(instr->dest); unsigned n = DIV_ROUND_UP(bits, 32); assert(n <= 4); bi_index channels[4] = { bi_null() }; for (unsigned i = 0; i < n; ++i) { unsigned word = (base >> 2) + i; channels[i] = bi_fau(BIR_FAU_UNIFORM | (word >> 1), word & 1); } bi_emit_collect_to(b, bi_dest_index(&instr->dest), channels, n); } static bi_index bi_addr_high(bi_builder *b, nir_src *src) { return (nir_src_bit_size(*src) == 64) ? bi_extract(b, bi_src_index(src), 1) : bi_zero(); } static void bi_handle_segment(bi_builder *b, bi_index *addr_lo, bi_index *addr_hi, enum bi_seg seg, int16_t *offset) { /* Not needed on Bifrost or for global accesses */ if (b->shader->arch < 9 || seg == BI_SEG_NONE) return; /* There is no segment modifier on Valhall. Instead, we need to * emit the arithmetic ourselves. We do have an offset * available, which saves an instruction for constant offsets. */ bool wls = (seg == BI_SEG_WLS); assert(wls || (seg == BI_SEG_TL)); enum bir_fau fau = wls ? BIR_FAU_WLS_PTR : BIR_FAU_TLS_PTR; bi_index base_lo = bi_fau(fau, false); if (offset && addr_lo->type == BI_INDEX_CONSTANT && addr_lo->value == (int16_t) addr_lo->value) { *offset = addr_lo->value; *addr_lo = base_lo; } else { *addr_lo = bi_iadd_u32(b, base_lo, *addr_lo, false); } /* Do not allow overflow for WLS or TLS */ *addr_hi = bi_fau(fau, true); } static void bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg) { int16_t offset = 0; unsigned bits = instr->num_components * nir_dest_bit_size(instr->dest); bi_index dest = bi_dest_index(&instr->dest); bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[0]), 0); bi_index addr_hi = bi_addr_high(b, &instr->src[0]); bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset); bi_load_to(b, bits, dest, addr_lo, addr_hi, seg, offset); bi_emit_cached_split(b, dest, bits); } static void bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg) { /* Require contiguous masks, gauranteed by nir_lower_wrmasks */ assert(nir_intrinsic_write_mask(instr) == BITFIELD_MASK(instr->num_components)); int16_t offset = 0; bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[1]), 0); bi_index addr_hi = bi_addr_high(b, &instr->src[1]); bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset); bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]), bi_src_index(&instr->src[0]), addr_lo, addr_hi, seg, offset); } /* Exchanges the staging register with memory */ static void bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg, enum bi_seg seg) { assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS); unsigned sz = nir_src_bit_size(*arg); assert(sz == 32 || sz == 64); bi_index data = bi_src_index(arg); bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1); if (b->shader->arch >= 9) bi_handle_segment(b, &addr, &addr_hi, seg, NULL); else if (seg == BI_SEG_WLS) addr_hi = bi_zero(); bi_axchg_to(b, sz, dst, data, bi_extract(b, addr, 0), addr_hi, seg); } /* Exchanges the second staging register with memory if comparison with first * staging register passes */ static void bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1, nir_src *arg_2, enum bi_seg seg) { assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS); /* hardware is swapped from NIR */ bi_index src0 = bi_src_index(arg_2); bi_index src1 = bi_src_index(arg_1); unsigned sz = nir_src_bit_size(*arg_1); assert(sz == 32 || sz == 64); bi_index data_words[] = { bi_extract(b, src0, 0), sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src0, 1), /* 64-bit */ bi_extract(b, src1, 0), sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src1, 1), }; bi_index in = bi_temp(b->shader); bi_emit_collect_to(b, in, data_words, 2 * (sz / 32)); bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1); if (b->shader->arch >= 9) bi_handle_segment(b, &addr, &addr_hi, seg, NULL); else if (seg == BI_SEG_WLS) addr_hi = bi_zero(); bi_index out = bi_acmpxchg(b, sz, in, bi_extract(b, addr, 0), addr_hi, seg); bi_emit_cached_split(b, out, sz); bi_index inout_words[] = { bi_extract(b, out, 0), sz == 64 ? bi_extract(b, out, 1) : bi_null() }; bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32); } /* Extracts an atomic opcode */ static enum bi_atom_opc bi_atom_opc_for_nir(nir_intrinsic_op op) { switch (op) { case nir_intrinsic_global_atomic_add: case nir_intrinsic_shared_atomic_add: case nir_intrinsic_image_atomic_add: return BI_ATOM_OPC_AADD; case nir_intrinsic_global_atomic_imin: case nir_intrinsic_shared_atomic_imin: case nir_intrinsic_image_atomic_imin: return BI_ATOM_OPC_ASMIN; case nir_intrinsic_global_atomic_umin: case nir_intrinsic_shared_atomic_umin: case nir_intrinsic_image_atomic_umin: return BI_ATOM_OPC_AUMIN; case nir_intrinsic_global_atomic_imax: case nir_intrinsic_shared_atomic_imax: case nir_intrinsic_image_atomic_imax: return BI_ATOM_OPC_ASMAX; case nir_intrinsic_global_atomic_umax: case nir_intrinsic_shared_atomic_umax: case nir_intrinsic_image_atomic_umax: return BI_ATOM_OPC_AUMAX; case nir_intrinsic_global_atomic_and: case nir_intrinsic_shared_atomic_and: case nir_intrinsic_image_atomic_and: return BI_ATOM_OPC_AAND; case nir_intrinsic_global_atomic_or: case nir_intrinsic_shared_atomic_or: case nir_intrinsic_image_atomic_or: return BI_ATOM_OPC_AOR; case nir_intrinsic_global_atomic_xor: case nir_intrinsic_shared_atomic_xor: case nir_intrinsic_image_atomic_xor: return BI_ATOM_OPC_AXOR; default: unreachable("Unexpected computational atomic"); } } /* Optimized unary atomics are available with an implied #1 argument */ static bool bi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out) { /* Check we have a compatible constant */ if (arg.type != BI_INDEX_CONSTANT) return false; if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD))) return false; /* Check for a compatible operation */ switch (op) { case BI_ATOM_OPC_AADD: *out = (arg.value == 1) ? BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC; return true; case BI_ATOM_OPC_ASMAX: *out = BI_ATOM_OPC_ASMAX1; return true; case BI_ATOM_OPC_AUMAX: *out = BI_ATOM_OPC_AUMAX1; return true; case BI_ATOM_OPC_AOR: *out = BI_ATOM_OPC_AOR1; return true; default: return false; } } /* * Coordinates are 16-bit integers in Bifrost but 32-bit in NIR. We need to * translate between these forms (with MKVEC.v2i16). * * Aditionally on Valhall, cube maps in the attribute pipe are treated as 2D * arrays. For uniform handling, we also treat 3D textures like 2D arrays. * * Our indexing needs to reflects this. */ static bi_index bi_emit_image_coord(bi_builder *b, bi_index coord, unsigned src_idx, unsigned coord_comps, bool is_array) { assert(coord_comps > 0 && coord_comps <= 3); if (src_idx == 0) { if (coord_comps == 1 || (coord_comps == 2 && is_array)) return bi_extract(b, coord, 0); else return bi_mkvec_v2i16(b, bi_half(bi_extract(b, coord, 0), false), bi_half(bi_extract(b, coord, 1), false)); } else { if (coord_comps == 3 && b->shader->arch >= 9) return bi_mkvec_v2i16(b, bi_imm_u16(0), bi_half(bi_extract(b, coord, 2), false)); else if (coord_comps == 3) return bi_extract(b, coord, 2); else if (coord_comps == 2 && is_array) return bi_extract(b, coord, 1); else return bi_zero(); } } static bi_index bi_emit_image_index(bi_builder *b, nir_intrinsic_instr *instr) { nir_src src = instr->src[0]; bi_index index = bi_src_index(&src); bi_context *ctx = b->shader; /* Images come after vertex attributes, so handle an explicit offset */ unsigned offset = (ctx->stage == MESA_SHADER_VERTEX) ? util_bitcount64(ctx->nir->info.inputs_read) : 0; if (offset == 0) return index; else if (nir_src_is_const(src)) return bi_imm_u32(nir_src_as_uint(src) + offset); else return bi_iadd_u32(b, index, bi_imm_u32(offset), false); } static void bi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr) { enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); unsigned coord_comps = nir_image_intrinsic_coord_components(instr); bool array = nir_intrinsic_image_array(instr); ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim); bi_index coords = bi_src_index(&instr->src[1]); bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array); bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array); bi_index dest = bi_dest_index(&instr->dest); enum bi_register_format regfmt = bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr)); enum bi_vecsize vecsize = instr->num_components - 1; /* TODO: MSAA */ assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported"); if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) { bi_instr *I = bi_ld_tex_imm_to(b, dest, xy, zw, regfmt, vecsize, nir_src_as_uint(instr->src[0])); I->table = PAN_TABLE_IMAGE; } else if (b->shader->arch >= 9) { unreachable("Indirect images on Valhall not yet supported"); } else { bi_ld_attr_tex_to(b, dest, xy, zw, bi_emit_image_index(b, instr), regfmt, vecsize); } bi_split_dest(b, instr->dest); } static bi_index bi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr) { enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); bool array = nir_intrinsic_image_array(instr); ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim); unsigned coord_comps = nir_image_intrinsic_coord_components(instr); /* TODO: MSAA */ assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported"); enum bi_register_format type = (instr->intrinsic == nir_intrinsic_image_store) ? bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)) : BI_REGISTER_FORMAT_AUTO; bi_index coords = bi_src_index(&instr->src[1]); bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array); bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array); bi_index dest = bi_temp(b->shader); if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) { bi_instr *I = bi_lea_tex_imm_to(b, dest, xy, zw, false, nir_src_as_uint(instr->src[0])); I->table = PAN_TABLE_IMAGE; } else if (b->shader->arch >= 9) { unreachable("Indirect images on Valhall not yet supported"); } else { bi_instr *I = bi_lea_attr_tex_to(b, dest, xy, zw, bi_emit_image_index(b, instr), type); /* LEA_ATTR_TEX defaults to the secondary attribute table, but * our ABI has all images in the primary attribute table */ I->table = BI_TABLE_ATTRIBUTE_1; } bi_emit_cached_split(b, dest, 3 * 32); return dest; } static void bi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr) { bi_index a[4] = { bi_null() }; bi_emit_split_i32(b, a, bi_emit_lea_image(b, instr), 3); bi_st_cvt(b, bi_src_index(&instr->src[3]), a[0], a[1], a[2], bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)), instr->num_components - 1); } static void bi_emit_atomic_i32_to(bi_builder *b, bi_index dst, bi_index addr, bi_index arg, nir_intrinsic_op intrinsic) { enum bi_atom_opc opc = bi_atom_opc_for_nir(intrinsic); enum bi_atom_opc post_opc = opc; bool bifrost = b->shader->arch <= 8; /* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't * take any vector but can still output in RETURN mode */ bi_index tmp_dest = bifrost ? bi_temp(b->shader) : dst; unsigned sr_count = bifrost ? 2 : 1; /* Generate either ATOM or ATOM1 as required */ if (bi_promote_atom_c1(opc, arg, &opc)) { bi_atom1_return_i32_to(b, tmp_dest, bi_extract(b, addr, 0), bi_extract(b, addr, 1), opc, sr_count); } else { bi_atom_return_i32_to(b, tmp_dest, arg, bi_extract(b, addr, 0), bi_extract(b, addr, 1), opc, sr_count); } if (bifrost) { /* Post-process it */ bi_emit_cached_split_i32(b, tmp_dest, 2); bi_atom_post_i32_to(b, dst, bi_extract(b, tmp_dest, 0), bi_extract(b, tmp_dest, 1), post_opc); } } /* gl_FragCoord.xy = u16_to_f32(R59.xy) + 0.5 * gl_FragCoord.z = ld_vary(fragz) * gl_FragCoord.w = ld_vary(fragw) */ static void bi_emit_load_frag_coord(bi_builder *b, nir_intrinsic_instr *instr) { bi_index src[4] = {}; for (unsigned i = 0; i < 2; ++i) { src[i] = bi_fadd_f32(b, bi_u16_to_f32(b, bi_half(bi_preload(b, 59), i)), bi_imm_f32(0.5f)); } for (unsigned i = 0; i < 2; ++i) { src[2 + i] = bi_ld_var_special(b, bi_zero(), BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER, (i == 0) ? BI_VARYING_NAME_FRAG_Z : BI_VARYING_NAME_FRAG_W, BI_VECSIZE_NONE); } bi_make_vec_to(b, bi_dest_index(&instr->dest), src, NULL, 4, 32); } static void bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr) { bi_index dest = bi_dest_index(&instr->dest); nir_alu_type T = nir_intrinsic_dest_type(instr); enum bi_register_format regfmt = bi_reg_fmt_for_nir(T); unsigned rt = b->shader->inputs->blend.rt; unsigned size = nir_dest_bit_size(instr->dest); unsigned nr = instr->num_components; /* Get the render target */ if (!b->shader->inputs->is_blend) { const nir_variable *var = nir_find_variable_with_driver_location(b->shader->nir, nir_var_shader_out, nir_intrinsic_base(instr)); unsigned loc = var->data.location; assert(loc >= FRAG_RESULT_DATA0); rt = (loc - FRAG_RESULT_DATA0); } bi_index desc = b->shader->inputs->is_blend ? bi_imm_u32(b->shader->inputs->blend.bifrost_blend_desc >> 32) : b->shader->inputs->bifrost.static_rt_conv ? bi_imm_u32(b->shader->inputs->bifrost.rt_conv[rt]) : bi_load_sysval(b, PAN_SYSVAL(RT_CONVERSION, rt | (size << 4)), 1, 0); bi_ld_tile_to(b, dest, bi_pixel_indices(b, rt), bi_coverage(b), desc, regfmt, nr - 1); bi_emit_cached_split(b, dest, size * nr); } static void bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr) { bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ? bi_dest_index(&instr->dest) : bi_null(); gl_shader_stage stage = b->shader->stage; switch (instr->intrinsic) { case nir_intrinsic_load_barycentric_pixel: case nir_intrinsic_load_barycentric_centroid: case nir_intrinsic_load_barycentric_sample: case nir_intrinsic_load_barycentric_at_sample: case nir_intrinsic_load_barycentric_at_offset: /* handled later via load_vary */ break; case nir_intrinsic_load_interpolated_input: case nir_intrinsic_load_input: if (b->shader->inputs->is_blend) bi_emit_load_blend_input(b, instr); else if (stage == MESA_SHADER_FRAGMENT) bi_emit_load_vary(b, instr); else if (stage == MESA_SHADER_VERTEX) bi_emit_load_attr(b, instr); else unreachable("Unsupported shader stage"); break; case nir_intrinsic_store_output: if (stage == MESA_SHADER_FRAGMENT) bi_emit_fragment_out(b, instr); else if (stage == MESA_SHADER_VERTEX) bi_emit_store_vary(b, instr); else unreachable("Unsupported shader stage"); break; case nir_intrinsic_store_combined_output_pan: assert(stage == MESA_SHADER_FRAGMENT); bi_emit_fragment_out(b, instr); break; case nir_intrinsic_load_ubo: case nir_intrinsic_load_kernel_input: bi_emit_load_ubo(b, instr); break; case nir_intrinsic_load_push_constant: bi_emit_load_push_constant(b, instr); break; case nir_intrinsic_load_global: case nir_intrinsic_load_global_constant: bi_emit_load(b, instr, BI_SEG_NONE); break; case nir_intrinsic_store_global: bi_emit_store(b, instr, BI_SEG_NONE); break; case nir_intrinsic_load_scratch: bi_emit_load(b, instr, BI_SEG_TL); break; case nir_intrinsic_store_scratch: bi_emit_store(b, instr, BI_SEG_TL); break; case nir_intrinsic_load_shared: bi_emit_load(b, instr, BI_SEG_WLS); break; case nir_intrinsic_store_shared: bi_emit_store(b, instr, BI_SEG_WLS); break; /* Blob doesn't seem to do anything for memory barriers, note +BARRIER * is illegal in fragment shaders */ case nir_intrinsic_memory_barrier: case nir_intrinsic_memory_barrier_buffer: case nir_intrinsic_memory_barrier_image: case nir_intrinsic_memory_barrier_shared: case nir_intrinsic_group_memory_barrier: break; case nir_intrinsic_control_barrier: assert(b->shader->stage != MESA_SHADER_FRAGMENT); bi_barrier(b); break; case nir_intrinsic_shared_atomic_add: case nir_intrinsic_shared_atomic_imin: case nir_intrinsic_shared_atomic_umin: case nir_intrinsic_shared_atomic_imax: case nir_intrinsic_shared_atomic_umax: case nir_intrinsic_shared_atomic_and: case nir_intrinsic_shared_atomic_or: case nir_intrinsic_shared_atomic_xor: { assert(nir_src_bit_size(instr->src[1]) == 32); bi_index addr = bi_src_index(&instr->src[0]); bi_index addr_hi; if (b->shader->arch >= 9) { bi_handle_segment(b, &addr, &addr_hi, BI_SEG_WLS, NULL); addr = bi_collect_v2i32(b, addr, addr_hi); } else { addr = bi_seg_add_i64(b, addr, bi_zero(), false, BI_SEG_WLS); bi_emit_cached_split(b, addr, 64); } bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]), instr->intrinsic); bi_split_dest(b, instr->dest); break; } case nir_intrinsic_image_atomic_add: case nir_intrinsic_image_atomic_imin: case nir_intrinsic_image_atomic_umin: case nir_intrinsic_image_atomic_imax: case nir_intrinsic_image_atomic_umax: case nir_intrinsic_image_atomic_and: case nir_intrinsic_image_atomic_or: case nir_intrinsic_image_atomic_xor: assert(nir_src_bit_size(instr->src[3]) == 32); bi_emit_atomic_i32_to(b, dst, bi_emit_lea_image(b, instr), bi_src_index(&instr->src[3]), instr->intrinsic); bi_split_dest(b, instr->dest); break; case nir_intrinsic_global_atomic_add: case nir_intrinsic_global_atomic_imin: case nir_intrinsic_global_atomic_umin: case nir_intrinsic_global_atomic_imax: case nir_intrinsic_global_atomic_umax: case nir_intrinsic_global_atomic_and: case nir_intrinsic_global_atomic_or: case nir_intrinsic_global_atomic_xor: assert(nir_src_bit_size(instr->src[1]) == 32); bi_emit_atomic_i32_to(b, dst, bi_src_index(&instr->src[0]), bi_src_index(&instr->src[1]), instr->intrinsic); bi_split_dest(b, instr->dest); break; case nir_intrinsic_image_load: bi_emit_image_load(b, instr); break; case nir_intrinsic_image_store: bi_emit_image_store(b, instr); break; case nir_intrinsic_global_atomic_exchange: bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1], BI_SEG_NONE); bi_split_dest(b, instr->dest); break; case nir_intrinsic_image_atomic_exchange: bi_emit_axchg_to(b, dst, bi_emit_lea_image(b, instr), &instr->src[3], BI_SEG_NONE); bi_split_dest(b, instr->dest); break; case nir_intrinsic_shared_atomic_exchange: bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1], BI_SEG_WLS); bi_split_dest(b, instr->dest); break; case nir_intrinsic_global_atomic_comp_swap: bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1], &instr->src[2], BI_SEG_NONE); bi_split_dest(b, instr->dest); break; case nir_intrinsic_image_atomic_comp_swap: bi_emit_acmpxchg_to(b, dst, bi_emit_lea_image(b, instr), &instr->src[3], &instr->src[4], BI_SEG_NONE); bi_split_dest(b, instr->dest); break; case nir_intrinsic_shared_atomic_comp_swap: bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1], &instr->src[2], BI_SEG_WLS); bi_split_dest(b, instr->dest); break; case nir_intrinsic_load_frag_coord: bi_emit_load_frag_coord(b, instr); break; case nir_intrinsic_load_output: bi_emit_ld_tile(b, instr); break; case nir_intrinsic_discard_if: bi_discard_b32(b, bi_src_index(&instr->src[0])); break; case nir_intrinsic_discard: bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ); break; case nir_intrinsic_load_ssbo_address: case nir_intrinsic_load_xfb_address: bi_load_sysval_nir(b, instr, 2, 0); break; case nir_intrinsic_load_work_dim: case nir_intrinsic_load_num_vertices: bi_load_sysval_nir(b, instr, 1, 0); break; case nir_intrinsic_load_first_vertex: bi_load_sysval_nir(b, instr, 1, 0); break; case nir_intrinsic_load_base_vertex: bi_load_sysval_nir(b, instr, 1, 4); break; case nir_intrinsic_load_base_instance: bi_load_sysval_nir(b, instr, 1, 8); break; case nir_intrinsic_load_draw_id: bi_load_sysval_nir(b, instr, 1, 0); break; case nir_intrinsic_get_ssbo_size: bi_load_sysval_nir(b, instr, 1, 8); break; case nir_intrinsic_load_viewport_scale: case nir_intrinsic_load_viewport_offset: case nir_intrinsic_load_num_workgroups: case nir_intrinsic_load_workgroup_size: bi_load_sysval_nir(b, instr, 3, 0); break; case nir_intrinsic_image_size: bi_load_sysval_nir(b, instr, nir_dest_num_components(instr->dest), 0); break; case nir_intrinsic_load_blend_const_color_rgba: bi_load_sysval_nir(b, instr, nir_dest_num_components(instr->dest), 0); break; case nir_intrinsic_load_sample_positions_pan: bi_collect_v2i32_to(b, dst, bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false), bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true)); break; case nir_intrinsic_load_sample_mask_in: /* r61[0:15] contains the coverage bitmap */ bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false)); break; case nir_intrinsic_load_sample_id: bi_load_sample_id_to(b, dst); break; case nir_intrinsic_load_front_face: /* r58 == 0 means primitive is front facing */ bi_icmp_i32_to(b, dst, bi_preload(b, 58), bi_zero(), BI_CMPF_EQ, BI_RESULT_TYPE_M1); break; case nir_intrinsic_load_point_coord: bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER, BI_VARYING_NAME_POINT, BI_VECSIZE_V2); bi_emit_cached_split_i32(b, dst, 2); break; /* It appears vertex_id is zero-based with Bifrost geometry flows, but * not with Valhall's memory-allocation IDVS geometry flow. Ostensibly * we support the legacy geometry flow even on Valhall, so * vertex_id_zero_based isn't a machine property for us. Don't set it, * and lower here if needed. */ case nir_intrinsic_load_vertex_id: if (b->shader->malloc_idvs) { bi_mov_i32_to(b, dst, bi_vertex_id(b)); } else { bi_index first = bi_load_sysval(b, PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS, 1, 0); bi_iadd_u32_to(b, dst, bi_vertex_id(b), first, false); } break; /* We only use in our transform feedback lowering */ case nir_intrinsic_load_vertex_id_zero_base: assert(b->shader->nir->info.has_transform_feedback_varyings); bi_mov_i32_to(b, dst, bi_vertex_id(b)); break; case nir_intrinsic_load_instance_id: bi_mov_i32_to(b, dst, bi_instance_id(b)); break; case nir_intrinsic_load_subgroup_invocation: bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false)); break; case nir_intrinsic_load_local_invocation_id: bi_collect_v3i32_to(b, dst, bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 0)), bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 1)), bi_u16_to_u32(b, bi_half(bi_preload(b, 56), 0))); break; case nir_intrinsic_load_workgroup_id: bi_collect_v3i32_to(b, dst, bi_preload(b, 57), bi_preload(b, 58), bi_preload(b, 59)); break; case nir_intrinsic_load_global_invocation_id: case nir_intrinsic_load_global_invocation_id_zero_base: bi_collect_v3i32_to(b, dst, bi_preload(b, 60), bi_preload(b, 61), bi_preload(b, 62)); break; case nir_intrinsic_shader_clock: bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER); bi_split_dest(b, instr->dest); break; default: fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name); assert(0); } } static void bi_emit_load_const(bi_builder *b, nir_load_const_instr *instr) { /* Make sure we've been lowered */ assert(instr->def.num_components <= (32 / instr->def.bit_size)); /* Accumulate all the channels of the constant, as if we did an * implicit SEL over them */ uint32_t acc = 0; for (unsigned i = 0; i < instr->def.num_components; ++i) { unsigned v = nir_const_value_as_uint(instr->value[i], instr->def.bit_size); acc |= (v << (i * instr->def.bit_size)); } bi_mov_i32_to(b, bi_get_index(instr->def.index, false, 0), bi_imm_u32(acc)); } static bi_index bi_alu_src_index(bi_builder *b, nir_alu_src src, unsigned comps) { /* we don't lower modifiers until the backend */ assert(!(src.negate || src.abs)); unsigned bitsize = nir_src_bit_size(src.src); /* the bi_index carries the 32-bit (word) offset separate from the * subword swizzle, first handle the offset */ unsigned offset = 0; assert(bitsize == 8 || bitsize == 16 || bitsize == 32); unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2; for (unsigned i = 0; i < comps; ++i) { unsigned new_offset = (src.swizzle[i] >> subword_shift); if (i > 0) assert(offset == new_offset && "wrong vectorization"); offset = new_offset; } bi_index idx = bi_extract(b, bi_src_index(&src.src), offset); /* Compose the subword swizzle with existing (identity) swizzle */ assert(idx.swizzle == BI_SWIZZLE_H01); /* Bigger vectors should have been lowered */ assert(comps <= (1 << subword_shift)); if (bitsize == 16) { unsigned c0 = src.swizzle[0] & 1; unsigned c1 = (comps > 1) ? src.swizzle[1] & 1 : c0; idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1); } else if (bitsize == 8) { /* 8-bit vectors not yet supported */ assert(comps == 1 && "8-bit vectors not supported"); assert(src.swizzle[0] < 4 && "8-bit vectors not supported"); idx.swizzle = BI_SWIZZLE_B0000 + src.swizzle[0]; } return idx; } static enum bi_round bi_nir_round(nir_op op) { switch (op) { case nir_op_fround_even: return BI_ROUND_NONE; case nir_op_ftrunc: return BI_ROUND_RTZ; case nir_op_fceil: return BI_ROUND_RTP; case nir_op_ffloor: return BI_ROUND_RTN; default: unreachable("invalid nir round op"); } } /* Convenience for lowered transcendentals */ static bi_index bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1) { return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f)); } /* Approximate with FRCP_APPROX.f32 and apply a single iteration of * Newton-Raphson to improve precision */ static void bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0) { bi_index x1 = bi_frcp_approx_f32(b, s0); bi_index m = bi_frexpm_f32(b, s0, false, false); bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, false); bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0), bi_zero(), BI_SPECIAL_N); bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e, BI_SPECIAL_NONE); } static void bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0) { bi_index x1 = bi_frsq_approx_f32(b, s0); bi_index m = bi_frexpm_f32(b, s0, false, true); bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, true); bi_index t1 = bi_fmul_f32(b, x1, x1); bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0), bi_imm_u32(-1), BI_SPECIAL_N); bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e, BI_SPECIAL_N); } /* More complex transcendentals, see * https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc * for documentation */ static void bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0) { bi_index t1 = bi_temp(b->shader); bi_instr *t1_instr = bi_fadd_f32_to(b, t1, s0, bi_imm_u32(0x49400000)); t1_instr->clamp = BI_CLAMP_CLAMP_0_INF; bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000)); bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader), s0, bi_neg(t2)); a2->clamp = BI_CLAMP_CLAMP_M1_1; bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE); bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false); bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4)); bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635), bi_imm_u32(0x3e75fffa)); bi_index p2 = bi_fma_f32(b, p1, a2->dest[0], bi_imm_u32(0x3f317218)); bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2); bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader), p3, a1t, a1t, a1i, BI_SPECIAL_NONE); x->clamp = BI_CLAMP_CLAMP_0_INF; bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0); max->sem = BI_SEM_NAN_PROPAGATE; } static void bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base) { /* Scale by base, Multiply by 2*24 and convert to integer to get a 8:24 * fixed-point input */ bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(), bi_imm_u32(24), BI_SPECIAL_NONE); bi_instr *fixed_pt = bi_f32_to_s32_to(b, bi_temp(b->shader), scale); fixed_pt->round = BI_ROUND_NONE; // XXX /* Compute the result for the fixed-point input, but pass along * the floating-point scale for correct NaN propagation */ bi_fexp_f32_to(b, dst, fixed_pt->dest[0], scale); } static void bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0) { /* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */ bi_index a1 = bi_frexpm_f32(b, s0, true, false); bi_index ei = bi_frexpe_f32(b, s0, true, false); bi_index ef = bi_s32_to_f32(b, ei); /* xt estimates -log(r1), a coarse approximation of log(a1) */ bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE); bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE); /* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) - * log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1), * and then log(s0) = x1 + x2 */ bi_index x1 = bi_fadd_f32(b, ef, xt); /* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by * polynomial approximation around 1. The series is expressed around * 1, so set y = (a1 * r1) - 1.0 */ bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0)); /* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate * log_e(1 + y) by the Taylor series (lower precision than the blob): * y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */ bi_index loge = bi_fmul_f32(b, y, bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0))); bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0))); /* log(s0) = x1 + x2 */ bi_fadd_f32_to(b, dst, x1, x2); } static void bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0) { bi_index frexp = bi_frexpe_f32(b, s0, true, false); bi_index frexpi = bi_s32_to_f32(b, frexp); bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0); bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi); } static void bi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp) { bi_index log2_base = bi_null(); if (base.type == BI_INDEX_CONSTANT) { log2_base = bi_imm_f32(log2f(uif(base.value))); } else { log2_base = bi_temp(b->shader); bi_lower_flog2_32(b, log2_base, base); } return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base)); } static void bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp) { bi_index log2_base = bi_null(); if (base.type == BI_INDEX_CONSTANT) { log2_base = bi_imm_f32(log2f(uif(base.value))); } else { log2_base = bi_temp(b->shader); bi_flog2_32(b, log2_base, base); } return bi_fexp_32(b, dst, exp, log2_base); } /* Bifrost has extremely coarse tables for approximating sin/cos, accessible as * FSIN/COS_TABLE.u6, which multiplies the bottom 6-bits by pi/32 and * calculates the results. We use them to calculate sin/cos via a Taylor * approximation: * * f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x) * sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x) * cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x) */ #define TWO_OVER_PI bi_imm_f32(2.0f / 3.14159f) #define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0) #define SINCOS_BIAS bi_imm_u32(0x49400000) static void bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos) { /* bottom 6-bits of result times pi/32 approximately s0 mod 2pi */ bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS); /* Approximate domain error (small) */ bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS)), MPI_OVER_TWO, s0); /* Lookup sin(x), cos(x) */ bi_index sinx = bi_fsin_table_u6(b, x_u6, false); bi_index cosx = bi_fcos_table_u6(b, x_u6, false); /* e^2 / 2 */ bi_index e2_over_2 = bi_fma_rscale_f32(b, e, e, bi_negzero(), bi_imm_u32(-1), BI_SPECIAL_NONE); /* (-e^2)/2 f''(x) */ bi_index quadratic = bi_fma_f32(b, bi_neg(e2_over_2), cos ? cosx : sinx, bi_negzero()); /* e f'(x) - (e^2/2) f''(x) */ bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e, cos ? bi_neg(sinx) : cosx, quadratic); I->clamp = BI_CLAMP_CLAMP_M1_1; /* f(x) + e f'(x) - (e^2/2) f''(x) */ bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx); } /* * The XOR lane op is useful for derivative calculations, but not all Bifrost * implementations have it. Add a safe helper that uses the hardware * functionality when available and lowers where unavailable. */ static bi_index bi_clper_xor(bi_builder *b, bi_index s0, bi_index s1) { if (!(b->shader->quirks & BIFROST_LIMITED_CLPER)) { return bi_clper_i32(b, s0, s1, BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_XOR, BI_SUBGROUP_SUBGROUP4); } bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false); bi_index lane = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0)); return bi_clper_old_i32(b, s0, lane); } static enum bi_cmpf bi_translate_cmpf(nir_op op) { switch (op) { case nir_op_ieq8: case nir_op_ieq16: case nir_op_ieq32: case nir_op_feq16: case nir_op_feq32: return BI_CMPF_EQ; case nir_op_ine8: case nir_op_ine16: case nir_op_ine32: case nir_op_fneu16: case nir_op_fneu32: return BI_CMPF_NE; case nir_op_ilt8: case nir_op_ilt16: case nir_op_ilt32: case nir_op_flt16: case nir_op_flt32: case nir_op_ult8: case nir_op_ult16: case nir_op_ult32: return BI_CMPF_LT; case nir_op_ige8: case nir_op_ige16: case nir_op_ige32: case nir_op_fge16: case nir_op_fge32: case nir_op_uge8: case nir_op_uge16: case nir_op_uge32: return BI_CMPF_GE; default: unreachable("invalid comparison"); } } static bool bi_nir_is_replicated(nir_alu_src *src) { for (unsigned i = 1; i < nir_src_num_components(src->src); ++i) { if (src->swizzle[0] == src->swizzle[i]) return false; } return true; } static void bi_emit_alu(bi_builder *b, nir_alu_instr *instr) { bi_index dst = bi_dest_index(&instr->dest.dest); unsigned srcs = nir_op_infos[instr->op].num_inputs; unsigned sz = nir_dest_bit_size(instr->dest.dest); unsigned comps = nir_dest_num_components(instr->dest.dest); unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0; /* Indicate scalarness */ if (sz == 16 && comps == 1) dst.swizzle = BI_SWIZZLE_H00; if (!instr->dest.dest.is_ssa) { for (unsigned i = 0; i < comps; ++i) assert(instr->dest.write_mask); } /* First, match against the various moves in NIR. These are * special-cased because they can operate on vectors even after * lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the * instruction is no "bigger" than SIMD-within-a-register. These moves * are the exceptions that need to handle swizzles specially. */ switch (instr->op) { case nir_op_vec2: case nir_op_vec3: case nir_op_vec4: { bi_index unoffset_srcs[4] = { srcs > 0 ? bi_src_index(&instr->src[0].src) : bi_null(), srcs > 1 ? bi_src_index(&instr->src[1].src) : bi_null(), srcs > 2 ? bi_src_index(&instr->src[2].src) : bi_null(), srcs > 3 ? bi_src_index(&instr->src[3].src) : bi_null(), }; unsigned channels[4] = { instr->src[0].swizzle[0], instr->src[1].swizzle[0], srcs > 2 ? instr->src[2].swizzle[0] : 0, srcs > 3 ? instr->src[3].swizzle[0] : 0, }; bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz); return; } case nir_op_vec8: case nir_op_vec16: unreachable("should've been lowered"); case nir_op_unpack_32_2x16: { /* Should have been scalarized */ assert(comps == 2 && sz == 16); bi_index vec = bi_src_index(&instr->src[0].src); unsigned chan = instr->src[0].swizzle[0]; bi_mov_i32_to(b, dst, bi_extract(b, vec, chan)); return; } case nir_op_unpack_64_2x32_split_x: bi_mov_i32_to(b, dst, bi_extract(b, bi_src_index(&instr->src[0].src), 0)); return; case nir_op_unpack_64_2x32_split_y: bi_mov_i32_to(b, dst, bi_extract(b, bi_src_index(&instr->src[0].src), 1)); return; case nir_op_pack_64_2x32_split: bi_collect_v2i32_to(b, dst, bi_extract(b, bi_src_index(&instr->src[0].src), instr->src[0].swizzle[0]), bi_extract(b, bi_src_index(&instr->src[1].src), instr->src[1].swizzle[0])); return; case nir_op_pack_64_2x32: bi_collect_v2i32_to(b, dst, bi_extract(b, bi_src_index(&instr->src[0].src), 0), bi_extract(b, bi_src_index(&instr->src[0].src), 1)); return; case nir_op_pack_uvec2_to_uint: { bi_index src = bi_src_index(&instr->src[0].src); assert(sz == 32 && src_sz == 32); bi_mkvec_v2i16_to(b, dst, bi_half(bi_extract(b, src, 0), false), bi_half(bi_extract(b, src, 1), false)); return; } case nir_op_pack_uvec4_to_uint: { bi_index src = bi_src_index(&instr->src[0].src); assert(sz == 32 && src_sz == 32); bi_mkvec_v4i8_to(b, dst, bi_byte(bi_extract(b, src, 0), 0), bi_byte(bi_extract(b, src, 1), 0), bi_byte(bi_extract(b, src, 2), 0), bi_byte(bi_extract(b, src, 3), 0)); return; } case nir_op_mov: { bi_index idx = bi_src_index(&instr->src[0].src); bi_index unoffset_srcs[4] = { idx, idx, idx, idx }; unsigned channels[4] = { comps > 0 ? instr->src[0].swizzle[0] : 0, comps > 1 ? instr->src[0].swizzle[1] : 0, comps > 2 ? instr->src[0].swizzle[2] : 0, comps > 3 ? instr->src[0].swizzle[3] : 0, }; bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, src_sz); return; } case nir_op_pack_32_2x16: { assert(nir_src_num_components(instr->src[0].src) == 2); assert(comps == 1); bi_index idx = bi_src_index(&instr->src[0].src); bi_index unoffset_srcs[4] = { idx, idx, idx, idx }; unsigned channels[2] = { instr->src[0].swizzle[0], instr->src[0].swizzle[1] }; bi_make_vec_to(b, dst, unoffset_srcs, channels, 2, 16); return; } case nir_op_f2f16: case nir_op_f2f16_rtz: case nir_op_f2f16_rtne: { assert(src_sz == 32); bi_index idx = bi_src_index(&instr->src[0].src); bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]); bi_index s1 = comps > 1 ? bi_extract(b, idx, instr->src[0].swizzle[1]) : s0; bi_instr *I = bi_v2f32_to_v2f16_to(b, dst, s0, s1); /* Override rounding if explicitly requested. Otherwise, the * default rounding mode is selected by the builder. Depending * on the float controls required by the shader, the default * mode may not be nearest-even. */ if (instr->op == nir_op_f2f16_rtz) I->round = BI_ROUND_RTZ; else if (instr->op == nir_op_f2f16_rtne) I->round = BI_ROUND_NONE; /* Nearest even */ return; } /* Vectorized downcasts */ case nir_op_u2u16: case nir_op_i2i16: { if (!(src_sz == 32 && comps == 2)) break; bi_index idx = bi_src_index(&instr->src[0].src); bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]); bi_index s1 = bi_extract(b, idx, instr->src[0].swizzle[1]); bi_mkvec_v2i16_to(b, dst, bi_half(s0, false), bi_half(s1, false)); return; } /* While we do not have a direct V2U32_TO_V2F16 instruction, lowering to * MKVEC.v2i16 + V2U16_TO_V2F16 is more efficient on Bifrost than * scalarizing due to scheduling (equal cost on Valhall). Additionally * if the source is replicated the MKVEC.v2i16 can be optimized out. */ case nir_op_u2f16: case nir_op_i2f16: { if (!(src_sz == 32 && comps == 2)) break; nir_alu_src *src = &instr->src[0]; bi_index idx = bi_src_index(&src->src); bi_index s0 = bi_extract(b, idx, src->swizzle[0]); bi_index s1 = bi_extract(b, idx, src->swizzle[1]); bi_index t = (src->swizzle[0] == src->swizzle[1]) ? bi_half(s0, false) : bi_mkvec_v2i16(b, bi_half(s0, false), bi_half(s1, false)); if (instr->op == nir_op_u2f16) bi_v2u16_to_v2f16_to(b, dst, t); else bi_v2s16_to_v2f16_to(b, dst, t); return; } case nir_op_i2i8: case nir_op_u2u8: { /* Acts like an 8-bit swizzle */ bi_index idx = bi_src_index(&instr->src[0].src); unsigned factor = src_sz / 8; unsigned chan[4] = { 0 }; for (unsigned i = 0; i < comps; ++i) chan[i] = instr->src[0].swizzle[i] * factor; bi_make_vec_to(b, dst, &idx, chan, comps, 8); return; } case nir_op_b32csel: { if (sz != 16) break; /* We allow vectorizing b32csel(cond, A, B) which can be * translated as MUX.v2i16, even though cond is a 32-bit vector. * * If the source condition vector is replicated, we can use * MUX.v2i16 directly, letting each component use the * corresponding half of the 32-bit source. NIR uses 0/~0 * booleans so that's guaranteed to work (that is, 32-bit NIR * booleans are 16-bit replicated). * * If we're not replicated, we use the same trick but must * insert a MKVEC.v2i16 first to convert down to 16-bit. */ bi_index idx = bi_src_index(&instr->src[0].src); bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]); bi_index s1 = bi_alu_src_index(b, instr->src[1], comps); bi_index s2 = bi_alu_src_index(b, instr->src[2], comps); if (!bi_nir_is_replicated(&instr->src[0])) { s0 = bi_mkvec_v2i16(b, bi_half(s0, false), bi_half(bi_extract(b, idx, instr->src[0].swizzle[1]), false)); } bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); return; } default: break; } bi_index s0 = srcs > 0 ? bi_alu_src_index(b, instr->src[0], comps) : bi_null(); bi_index s1 = srcs > 1 ? bi_alu_src_index(b, instr->src[1], comps) : bi_null(); bi_index s2 = srcs > 2 ? bi_alu_src_index(b, instr->src[2], comps) : bi_null(); switch (instr->op) { case nir_op_ffma: bi_fma_to(b, sz, dst, s0, s1, s2); break; case nir_op_fmul: bi_fma_to(b, sz, dst, s0, s1, bi_negzero()); break; case nir_op_fsub: s1 = bi_neg(s1); FALLTHROUGH; case nir_op_fadd: bi_fadd_to(b, sz, dst, s0, s1); break; case nir_op_fsat: { bi_instr *I = bi_fclamp_to(b, sz, dst, s0); I->clamp = BI_CLAMP_CLAMP_0_1; break; } case nir_op_fsat_signed_mali: { bi_instr *I = bi_fclamp_to(b, sz, dst, s0); I->clamp = BI_CLAMP_CLAMP_M1_1; break; } case nir_op_fclamp_pos_mali: { bi_instr *I = bi_fclamp_to(b, sz, dst, s0); I->clamp = BI_CLAMP_CLAMP_0_INF; break; } case nir_op_fneg: bi_fabsneg_to(b, sz, dst, bi_neg(s0)); break; case nir_op_fabs: bi_fabsneg_to(b, sz, dst, bi_abs(s0)); break; case nir_op_fsin: bi_lower_fsincos_32(b, dst, s0, false); break; case nir_op_fcos: bi_lower_fsincos_32(b, dst, s0, true); break; case nir_op_fexp2: assert(sz == 32); /* should've been lowered */ if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) bi_lower_fexp2_32(b, dst, s0); else bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f)); break; case nir_op_flog2: assert(sz == 32); /* should've been lowered */ if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) bi_lower_flog2_32(b, dst, s0); else bi_flog2_32(b, dst, s0); break; case nir_op_fpow: assert(sz == 32); /* should've been lowered */ if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) bi_lower_fpow_32(b, dst, s0, s1); else bi_fpow_32(b, dst, s0, s1); break; case nir_op_frexp_exp: bi_frexpe_to(b, sz, dst, s0, false, false); break; case nir_op_frexp_sig: bi_frexpm_to(b, sz, dst, s0, false, false); break; case nir_op_ldexp: bi_ldexp_to(b, sz, dst, s0, s1); break; case nir_op_b8csel: bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); break; case nir_op_b16csel: bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); break; case nir_op_b32csel: bi_mux_i32_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); break; case nir_op_ishl: bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0)); break; case nir_op_ushr: bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), false); break; case nir_op_ishr: if (b->shader->arch >= 9) bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), true); else bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0)); break; case nir_op_imin: case nir_op_umin: bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, s0, s1, s0, s1, BI_CMPF_LT); break; case nir_op_imax: case nir_op_umax: bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, s0, s1, s0, s1, BI_CMPF_GT); break; case nir_op_fddx_must_abs_mali: case nir_op_fddy_must_abs_mali: { bi_index bit = bi_imm_u32(instr->op == nir_op_fddx_must_abs_mali ? 1 : 2); bi_index adjacent = bi_clper_xor(b, s0, bit); bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0)); break; } case nir_op_fddx: case nir_op_fddy: case nir_op_fddx_coarse: case nir_op_fddy_coarse: case nir_op_fddx_fine: case nir_op_fddy_fine: { unsigned axis; switch (instr->op) { case nir_op_fddx: case nir_op_fddx_coarse: case nir_op_fddx_fine: axis = 1; break; case nir_op_fddy: case nir_op_fddy_coarse: case nir_op_fddy_fine: axis = 2; break; default: unreachable("Invalid derivative op"); } bi_index lane1, lane2; switch (instr->op) { case nir_op_fddx: case nir_op_fddx_fine: case nir_op_fddy: case nir_op_fddy_fine: lane1 = bi_lshift_and_i32(b, bi_fau(BIR_FAU_LANE_ID, false), bi_imm_u32(0x3 & ~axis), bi_imm_u8(0)); lane2 = bi_iadd_u32(b, lane1, bi_imm_u32(axis), false); break; case nir_op_fddx_coarse: case nir_op_fddy_coarse: lane1 = bi_imm_u32(0); lane2 = bi_imm_u32(axis); break; default: unreachable("Invalid derivative op"); } bi_index left, right; if (b->shader->quirks & BIFROST_LIMITED_CLPER) { left = bi_clper_old_i32(b, s0, lane1); right = bi_clper_old_i32(b, s0, lane2); } else { left = bi_clper_i32(b, s0, lane1, BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, BI_SUBGROUP_SUBGROUP4); right = bi_clper_i32(b, s0, lane2, BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, BI_SUBGROUP_SUBGROUP4); } bi_fadd_to(b, sz, dst, right, bi_neg(left)); break; } case nir_op_f2f32: bi_f16_to_f32_to(b, dst, s0); break; case nir_op_fquantize2f16: { bi_instr *f16 = bi_v2f32_to_v2f16_to(b, bi_temp(b->shader), s0, s0); bi_instr *f32 = bi_f16_to_f32_to(b, dst, bi_half(f16->dest[0], false)); f16->ftz = f32->ftz = true; break; } case nir_op_f2i32: if (src_sz == 32) bi_f32_to_s32_to(b, dst, s0); else bi_f16_to_s32_to(b, dst, s0); break; /* Note 32-bit sources => no vectorization, so 32-bit works */ case nir_op_f2u16: if (src_sz == 32) bi_f32_to_u32_to(b, dst, s0); else bi_v2f16_to_v2u16_to(b, dst, s0); break; case nir_op_f2i16: if (src_sz == 32) bi_f32_to_s32_to(b, dst, s0); else bi_v2f16_to_v2s16_to(b, dst, s0); break; case nir_op_f2u32: if (src_sz == 32) bi_f32_to_u32_to(b, dst, s0); else bi_f16_to_u32_to(b, dst, s0); break; case nir_op_u2f16: if (src_sz == 32) bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false)); else if (src_sz == 16) bi_v2u16_to_v2f16_to(b, dst, s0); else if (src_sz == 8) bi_v2u8_to_v2f16_to(b, dst, s0); break; case nir_op_u2f32: if (src_sz == 32) bi_u32_to_f32_to(b, dst, s0); else if (src_sz == 16) bi_u16_to_f32_to(b, dst, s0); else bi_u8_to_f32_to(b, dst, s0); break; case nir_op_i2f16: if (src_sz == 32) bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false)); else if (src_sz == 16) bi_v2s16_to_v2f16_to(b, dst, s0); else if (src_sz == 8) bi_v2s8_to_v2f16_to(b, dst, s0); break; case nir_op_i2f32: assert(src_sz == 32 || src_sz == 16 || src_sz == 8); if (src_sz == 32) bi_s32_to_f32_to(b, dst, s0); else if (src_sz == 16) bi_s16_to_f32_to(b, dst, s0); else if (src_sz == 8) bi_s8_to_f32_to(b, dst, s0); break; case nir_op_i2i32: assert(src_sz == 32 || src_sz == 16 || src_sz == 8); if (src_sz == 32) bi_mov_i32_to(b, dst, s0); else if (src_sz == 16) bi_s16_to_s32_to(b, dst, s0); else if (src_sz == 8) bi_s8_to_s32_to(b, dst, s0); break; case nir_op_u2u32: assert(src_sz == 32 || src_sz == 16 || src_sz == 8); if (src_sz == 32) bi_mov_i32_to(b, dst, s0); else if (src_sz == 16) bi_u16_to_u32_to(b, dst, s0); else if (src_sz == 8) bi_u8_to_u32_to(b, dst, s0); break; case nir_op_i2i16: assert(src_sz == 8 || src_sz == 32); if (src_sz == 8) bi_v2s8_to_v2s16_to(b, dst, s0); else bi_mov_i32_to(b, dst, s0); break; case nir_op_u2u16: assert(src_sz == 8 || src_sz == 32); if (src_sz == 8) bi_v2u8_to_v2u16_to(b, dst, s0); else bi_mov_i32_to(b, dst, s0); break; case nir_op_b2i8: case nir_op_b2i16: case nir_op_b2i32: bi_mux_to(b, sz, dst, bi_imm_u8(0), bi_imm_uintN(1, sz), s0, BI_MUX_INT_ZERO); break; case nir_op_f2b16: bi_mux_v2i16_to(b, dst, bi_imm_u16(0), bi_imm_u16(~0), s0, BI_MUX_FP_ZERO); break; case nir_op_f2b32: bi_mux_i32_to(b, dst, bi_imm_u32(0), bi_imm_u32(~0), s0, BI_MUX_FP_ZERO); break; case nir_op_i2b8: bi_mux_v4i8_to(b, dst, bi_imm_u8(0), bi_imm_u8(~0), s0, BI_MUX_INT_ZERO); break; case nir_op_i2b16: bi_mux_v2i16_to(b, dst, bi_imm_u16(0), bi_imm_u16(~0), s0, BI_MUX_INT_ZERO); break; case nir_op_i2b32: bi_mux_i32_to(b, dst, bi_imm_u32(0), bi_imm_u32(~0), s0, BI_MUX_INT_ZERO); break; case nir_op_ieq8: case nir_op_ine8: case nir_op_ilt8: case nir_op_ige8: case nir_op_ieq16: case nir_op_ine16: case nir_op_ilt16: case nir_op_ige16: case nir_op_ieq32: case nir_op_ine32: case nir_op_ilt32: case nir_op_ige32: bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1); break; case nir_op_ult8: case nir_op_uge8: case nir_op_ult16: case nir_op_uge16: case nir_op_ult32: case nir_op_uge32: bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1); break; case nir_op_feq32: case nir_op_feq16: case nir_op_flt32: case nir_op_flt16: case nir_op_fge32: case nir_op_fge16: case nir_op_fneu32: case nir_op_fneu16: bi_fcmp_to(b, sz, dst, s0, s1, bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1); break; case nir_op_fround_even: case nir_op_fceil: case nir_op_ffloor: case nir_op_ftrunc: bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op)); break; case nir_op_fmin: bi_fmin_to(b, sz, dst, s0, s1); break; case nir_op_fmax: bi_fmax_to(b, sz, dst, s0, s1); break; case nir_op_iadd: bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false); break; case nir_op_iadd_sat: bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true); break; case nir_op_uadd_sat: bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true); break; case nir_op_ihadd: bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN); break; case nir_op_irhadd: bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP); break; case nir_op_ineg: bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false); break; case nir_op_isub: bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false); break; case nir_op_isub_sat: bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true); break; case nir_op_usub_sat: bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true); break; case nir_op_imul: bi_imul_to(b, sz, dst, s0, s1); break; case nir_op_iabs: bi_iabs_to(b, sz, dst, s0); break; case nir_op_iand: bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0)); break; case nir_op_ior: bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0)); break; case nir_op_ixor: bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0)); break; case nir_op_inot: bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0)); break; case nir_op_frsq: if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) bi_lower_frsq_32(b, dst, s0); else bi_frsq_to(b, sz, dst, s0); break; case nir_op_frcp: if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) bi_lower_frcp_32(b, dst, s0); else bi_frcp_to(b, sz, dst, s0); break; case nir_op_uclz: bi_clz_to(b, sz, dst, s0, false); break; case nir_op_bit_count: bi_popcount_i32_to(b, dst, s0); break; case nir_op_bitfield_reverse: bi_bitrev_i32_to(b, dst, s0); break; case nir_op_ufind_msb: { bi_index clz = bi_clz(b, src_sz, s0, false); if (sz == 8) clz = bi_byte(clz, 0); else if (sz == 16) clz = bi_half(clz, false); bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false); break; } default: fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name); unreachable("Unknown ALU op"); } } /* Returns dimension with 0 special casing cubemaps. Shamelessly copied from Midgard */ static unsigned bifrost_tex_format(enum glsl_sampler_dim dim) { switch (dim) { case GLSL_SAMPLER_DIM_1D: case GLSL_SAMPLER_DIM_BUF: return 1; case GLSL_SAMPLER_DIM_2D: case GLSL_SAMPLER_DIM_MS: case GLSL_SAMPLER_DIM_EXTERNAL: case GLSL_SAMPLER_DIM_RECT: return 2; case GLSL_SAMPLER_DIM_3D: return 3; case GLSL_SAMPLER_DIM_CUBE: return 0; default: DBG("Unknown sampler dim type\n"); assert(0); return 0; } } static enum bi_dimension valhall_tex_dimension(enum glsl_sampler_dim dim) { switch (dim) { case GLSL_SAMPLER_DIM_1D: case GLSL_SAMPLER_DIM_BUF: return BI_DIMENSION_1D; case GLSL_SAMPLER_DIM_2D: case GLSL_SAMPLER_DIM_MS: case GLSL_SAMPLER_DIM_EXTERNAL: case GLSL_SAMPLER_DIM_RECT: return BI_DIMENSION_2D; case GLSL_SAMPLER_DIM_3D: return BI_DIMENSION_3D; case GLSL_SAMPLER_DIM_CUBE: return BI_DIMENSION_CUBE; default: unreachable("Unknown sampler dim type"); } } static enum bifrost_texture_format_full bi_texture_format(nir_alu_type T, enum bi_clamp clamp) { switch (T) { case nir_type_float16: return BIFROST_TEXTURE_FORMAT_F16 + clamp; case nir_type_float32: return BIFROST_TEXTURE_FORMAT_F32 + clamp; case nir_type_uint16: return BIFROST_TEXTURE_FORMAT_U16; case nir_type_int16: return BIFROST_TEXTURE_FORMAT_S16; case nir_type_uint32: return BIFROST_TEXTURE_FORMAT_U32; case nir_type_int32: return BIFROST_TEXTURE_FORMAT_S32; default: unreachable("Invalid type for texturing"); } } /* Array indices are specified as 32-bit uints, need to convert. In .z component from NIR */ static bi_index bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T) { /* For (u)int we can just passthrough */ nir_alu_type base = nir_alu_type_get_base_type(T); if (base == nir_type_int || base == nir_type_uint) return idx; /* Otherwise we convert */ assert(T == nir_type_float32); /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and * Texel Selection") defines the layer to be taken from clamp(RNE(r), * 0, dt - 1). So we use round RTE, clamping is handled at the data * structure level */ bi_instr *I = bi_f32_to_u32_to(b, bi_temp(b->shader), idx); I->round = BI_ROUND_NONE; return I->dest[0]; } /* TEXC's explicit and bias LOD modes requires the LOD to be transformed to a * 16-bit 8:8 fixed-point format. We lower as: * * F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF = * MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0) */ static bi_index bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16) { /* Precompute for constant LODs to avoid general constant folding */ if (lod.type == BI_INDEX_CONSTANT) { uint32_t raw = lod.value; float x = fp16 ? _mesa_half_to_float(raw) : uif(raw); int32_t s32 = CLAMP(x, -16.0f, 16.0f) * 256.0f; return bi_imm_u32(s32 & 0xFFFF); } /* Sort of arbitrary. Must be less than 128.0, greater than or equal to * the max LOD (16 since we cap at 2^16 texture dimensions), and * preferably small to minimize precision loss */ const float max_lod = 16.0; bi_instr *fsat = bi_fma_f32_to(b, bi_temp(b->shader), fp16 ? bi_half(lod, false) : lod, bi_imm_f32(1.0f / max_lod), bi_negzero()); fsat->clamp = BI_CLAMP_CLAMP_M1_1; bi_index fmul = bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f), bi_negzero()); return bi_mkvec_v2i16(b, bi_half(bi_f32_to_s32(b, fmul), false), bi_imm_u16(0)); } /* FETCH takes a 32-bit staging register containing the LOD as an integer in * the bottom 16-bits and (if present) the cube face index in the top 16-bits. * TODO: Cube face. */ static bi_index bi_emit_texc_lod_cube(bi_builder *b, bi_index lod) { return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8)); } /* The hardware specifies texel offsets and multisample indices together as a * u8vec4 . By default all are zero, so if have either a * nonzero texel offset or a nonzero multisample index, we build a u8vec4 with * the bits we need and return that to be passed as a staging register. Else we * return 0 to avoid allocating a data register when everything is zero. */ static bi_index bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr) { bi_index dest = bi_zero(); int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset); if (offs_idx >= 0 && (!nir_src_is_const(instr->src[offs_idx].src) || nir_src_as_uint(instr->src[offs_idx].src) != 0)) { unsigned nr = nir_src_num_components(instr->src[offs_idx].src); bi_index idx = bi_src_index(&instr->src[offs_idx].src); dest = bi_mkvec_v4i8(b, (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0), (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0), (nr > 2) ? bi_byte(bi_extract(b, idx, 2), 0) : bi_imm_u8(0), bi_imm_u8(0)); } int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index); if (ms_idx >= 0 && (!nir_src_is_const(instr->src[ms_idx].src) || nir_src_as_uint(instr->src[ms_idx].src) != 0)) { dest = bi_lshift_or_i32(b, bi_src_index(&instr->src[ms_idx].src), dest, bi_imm_u8(24)); } return dest; } /* * Valhall specifies specifies texel offsets, multisample indices, and (for * fetches) LOD together as a u8vec4 , where the third * component is either offset.z or multisample index depending on context. Build * this register. */ static bi_index bi_emit_valhall_offsets(bi_builder *b, nir_tex_instr *instr) { bi_index dest = bi_zero(); int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset); int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index); int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod); /* Components 0-2: offsets */ if (offs_idx >= 0 && (!nir_src_is_const(instr->src[offs_idx].src) || nir_src_as_uint(instr->src[offs_idx].src) != 0)) { unsigned nr = nir_src_num_components(instr->src[offs_idx].src); bi_index idx = bi_src_index(&instr->src[offs_idx].src); /* No multisample index with 3D */ assert((nr <= 2) || (ms_idx < 0)); /* Zero extend the Z byte so we can use it with MKVEC.v2i8 */ bi_index z = (nr > 2) ? bi_mkvec_v2i8(b, bi_byte(bi_extract(b, idx, 2), 0), bi_imm_u8(0), bi_zero()) : bi_zero(); dest = bi_mkvec_v2i8(b, (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0), (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0), z); } /* Component 2: multisample index */ if (ms_idx >= 0 && (!nir_src_is_const(instr->src[ms_idx].src) || nir_src_as_uint(instr->src[ms_idx].src) != 0)) { dest = bi_mkvec_v2i16(b, dest, bi_src_index(&instr->src[ms_idx].src)); } /* Component 3: 8-bit LOD */ if (lod_idx >= 0 && (!nir_src_is_const(instr->src[lod_idx].src) || nir_src_as_uint(instr->src[lod_idx].src) != 0) && nir_tex_instr_src_type(instr, lod_idx) != nir_type_float) { dest = bi_lshift_or_i32(b, bi_src_index(&instr->src[lod_idx].src), dest, bi_imm_u8(24)); } return dest; } static void bi_emit_cube_coord(bi_builder *b, bi_index coord, bi_index *face, bi_index *s, bi_index *t) { /* Compute max { |x|, |y|, |z| } */ bi_index maxxyz = bi_temp(b->shader); *face = bi_temp(b->shader); bi_index cx = bi_extract(b, coord, 0), cy = bi_extract(b, coord, 1), cz = bi_extract(b, coord, 2); /* Use a pseudo op on Bifrost due to tuple restrictions */ if (b->shader->arch <= 8) { bi_cubeface_to(b, maxxyz, *face, cx, cy, cz); } else { bi_cubeface1_to(b, maxxyz, cx, cy, cz); bi_cubeface2_v9_to(b, *face, cx, cy, cz); } /* Select coordinates */ bi_index ssel = bi_cube_ssel(b, bi_extract(b, coord, 2), bi_extract(b, coord, 0), *face); bi_index tsel = bi_cube_tsel(b, bi_extract(b, coord, 1), bi_extract(b, coord, 2), *face); /* The OpenGL ES specification requires us to transform an input vector * (x, y, z) to the coordinate, given the selected S/T: * * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1)) * * We implement (s shown, t similar) in a form friendlier to FMA * instructions, and clamp coordinates at the end for correct * NaN/infinity handling: * * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5) * * Take the reciprocal of max{x, y, z} */ bi_index rcp = bi_frcp_f32(b, maxxyz); /* Calculate 0.5 * (1.0 / max{x, y, z}) */ bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero()); /* Transform the coordinates */ *s = bi_temp(b->shader); *t = bi_temp(b->shader); bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f)); bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f)); S->clamp = BI_CLAMP_CLAMP_0_1; T->clamp = BI_CLAMP_CLAMP_0_1; } /* Emits a cube map descriptor, returning lower 32-bits and putting upper * 32-bits in passed pointer t. The packing of the face with the S coordinate * exploits the redundancy of floating points with the range restriction of * CUBEFACE output. * * struct cube_map_descriptor { * float s : 29; * unsigned face : 3; * float t : 32; * } * * Since the cube face index is preshifted, this is easy to pack with a bitwise * MUX.i32 and a fixed mask, selecting the lower bits 29 from s and the upper 3 * bits from face. */ static bi_index bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t) { bi_index face, s; bi_emit_cube_coord(b, coord, &face, &s, t); bi_index mask = bi_imm_u32(BITFIELD_MASK(29)); return bi_mux_i32(b, s, face, mask, BI_MUX_BIT); } /* Map to the main texture op used. Some of these (txd in particular) will * lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in * sequence). We assume that lowering is handled elsewhere. */ static enum bifrost_tex_op bi_tex_op(nir_texop op) { switch (op) { case nir_texop_tex: case nir_texop_txb: case nir_texop_txl: case nir_texop_txd: case nir_texop_tex_prefetch: return BIFROST_TEX_OP_TEX; case nir_texop_txf: case nir_texop_txf_ms: case nir_texop_txf_ms_fb: case nir_texop_tg4: return BIFROST_TEX_OP_FETCH; case nir_texop_txs: case nir_texop_lod: case nir_texop_query_levels: case nir_texop_texture_samples: case nir_texop_samples_identical: unreachable("should've been lowered"); default: unreachable("unsupported tex op"); } } /* Data registers required by texturing in the order they appear. All are * optional, the texture operation descriptor determines which are present. * Note since 3D arrays are not permitted at an API level, Z_COORD and * ARRAY/SHADOW are exlusive, so TEXC in practice reads at most 8 registers */ enum bifrost_tex_dreg { BIFROST_TEX_DREG_Z_COORD = 0, BIFROST_TEX_DREG_Y_DELTAS = 1, BIFROST_TEX_DREG_LOD = 2, BIFROST_TEX_DREG_GRDESC_HI = 3, BIFROST_TEX_DREG_SHADOW = 4, BIFROST_TEX_DREG_ARRAY = 5, BIFROST_TEX_DREG_OFFSETMS = 6, BIFROST_TEX_DREG_SAMPLER = 7, BIFROST_TEX_DREG_TEXTURE = 8, BIFROST_TEX_DREG_COUNT, }; static void bi_emit_texc(bi_builder *b, nir_tex_instr *instr) { struct bifrost_texture_operation desc = { .op = bi_tex_op(instr->op), .offset_or_bias_disable = false, /* TODO */ .shadow_or_clamp_disable = instr->is_shadow, .array = instr->is_array, .dimension = bifrost_tex_format(instr->sampler_dim), .format = bi_texture_format(instr->dest_type | nir_dest_bit_size(instr->dest), BI_CLAMP_NONE), /* TODO */ .mask = 0xF, }; switch (desc.op) { case BIFROST_TEX_OP_TEX: desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE; break; case BIFROST_TEX_OP_FETCH: desc.lod_or_fetch = (enum bifrost_lod_mode) (instr->op == nir_texop_tg4 ? BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component : BIFROST_TEXTURE_FETCH_TEXEL); break; default: unreachable("texture op unsupported"); } /* 32-bit indices to be allocated as consecutive staging registers */ bi_index dregs[BIFROST_TEX_DREG_COUNT] = { }; bi_index cx = bi_null(), cy = bi_null(); for (unsigned i = 0; i < instr->num_srcs; ++i) { bi_index index = bi_src_index(&instr->src[i].src); unsigned sz = nir_src_bit_size(instr->src[i].src); unsigned components = nir_src_num_components(instr->src[i].src); ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i); nir_alu_type T = base | sz; switch (instr->src[i].src_type) { case nir_tex_src_coord: if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { cx = bi_emit_texc_cube_coord(b, index, &cy); } else { /* Copy XY (for 2D+) or XX (for 1D) */ cx = bi_extract(b, index, 0); cy = bi_extract(b, index, MIN2(1, components - 1)); assert(components >= 1 && components <= 3); if (components == 3 && !desc.array) { /* 3D */ dregs[BIFROST_TEX_DREG_Z_COORD] = bi_extract(b, index, 2); } } if (desc.array) { dregs[BIFROST_TEX_DREG_ARRAY] = bi_emit_texc_array_index(b, bi_extract(b, index, components - 1), T); } break; case nir_tex_src_lod: if (desc.op == BIFROST_TEX_OP_TEX && nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) { desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO; } else if (desc.op == BIFROST_TEX_OP_TEX) { assert(base == nir_type_float); assert(sz == 16 || sz == 32); dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16); desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT; } else { assert(desc.op == BIFROST_TEX_OP_FETCH); assert(base == nir_type_uint || base == nir_type_int); assert(sz == 16 || sz == 32); dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_cube(b, index); } break; case nir_tex_src_bias: /* Upper 16-bits interpreted as a clamp, leave zero */ assert(desc.op == BIFROST_TEX_OP_TEX); assert(base == nir_type_float); assert(sz == 16 || sz == 32); dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16); desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS; break; case nir_tex_src_ms_index: case nir_tex_src_offset: if (desc.offset_or_bias_disable) break; dregs[BIFROST_TEX_DREG_OFFSETMS] = bi_emit_texc_offset_ms_index(b, instr); if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero())) desc.offset_or_bias_disable = true; break; case nir_tex_src_comparator: dregs[BIFROST_TEX_DREG_SHADOW] = index; break; case nir_tex_src_texture_offset: if (instr->texture_index) index = bi_iadd_u32(b, index, bi_imm_u32(instr->texture_index), false); dregs[BIFROST_TEX_DREG_TEXTURE] = index; break; case nir_tex_src_sampler_offset: if (instr->sampler_index) index = bi_iadd_u32(b, index, bi_imm_u32(instr->sampler_index), false); dregs[BIFROST_TEX_DREG_SAMPLER] = index; break; default: unreachable("Unhandled src type in texc emit"); } } if (desc.op == BIFROST_TEX_OP_FETCH && bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) { dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_cube(b, bi_zero()); } /* Choose an index mode */ bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]); bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]); bool direct = direct_tex && direct_samp; desc.immediate_indices = direct && (instr->sampler_index < 16); if (desc.immediate_indices) { desc.sampler_index_or_mode = instr->sampler_index; desc.index = instr->texture_index; } else { unsigned mode = 0; if (direct && instr->sampler_index == instr->texture_index) { mode = BIFROST_INDEX_IMMEDIATE_SHARED; desc.index = instr->texture_index; } else if (direct) { mode = BIFROST_INDEX_IMMEDIATE_SAMPLER; desc.index = instr->sampler_index; dregs[BIFROST_TEX_DREG_TEXTURE] = bi_mov_i32(b, bi_imm_u32(instr->texture_index)); } else if (direct_tex) { assert(!direct_samp); mode = BIFROST_INDEX_IMMEDIATE_TEXTURE; desc.index = instr->texture_index; } else if (direct_samp) { assert(!direct_tex); mode = BIFROST_INDEX_IMMEDIATE_SAMPLER; desc.index = instr->sampler_index; } else { mode = BIFROST_INDEX_REGISTER; } mode |= (BIFROST_TEXTURE_OPERATION_SINGLE << 2); desc.sampler_index_or_mode = mode; } /* Allocate staging registers contiguously by compacting the array. */ unsigned sr_count = 0; for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) { if (!bi_is_null(dregs[i])) dregs[sr_count++] = dregs[i]; } unsigned res_size = nir_dest_bit_size(instr->dest) == 16 ? 2 : 4; bi_index sr = sr_count ? bi_temp(b->shader) : bi_null(); bi_index dst = bi_temp(b->shader); if (sr_count) bi_emit_collect_to(b, sr, dregs, sr_count); uint32_t desc_u = 0; memcpy(&desc_u, &desc, sizeof(desc_u)); bi_instr *I = bi_texc_to(b, dst, bi_null(), sr, cx, cy, bi_imm_u32(desc_u), !nir_tex_instr_has_implicit_derivative(instr), sr_count, 0); I->register_format = bi_reg_fmt_for_nir(instr->dest_type); bi_index w[4] = { bi_null(), bi_null(), bi_null(), bi_null() }; bi_emit_split_i32(b, w, dst, res_size); bi_emit_collect_to(b, bi_dest_index(&instr->dest), w, DIV_ROUND_UP(nir_dest_num_components(instr->dest) * res_size, 4)); } /* Staging registers required by texturing in the order they appear (Valhall) */ enum valhall_tex_sreg { VALHALL_TEX_SREG_X_COORD = 0, VALHALL_TEX_SREG_Y_COORD = 1, VALHALL_TEX_SREG_Z_COORD = 2, VALHALL_TEX_SREG_Y_DELTAS = 3, VALHALL_TEX_SREG_ARRAY = 4, VALHALL_TEX_SREG_SHADOW = 5, VALHALL_TEX_SREG_OFFSETMS = 6, VALHALL_TEX_SREG_LOD = 7, VALHALL_TEX_SREG_GRDESC = 8, VALHALL_TEX_SREG_COUNT, }; static void bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr) { bool explicit_offset = false; enum bi_va_lod_mode lod_mode = BI_VA_LOD_MODE_COMPUTED_LOD; bool has_lod_mode = (instr->op == nir_texop_tex) || (instr->op == nir_texop_txl) || (instr->op == nir_texop_txb); /* 32-bit indices to be allocated as consecutive staging registers */ bi_index sregs[VALHALL_TEX_SREG_COUNT] = { }; bi_index sampler = bi_imm_u32(instr->sampler_index); bi_index texture = bi_imm_u32(instr->texture_index); uint32_t tables = (PAN_TABLE_SAMPLER << 11) | (PAN_TABLE_TEXTURE << 27); for (unsigned i = 0; i < instr->num_srcs; ++i) { bi_index index = bi_src_index(&instr->src[i].src); unsigned sz = nir_src_bit_size(instr->src[i].src); unsigned components = nir_src_num_components(instr->src[i].src); switch (instr->src[i].src_type) { case nir_tex_src_coord: if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { sregs[VALHALL_TEX_SREG_X_COORD] = bi_emit_texc_cube_coord(b, index, &sregs[VALHALL_TEX_SREG_Y_COORD]); } else { assert(components >= 1 && components <= 3); /* Copy XY (for 2D+) or XX (for 1D) */ sregs[VALHALL_TEX_SREG_X_COORD] = index; if (components >= 2) sregs[VALHALL_TEX_SREG_Y_COORD] = bi_extract(b, index, 1); if (components == 3 && !instr->is_array) { sregs[VALHALL_TEX_SREG_Z_COORD] = bi_extract(b, index, 2); } } if (instr->is_array) { sregs[VALHALL_TEX_SREG_ARRAY] = bi_extract(b, index, components - 1); } break; case nir_tex_src_lod: if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) { lod_mode = BI_VA_LOD_MODE_ZERO_LOD; } else if (has_lod_mode) { lod_mode = BI_VA_LOD_MODE_EXPLICIT; assert(sz == 16 || sz == 32); sregs[VALHALL_TEX_SREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16); } break; case nir_tex_src_bias: /* Upper 16-bits interpreted as a clamp, leave zero */ assert(sz == 16 || sz == 32); sregs[VALHALL_TEX_SREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16); lod_mode = BI_VA_LOD_MODE_COMPUTED_BIAS; break; case nir_tex_src_ms_index: case nir_tex_src_offset: /* Handled below */ break; case nir_tex_src_comparator: sregs[VALHALL_TEX_SREG_SHADOW] = index; break; case nir_tex_src_texture_offset: assert(instr->texture_index == 0); texture = index; break; case nir_tex_src_sampler_offset: assert(instr->sampler_index == 0); sampler = index; break; default: unreachable("Unhandled src type in tex emit"); } } /* Generate packed offset + ms index + LOD register. These default to * zero so we only need to encode if these features are actually in use. */ bi_index offsets = bi_emit_valhall_offsets(b, instr); if (!bi_is_equiv(offsets, bi_zero())) { sregs[VALHALL_TEX_SREG_OFFSETMS] = offsets; explicit_offset = true; } /* Allocate staging registers contiguously by compacting the array. */ unsigned sr_count = 0; for (unsigned i = 0; i < ARRAY_SIZE(sregs); ++i) { if (!bi_is_null(sregs[i])) sregs[sr_count++] = sregs[i]; } bi_index idx = sr_count ? bi_temp(b->shader) : bi_null(); if (sr_count) bi_make_vec_to(b, idx, sregs, NULL, sr_count, 32); bi_index image_src = bi_imm_u32(tables); image_src = bi_lshift_or_i32(b, sampler, image_src, bi_imm_u8(0)); image_src = bi_lshift_or_i32(b, texture, image_src, bi_imm_u8(16)); unsigned mask = BI_WRITE_MASK_RGBA; unsigned res_size = nir_dest_bit_size(instr->dest) == 16 ? 2 : 4; enum bi_register_format regfmt = bi_reg_fmt_for_nir(instr->dest_type); enum bi_dimension dim = valhall_tex_dimension(instr->sampler_dim); bi_index dest = bi_temp(b->shader); switch (instr->op) { case nir_texop_tex: case nir_texop_txl: case nir_texop_txb: bi_tex_single_to(b, dest, idx, image_src, bi_zero(), instr->is_array, dim, regfmt, instr->is_shadow, explicit_offset, lod_mode, mask, sr_count); break; case nir_texop_txf: case nir_texop_txf_ms: bi_tex_fetch_to(b, dest, idx, image_src, bi_zero(), instr->is_array, dim, regfmt, explicit_offset, mask, sr_count); break; case nir_texop_tg4: bi_tex_gather_to(b, dest, idx, image_src, bi_zero(), instr->is_array, dim, instr->component, false, regfmt, instr->is_shadow, explicit_offset, mask, sr_count); break; default: unreachable("Unhandled Valhall texture op"); } bi_index w[4] = { bi_null(), bi_null(), bi_null(), bi_null() }; bi_emit_split_i32(b, w, dest, res_size); bi_emit_collect_to(b, bi_dest_index(&instr->dest), w, DIV_ROUND_UP(nir_dest_num_components(instr->dest) * res_size, 4)); } /* Simple textures ops correspond to NIR tex or txl with LOD = 0 on 2D/cube * textures with sufficiently small immediate indices. Anything else * needs a complete texture op. */ static void bi_emit_texs(bi_builder *b, nir_tex_instr *instr) { int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord); assert(coord_idx >= 0); bi_index coords = bi_src_index(&instr->src[coord_idx].src); if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { bi_index face, s, t; bi_emit_cube_coord(b, coords, &face, &s, &t); bi_texs_cube_to(b, nir_dest_bit_size(instr->dest), bi_dest_index(&instr->dest), s, t, face, instr->sampler_index, instr->texture_index); } else { bi_texs_2d_to(b, nir_dest_bit_size(instr->dest), bi_dest_index(&instr->dest), bi_extract(b, coords, 0), bi_extract(b, coords, 1), instr->op != nir_texop_tex, /* zero LOD */ instr->sampler_index, instr->texture_index); } bi_split_dest(b, instr->dest); } static bool bi_is_simple_tex(nir_tex_instr *instr) { if (instr->op != nir_texop_tex && instr->op != nir_texop_txl) return false; if (instr->dest_type != nir_type_float32 && instr->dest_type != nir_type_float16) return false; if (instr->is_shadow || instr->is_array) return false; switch (instr->sampler_dim) { case GLSL_SAMPLER_DIM_2D: case GLSL_SAMPLER_DIM_EXTERNAL: case GLSL_SAMPLER_DIM_RECT: break; case GLSL_SAMPLER_DIM_CUBE: /* LOD can't be specified with TEXS_CUBE */ if (instr->op == nir_texop_txl) return false; break; default: return false; } for (unsigned i = 0; i < instr->num_srcs; ++i) { if (instr->src[i].src_type != nir_tex_src_lod && instr->src[i].src_type != nir_tex_src_coord) return false; } /* Indices need to fit in provided bits */ unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 2 : 3; if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits)) return false; int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod); if (lod_idx < 0) return true; nir_src lod = instr->src[lod_idx].src; return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0; } static void bi_emit_tex(bi_builder *b, nir_tex_instr *instr) { switch (instr->op) { case nir_texop_txs: bi_load_sysval_to(b, bi_dest_index(&instr->dest), panfrost_sysval_for_instr(&instr->instr, NULL), nir_dest_num_components(instr->dest), 0); return; case nir_texop_tex: case nir_texop_txl: case nir_texop_txb: case nir_texop_txf: case nir_texop_txf_ms: case nir_texop_tg4: break; default: unreachable("Invalid texture operation"); } if (b->shader->arch >= 9) bi_emit_tex_valhall(b, instr); else if (bi_is_simple_tex(instr)) bi_emit_texs(b, instr); else bi_emit_texc(b, instr); } static void bi_emit_instr(bi_builder *b, struct nir_instr *instr) { switch (instr->type) { case nir_instr_type_load_const: bi_emit_load_const(b, nir_instr_as_load_const(instr)); break; case nir_instr_type_intrinsic: bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr)); break; case nir_instr_type_alu: bi_emit_alu(b, nir_instr_as_alu(instr)); break; case nir_instr_type_tex: bi_emit_tex(b, nir_instr_as_tex(instr)); break; case nir_instr_type_jump: bi_emit_jump(b, nir_instr_as_jump(instr)); break; default: unreachable("should've been lowered"); } } static bi_block * create_empty_block(bi_context *ctx) { bi_block *blk = rzalloc(ctx, bi_block); util_dynarray_init(&blk->predecessors, blk); return blk; } static bi_block * emit_block(bi_context *ctx, nir_block *block) { if (ctx->after_block) { ctx->current_block = ctx->after_block; ctx->after_block = NULL; } else { ctx->current_block = create_empty_block(ctx); } list_addtail(&ctx->current_block->link, &ctx->blocks); list_inithead(&ctx->current_block->instructions); bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block)); nir_foreach_instr(instr, block) { bi_emit_instr(&_b, instr); ++ctx->instruction_count; } return ctx->current_block; } static void emit_if(bi_context *ctx, nir_if *nif) { bi_block *before_block = ctx->current_block; /* Speculatively emit the branch, but we can't fill it in until later */ bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block)); bi_instr *then_branch = bi_branchz_i16(&_b, bi_half(bi_src_index(&nif->condition), false), bi_zero(), BI_CMPF_EQ); /* Emit the two subblocks. */ bi_block *then_block = emit_cf_list(ctx, &nif->then_list); bi_block *end_then_block = ctx->current_block; /* Emit second block, and check if it's empty */ int count_in = ctx->instruction_count; bi_block *else_block = emit_cf_list(ctx, &nif->else_list); bi_block *end_else_block = ctx->current_block; ctx->after_block = create_empty_block(ctx); /* Now that we have the subblocks emitted, fix up the branches */ assert(then_block); assert(else_block); if (ctx->instruction_count == count_in) { then_branch->branch_target = ctx->after_block; bi_block_add_successor(end_then_block, ctx->after_block); /* fallthrough */ } else { then_branch->branch_target = else_block; /* Emit a jump from the end of the then block to the end of the else */ _b.cursor = bi_after_block(end_then_block); bi_instr *then_exit = bi_jump(&_b, bi_zero()); then_exit->branch_target = ctx->after_block; bi_block_add_successor(end_then_block, then_exit->branch_target); bi_block_add_successor(end_else_block, ctx->after_block); /* fallthrough */ } bi_block_add_successor(before_block, then_branch->branch_target); /* then_branch */ bi_block_add_successor(before_block, then_block); /* fallthrough */ } static void emit_loop(bi_context *ctx, nir_loop *nloop) { /* Remember where we are */ bi_block *start_block = ctx->current_block; bi_block *saved_break = ctx->break_block; bi_block *saved_continue = ctx->continue_block; ctx->continue_block = create_empty_block(ctx); ctx->break_block = create_empty_block(ctx); ctx->after_block = ctx->continue_block; /* Emit the body itself */ emit_cf_list(ctx, &nloop->body); /* Branch back to loop back */ bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block)); bi_instr *I = bi_jump(&_b, bi_zero()); I->branch_target = ctx->continue_block; bi_block_add_successor(start_block, ctx->continue_block); bi_block_add_successor(ctx->current_block, ctx->continue_block); ctx->after_block = ctx->break_block; /* Pop off */ ctx->break_block = saved_break; ctx->continue_block = saved_continue; ++ctx->loop_count; } static bi_block * emit_cf_list(bi_context *ctx, struct exec_list *list) { bi_block *start_block = NULL; foreach_list_typed(nir_cf_node, node, node, list) { switch (node->type) { case nir_cf_node_block: { bi_block *block = emit_block(ctx, nir_cf_node_as_block(node)); if (!start_block) start_block = block; break; } case nir_cf_node_if: emit_if(ctx, nir_cf_node_as_if(node)); break; case nir_cf_node_loop: emit_loop(ctx, nir_cf_node_as_loop(node)); break; default: unreachable("Unknown control flow"); } } return start_block; } /* shader-db stuff */ struct bi_stats { unsigned nr_clauses, nr_tuples, nr_ins; unsigned nr_arith, nr_texture, nr_varying, nr_ldst; }; static void bi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats) { /* Count instructions */ stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 1 : 0); /* Non-message passing tuples are always arithmetic */ if (tuple->add != clause->message) { stats->nr_arith++; return; } /* Message + FMA we'll count as arithmetic _and_ message */ if (tuple->fma) stats->nr_arith++; switch (clause->message_type) { case BIFROST_MESSAGE_VARYING: /* Check components interpolated */ stats->nr_varying += (clause->message->vecsize + 1) * (bi_is_regfmt_16(clause->message->register_format) ? 1 : 2); break; case BIFROST_MESSAGE_VARTEX: /* 2 coordinates, fp32 each */ stats->nr_varying += (2 * 2); FALLTHROUGH; case BIFROST_MESSAGE_TEX: stats->nr_texture++; break; case BIFROST_MESSAGE_ATTRIBUTE: case BIFROST_MESSAGE_LOAD: case BIFROST_MESSAGE_STORE: case BIFROST_MESSAGE_ATOMIC: stats->nr_ldst++; break; case BIFROST_MESSAGE_NONE: case BIFROST_MESSAGE_BARRIER: case BIFROST_MESSAGE_BLEND: case BIFROST_MESSAGE_TILE: case BIFROST_MESSAGE_Z_STENCIL: case BIFROST_MESSAGE_ATEST: case BIFROST_MESSAGE_JOB: case BIFROST_MESSAGE_64BIT: /* Nothing to do */ break; }; } /* * v7 allows preloading LD_VAR or VAR_TEX messages that must complete before the * shader completes. These costs are not accounted for in the general cycle * counts, so this function calculates the effective cost of these messages, as * if they were executed by shader code. */ static unsigned bi_count_preload_cost(bi_context *ctx) { /* Units: 1/16 of a normalized cycle, assuming that we may interpolate * 16 fp16 varying components per cycle or fetch two texels per cycle. */ unsigned cost = 0; for (unsigned i = 0; i < ARRAY_SIZE(ctx->info.bifrost->messages); ++i) { struct bifrost_message_preload msg = ctx->info.bifrost->messages[i]; if (msg.enabled && msg.texture) { /* 2 coordinate, 2 half-words each, plus texture */ cost += 12; } else if (msg.enabled) { cost += (msg.num_components * (msg.fp16 ? 1 : 2)); } } return cost; } static const char * bi_shader_stage_name(bi_context *ctx) { if (ctx->idvs == BI_IDVS_VARYING) return "MESA_SHADER_VARYING"; else if (ctx->idvs == BI_IDVS_POSITION) return "MESA_SHADER_POSITION"; else if (ctx->inputs->is_blend) return "MESA_SHADER_BLEND"; else return gl_shader_stage_name(ctx->stage); } static void bi_print_stats(bi_context *ctx, unsigned size, FILE *fp) { struct bi_stats stats = { 0 }; /* Count instructions, clauses, and tuples. Also attempt to construct * normalized execution engine cycle counts, using the following ratio: * * 24 arith tuples/cycle * 2 texture messages/cycle * 16 x 16-bit varying channels interpolated/cycle * 1 load store message/cycle * * These numbers seem to match Arm Mobile Studio's heuristic. The real * cycle counts are surely more complicated. */ bi_foreach_block(ctx, block) { bi_foreach_clause_in_block(block, clause) { stats.nr_clauses++; stats.nr_tuples += clause->tuple_count; for (unsigned i = 0; i < clause->tuple_count; ++i) bi_count_tuple_stats(clause, &clause->tuples[i], &stats); } } float cycles_arith = ((float) stats.nr_arith) / 24.0; float cycles_texture = ((float) stats.nr_texture) / 2.0; float cycles_varying = ((float) stats.nr_varying) / 16.0; float cycles_ldst = ((float) stats.nr_ldst) / 1.0; float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst); float cycles_bound = MAX2(cycles_arith, cycles_message); /* Thread count and register pressure are traded off only on v7 */ bool full_threads = (ctx->arch == 7 && ctx->info.work_reg_count <= 32); unsigned nr_threads = full_threads ? 2 : 1; /* Dump stats */ char *str = ralloc_asprintf(NULL, "%s - %s shader: " "%u inst, %u tuples, %u clauses, " "%f cycles, %f arith, %f texture, %f vary, %f ldst, " "%u quadwords, %u threads", ctx->nir->info.label ?: "", bi_shader_stage_name(ctx), stats.nr_ins, stats.nr_tuples, stats.nr_clauses, cycles_bound, cycles_arith, cycles_texture, cycles_varying, cycles_ldst, size / 16, nr_threads); if (ctx->arch == 7) { ralloc_asprintf_append(&str, ", %u preloads", bi_count_preload_cost(ctx)); } ralloc_asprintf_append(&str, ", %u loops, %u:%u spills:fills\n", ctx->loop_count, ctx->spills, ctx->fills); fputs(str, stderr); ralloc_free(str); } static void va_print_stats(bi_context *ctx, unsigned size, FILE *fp) { unsigned nr_ins = 0; struct va_stats stats = { 0 }; /* Count instructions */ bi_foreach_instr_global(ctx, I) { nr_ins++; va_count_instr_stats(I, &stats); } /* Mali G78 peak performance: * * 64 FMA instructions per cycle * 64 CVT instructions per cycle * 16 SFU instructions per cycle * 8 x 32-bit varying channels interpolated per cycle * 4 texture instructions per cycle * 1 load/store operation per cycle */ float cycles_fma = ((float) stats.fma) / 64.0; float cycles_cvt = ((float) stats.cvt) / 64.0; float cycles_sfu = ((float) stats.sfu) / 16.0; float cycles_v = ((float) stats.v) / 16.0; float cycles_t = ((float) stats.t) / 4.0; float cycles_ls = ((float) stats.ls) / 1.0; /* Calculate the bound */ float cycles = MAX2( MAX3(cycles_fma, cycles_cvt, cycles_sfu), MAX3(cycles_v, cycles_t, cycles_ls)); /* Thread count and register pressure are traded off */ unsigned nr_threads = (ctx->info.work_reg_count <= 32) ? 2 : 1; /* Dump stats */ fprintf(stderr, "%s - %s shader: " "%u inst, %f cycles, %f fma, %f cvt, %f sfu, %f v, " "%f t, %f ls, %u quadwords, %u threads, %u loops, " "%u:%u spills:fills\n", ctx->nir->info.label ?: "", bi_shader_stage_name(ctx), nr_ins, cycles, cycles_fma, cycles_cvt, cycles_sfu, cycles_v, cycles_t, cycles_ls, size / 16, nr_threads, ctx->loop_count, ctx->spills, ctx->fills); } static int glsl_type_size(const struct glsl_type *type, bool bindless) { return glsl_count_attribute_slots(type, false); } /* Split stores to memory. We don't split stores to vertex outputs, since * nir_lower_io_to_temporaries will ensure there's only a single write. */ static bool should_split_wrmask(const nir_instr *instr, UNUSED const void *data) { nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); switch (intr->intrinsic) { case nir_intrinsic_store_ssbo: case nir_intrinsic_store_shared: case nir_intrinsic_store_global: case nir_intrinsic_store_scratch: return true; default: return false; } } /* Bifrost wants transcendentals as FP32 */ static unsigned bi_lower_bit_size(const nir_instr *instr, UNUSED void *data) { if (instr->type != nir_instr_type_alu) return 0; nir_alu_instr *alu = nir_instr_as_alu(instr); switch (alu->op) { case nir_op_fexp2: case nir_op_flog2: case nir_op_fpow: case nir_op_fsin: case nir_op_fcos: return (nir_dest_bit_size(alu->dest.dest) == 32) ? 0 : 32; default: return 0; } } /* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4, * transcendentals are an exception. Also shifts because of lane size mismatch * (8-bit in Bifrost, 32-bit in NIR TODO - workaround!). Some conversions need * to be scalarized due to type size. */ static uint8_t bi_vectorize_filter(const nir_instr *instr, const void *data) { /* Defaults work for everything else */ if (instr->type != nir_instr_type_alu) return 0; const nir_alu_instr *alu = nir_instr_as_alu(instr); switch (alu->op) { case nir_op_frcp: case nir_op_frsq: case nir_op_ishl: case nir_op_ishr: case nir_op_ushr: case nir_op_f2i16: case nir_op_f2u16: return 1; default: break; } /* Vectorized instructions cannot write more than 32-bit */ int dst_bit_size = nir_dest_bit_size(alu->dest.dest); if (dst_bit_size == 16) return 2; else return 1; } static bool bi_scalarize_filter(const nir_instr *instr, const void *data) { if (instr->type != nir_instr_type_alu) return false; const nir_alu_instr *alu = nir_instr_as_alu(instr); switch (alu->op) { case nir_op_pack_uvec2_to_uint: case nir_op_pack_uvec4_to_uint: return false; default: return true; } } /* XXX: This is a kludge to workaround NIR's lack of divergence metadata. If we * keep divergence info around after we consume it for indirect lowering, * nir_convert_from_ssa will regress code quality since it will avoid * coalescing divergent with non-divergent nodes. */ static bool nir_invalidate_divergence_ssa(nir_ssa_def *ssa, UNUSED void *data) { ssa->divergent = false; return true; } static bool nir_invalidate_divergence(struct nir_builder *b, nir_instr *instr, UNUSED void *data) { return nir_foreach_ssa_def(instr, nir_invalidate_divergence_ssa, NULL); } /* Ensure we write exactly 4 components */ static nir_ssa_def * bifrost_nir_valid_channel(nir_builder *b, nir_ssa_def *in, unsigned channel, unsigned first, unsigned mask) { if (!(mask & BITFIELD_BIT(channel))) channel = first; return nir_channel(b, in, channel); } /* Lower fragment store_output instructions to always write 4 components, * matching the hardware semantic. This may require additional moves. Skipping * these moves is possible in theory, but invokes undefined behaviour in the * compiler. The DDK inserts these moves, so we will as well. */ static bool bifrost_nir_lower_blend_components(struct nir_builder *b, nir_instr *instr, void *data) { if (instr->type != nir_instr_type_intrinsic) return false; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); if (intr->intrinsic != nir_intrinsic_store_output) return false; nir_ssa_def *in = intr->src[0].ssa; unsigned first = nir_intrinsic_component(intr); unsigned mask = nir_intrinsic_write_mask(intr); assert(first == 0 && "shouldn't get nonzero components"); /* Nothing to do */ if (mask == BITFIELD_MASK(4)) return false; b->cursor = nir_before_instr(&intr->instr); /* Replicate the first valid component instead */ nir_ssa_def *replicated = nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask), bifrost_nir_valid_channel(b, in, 1, first, mask), bifrost_nir_valid_channel(b, in, 2, first, mask), bifrost_nir_valid_channel(b, in, 3, first, mask)); /* Rewrite to use our replicated version */ nir_instr_rewrite_src_ssa(instr, &intr->src[0], replicated); nir_intrinsic_set_component(intr, 0); nir_intrinsic_set_write_mask(intr, 0xF); intr->num_components = 4; return true; } static void bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend) { bool progress; unsigned lower_flrp = 16 | 32 | 64; NIR_PASS(progress, nir, nir_lower_regs_to_ssa); nir_lower_tex_options lower_tex_options = { .lower_txs_lod = true, .lower_txp = ~0, .lower_tg4_broadcom_swizzle = true, .lower_txd = true, .lower_invalid_implicit_lod = true, }; NIR_PASS(progress, nir, pan_nir_lower_64bit_intrin); NIR_PASS(progress, nir, pan_lower_helper_invocation); NIR_PASS(progress, nir, nir_lower_int64); nir_lower_idiv_options idiv_options = { .imprecise_32bit_lowering = true, .allow_fp16 = true, }; NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options); NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options); NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL); NIR_PASS(progress, nir, nir_lower_load_const_to_scalar); do { progress = false; NIR_PASS(progress, nir, nir_lower_var_copies); NIR_PASS(progress, nir, nir_lower_vars_to_ssa); NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL); NIR_PASS(progress, nir, nir_copy_prop); NIR_PASS(progress, nir, nir_opt_remove_phis); NIR_PASS(progress, nir, nir_opt_dce); NIR_PASS(progress, nir, nir_opt_dead_cf); NIR_PASS(progress, nir, nir_opt_cse); NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); NIR_PASS(progress, nir, nir_opt_algebraic); NIR_PASS(progress, nir, nir_opt_constant_folding); NIR_PASS(progress, nir, nir_lower_alu); if (lower_flrp != 0) { bool lower_flrp_progress = false; NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, lower_flrp, false /* always_precise */); if (lower_flrp_progress) { NIR_PASS(progress, nir, nir_opt_constant_folding); progress = true; } /* Nothing should rematerialize any flrps, so we only * need to do this lowering once. */ lower_flrp = 0; } NIR_PASS(progress, nir, nir_opt_undef); NIR_PASS(progress, nir, nir_lower_undef_to_zero); NIR_PASS(progress, nir, nir_opt_shrink_vectors); NIR_PASS(progress, nir, nir_opt_loop_unroll); } while (progress); /* TODO: Why is 64-bit getting rematerialized? * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */ NIR_PASS(progress, nir, nir_lower_int64); /* We need to cleanup after each iteration of late algebraic * optimizations, since otherwise NIR can produce weird edge cases * (like fneg of a constant) which we don't handle */ bool late_algebraic = true; while (late_algebraic) { late_algebraic = false; NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late); NIR_PASS(progress, nir, nir_opt_constant_folding); NIR_PASS(progress, nir, nir_copy_prop); NIR_PASS(progress, nir, nir_opt_dce); NIR_PASS(progress, nir, nir_opt_cse); } NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL); NIR_PASS(progress, nir, nir_lower_phis_to_scalar, true); NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL); NIR_PASS(progress, nir, nir_lower_bool_to_bitsize); /* Prepass to simplify instruction selection */ late_algebraic = false; NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late); while (late_algebraic) { late_algebraic = false; NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late); NIR_PASS(progress, nir, nir_opt_constant_folding); NIR_PASS(progress, nir, nir_copy_prop); NIR_PASS(progress, nir, nir_opt_dce); NIR_PASS(progress, nir, nir_opt_cse); } NIR_PASS(progress, nir, nir_lower_load_const_to_scalar); NIR_PASS(progress, nir, nir_opt_dce); if (nir->info.stage == MESA_SHADER_FRAGMENT) { NIR_PASS_V(nir, nir_shader_instructions_pass, bifrost_nir_lower_blend_components, nir_metadata_block_index | nir_metadata_dominance, NULL); } /* Backend scheduler is purely local, so do some global optimizations * to reduce register pressure. */ nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | nir_move_comparisons | nir_move_copies | nir_move_load_ssbo; NIR_PASS_V(nir, nir_opt_sink, move_all); NIR_PASS_V(nir, nir_opt_move, move_all); /* We might lower attribute, varying, and image indirects. Use the * gathered info to skip the extra analysis in the happy path. */ bool any_indirects = nir->info.inputs_read_indirectly || nir->info.outputs_accessed_indirectly || nir->info.patch_inputs_read_indirectly || nir->info.patch_outputs_accessed_indirectly || nir->info.images_used[0]; if (any_indirects) { nir_convert_to_lcssa(nir, true, true); NIR_PASS_V(nir, nir_divergence_analysis); NIR_PASS_V(nir, bi_lower_divergent_indirects, pan_subgroup_size(gpu_id >> 12)); NIR_PASS_V(nir, nir_shader_instructions_pass, nir_invalidate_divergence, nir_metadata_all, NULL); } } /* The cmdstream lowers 8-bit fragment output as 16-bit, so we need to do the * same lowering here to zero-extend correctly */ static bool bifrost_nir_lower_i8_fragout_impl(struct nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data) { if (nir_src_bit_size(intr->src[0]) != 8) return false; nir_alu_type type = nir_alu_type_get_base_type(nir_intrinsic_src_type(intr)); assert(type == nir_type_int || type == nir_type_uint); b->cursor = nir_before_instr(&intr->instr); nir_ssa_def *cast = nir_convert_to_bit_size(b, intr->src[0].ssa, type, 16); nir_intrinsic_set_src_type(intr, type | 16); nir_instr_rewrite_src_ssa(&intr->instr, &intr->src[0], cast); return true; } static bool bifrost_nir_lower_i8_fragin_impl(struct nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data) { if (nir_dest_bit_size(intr->dest) != 8) return false; nir_alu_type type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(intr)); assert(type == nir_type_int || type == nir_type_uint); b->cursor = nir_before_instr(&intr->instr); nir_ssa_def *out = nir_load_output(b, intr->num_components, 16, intr->src[0].ssa, .base = nir_intrinsic_base(intr), .component = nir_intrinsic_component(intr), .dest_type = type | 16, .io_semantics = nir_intrinsic_io_semantics(intr)); nir_ssa_def *cast = nir_convert_to_bit_size(b, out, type, 8); nir_ssa_def_rewrite_uses(&intr->dest.ssa, cast); return true; } static bool bifrost_nir_lower_i8_frag(struct nir_builder *b, nir_instr *instr, UNUSED void *data) { if (instr->type != nir_instr_type_intrinsic) return false; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); if (intr->intrinsic == nir_intrinsic_load_output) return bifrost_nir_lower_i8_fragin_impl(b, intr, data); else if (intr->intrinsic == nir_intrinsic_store_output) return bifrost_nir_lower_i8_fragout_impl(b, intr, data); else return false; } static void bi_opt_post_ra(bi_context *ctx) { bi_foreach_instr_global_safe(ctx, ins) { if (ins->op == BI_OPCODE_MOV_I32 && bi_is_equiv(ins->dest[0], ins->src[0])) bi_remove_instruction(ins); } } /* If the shader packs multiple varyings into the same location with different * location_frac, we'll need to lower to a single varying store that collects * all of the channels together. */ static bool bifrost_nir_lower_store_component(struct nir_builder *b, nir_instr *instr, void *data) { if (instr->type != nir_instr_type_intrinsic) return false; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); if (intr->intrinsic != nir_intrinsic_store_output) return false; struct hash_table_u64 *slots = data; unsigned component = nir_intrinsic_component(intr); nir_src *slot_src = nir_get_io_offset_src(intr); uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr); nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot); unsigned mask = (prev ? nir_intrinsic_write_mask(prev) : 0); nir_ssa_def *value = intr->src[0].ssa; b->cursor = nir_before_instr(&intr->instr); nir_ssa_def *undef = nir_ssa_undef(b, 1, value->bit_size); nir_ssa_def *channels[4] = { undef, undef, undef, undef }; /* Copy old */ u_foreach_bit(i, mask) { assert(prev != NULL); nir_ssa_def *prev_ssa = prev->src[0].ssa; channels[i] = nir_channel(b, prev_ssa, i); } /* Copy new */ unsigned new_mask = nir_intrinsic_write_mask(intr); mask |= (new_mask << component); u_foreach_bit(i, new_mask) { assert(component + i < 4); channels[component + i] = nir_channel(b, value, i); } intr->num_components = util_last_bit(mask); nir_instr_rewrite_src_ssa(instr, &intr->src[0], nir_vec(b, channels, intr->num_components)); nir_intrinsic_set_component(intr, 0); nir_intrinsic_set_write_mask(intr, mask); if (prev) { _mesa_hash_table_u64_remove(slots, slot); nir_instr_remove(&prev->instr); } _mesa_hash_table_u64_insert(slots, slot, intr); return false; } /* Dead code elimination for branches at the end of a block - only one branch * per block is legal semantically, but unreachable jumps can be generated. * Likewise on Bifrost we can generate jumps to the terminal block which need * to be lowered away to a jump to #0x0, which induces successful termination. * That trick doesn't work on Valhall, which needs a NOP inserted in the * terminal block instead. */ static void bi_lower_branch(bi_context *ctx, bi_block *block) { bool cull_terminal = (ctx->arch <= 8); bool branched = false; ASSERTED bool was_jump = false; bi_foreach_instr_in_block_safe(block, ins) { if (!ins->branch_target) continue; if (branched) { assert(was_jump && (ins->op == BI_OPCODE_JUMP)); bi_remove_instruction(ins); continue; } branched = true; was_jump = ins->op == BI_OPCODE_JUMP; if (!bi_is_terminal_block(ins->branch_target)) continue; if (cull_terminal) ins->branch_target = NULL; else if (ins->branch_target) ins->branch_target->needs_nop = true; } } static void bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary, unsigned offset) { unsigned final_clause = bi_pack(ctx, binary); /* If we need to wait for ATEST or BLEND in the first clause, pass the * corresponding bits through to the renderer state descriptor */ bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link); bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL); unsigned first_deps = first_clause ? first_clause->dependencies : 0; ctx->info.bifrost->wait_6 = (first_deps & (1 << 6)); ctx->info.bifrost->wait_7 = (first_deps & (1 << 7)); /* Pad the shader with enough zero bytes to trick the prefetcher, * unless we're compiling an empty shader (in which case we don't pad * so the size remains 0) */ unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause; if (binary->size - offset) { memset(util_dynarray_grow(binary, uint8_t, prefetch_size), 0, prefetch_size); } } /* * Build a bit mask of varyings (by location) that are flatshaded. This * information is needed by lower_mediump_io, as we don't yet support 16-bit * flat varyings. * * Also varyings that are used as texture coordinates should be kept at fp32 so * the texture instruction may be promoted to VAR_TEX. In general this is a good * idea, as fp16 texture coordinates are not supported by the hardware and are * usually inappropriate. (There are both relevant CTS bugs here, even.) * * TODO: If we compacted the varyings with some fixup code in the vertex shader, * we could implement 16-bit flat varyings. Consider if this case matters. * * TODO: The texture coordinate handling could be less heavyhanded. */ static bool bi_gather_texcoords(nir_builder *b, nir_instr *instr, void *data) { uint64_t *mask = data; if (instr->type != nir_instr_type_tex) return false; nir_tex_instr *tex = nir_instr_as_tex(instr); int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord); if (coord_idx < 0) return false; nir_src src = tex->src[coord_idx].src; assert(src.is_ssa); nir_ssa_scalar x = nir_ssa_scalar_resolved(src.ssa, 0); nir_ssa_scalar y = nir_ssa_scalar_resolved(src.ssa, 1); if (x.def != y.def) return false; nir_instr *parent = x.def->parent_instr; if (parent->type != nir_instr_type_intrinsic) return false; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent); if (intr->intrinsic != nir_intrinsic_load_interpolated_input) return false; nir_io_semantics sem = nir_intrinsic_io_semantics(intr); *mask |= BITFIELD64_BIT(sem.location); return false; } static uint64_t bi_fp32_varying_mask(nir_shader *nir) { uint64_t mask = 0; assert(nir->info.stage == MESA_SHADER_FRAGMENT); nir_foreach_shader_in_variable(var, nir) { if (var->data.interpolation == INTERP_MODE_FLAT) mask |= BITFIELD64_BIT(var->data.location); } nir_shader_instructions_pass(nir, bi_gather_texcoords, nir_metadata_all, &mask); return mask; } static void bi_finalize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend) { /* Lower gl_Position pre-optimisation, but after lowering vars to ssa * (so we don't accidentally duplicate the epilogue since mesa/st has * messed with our I/O quite a bit already) */ NIR_PASS_V(nir, nir_lower_vars_to_ssa); if (nir->info.stage == MESA_SHADER_VERTEX) { NIR_PASS_V(nir, nir_lower_viewport_transform); NIR_PASS_V(nir, nir_lower_point_size, 1.0, 0.0); nir_variable *psiz = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_PSIZ); if (psiz != NULL) psiz->data.precision = GLSL_PRECISION_MEDIUM; } /* Get rid of any global vars before we lower to scratch. */ NIR_PASS_V(nir, nir_lower_global_vars_to_local); /* Valhall introduces packed thread local storage, which improves cache * locality of TLS access. However, access to packed TLS cannot * straddle 16-byte boundaries. As such, when packed TLS is in use * (currently unconditional for Valhall), we force vec4 alignment for * scratch access. */ bool packed_tls = (gpu_id >= 0x9000); /* Lower large arrays to scratch and small arrays to bcsel */ NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256, packed_tls ? glsl_get_vec4_size_align_bytes : glsl_get_natural_size_align_bytes); NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0); NIR_PASS_V(nir, nir_split_var_copies); NIR_PASS_V(nir, nir_lower_var_copies); NIR_PASS_V(nir, nir_lower_vars_to_ssa); NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, glsl_type_size, 0); /* nir_lower[_explicit]_io is lazy and emits mul+add chains even for * offsets it could figure out are constant. Do some constant folding * before bifrost_nir_lower_store_component below. */ NIR_PASS_V(nir, nir_opt_constant_folding); if (nir->info.stage == MESA_SHADER_FRAGMENT) { NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_in | nir_var_shader_out, ~bi_fp32_varying_mask(nir), false); } else { if (gpu_id >= 0x9000) { NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out, BITFIELD64_BIT(VARYING_SLOT_PSIZ), false); } struct hash_table_u64 *stores = _mesa_hash_table_u64_create(NULL); NIR_PASS_V(nir, nir_shader_instructions_pass, bifrost_nir_lower_store_component, nir_metadata_block_index | nir_metadata_dominance, stores); _mesa_hash_table_u64_destroy(stores); } NIR_PASS_V(nir, nir_lower_ssbo); NIR_PASS_V(nir, pan_nir_lower_zs_store); NIR_PASS_V(nir, pan_lower_sample_pos); NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL); if (nir->info.stage == MESA_SHADER_FRAGMENT) { NIR_PASS_V(nir, nir_shader_instructions_pass, bifrost_nir_lower_i8_frag, nir_metadata_block_index | nir_metadata_dominance, NULL); } if (nir->xfb_info != NULL && nir->info.has_transform_feedback_varyings) { NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in | nir_var_shader_out); NIR_PASS_V(nir, nir_io_add_intrinsic_xfb_info); NIR_PASS_V(nir, pan_lower_xfb); } bi_optimize_nir(nir, gpu_id, is_blend); } static bi_context * bi_compile_variant_nir(nir_shader *nir, const struct panfrost_compile_inputs *inputs, struct util_dynarray *binary, struct hash_table_u64 *sysval_to_id, struct bi_shader_info info, enum bi_idvs_mode idvs) { bi_context *ctx = rzalloc(NULL, bi_context); /* There may be another program in the dynarray, start at the end */ unsigned offset = binary->size; ctx->sysval_to_id = sysval_to_id; ctx->inputs = inputs; ctx->nir = nir; ctx->stage = nir->info.stage; ctx->quirks = bifrost_get_quirks(inputs->gpu_id); ctx->arch = inputs->gpu_id >> 12; ctx->info = info; ctx->idvs = idvs; ctx->malloc_idvs = (ctx->arch >= 9) && !inputs->no_idvs; if (idvs != BI_IDVS_NONE) { /* Specializing shaders for IDVS is destructive, so we need to * clone. However, the last (second) IDVS shader does not need * to be preserved so we can skip cloning that one. */ if (offset == 0) ctx->nir = nir = nir_shader_clone(ctx, nir); NIR_PASS_V(nir, nir_shader_instructions_pass, bifrost_nir_specialize_idvs, nir_metadata_block_index | nir_metadata_dominance, &idvs); /* After specializing, clean up the mess */ bool progress = true; while (progress) { progress = false; NIR_PASS(progress, nir, nir_opt_dce); NIR_PASS(progress, nir, nir_opt_dead_cf); } } /* We can only go out-of-SSA after speciailizing IDVS, as opt_dead_cf * doesn't know how to deal with nir_register. */ NIR_PASS_V(nir, nir_convert_from_ssa, true); /* If nothing is pushed, all UBOs need to be uploaded */ ctx->ubo_mask = ~0; list_inithead(&ctx->blocks); bool skip_internal = nir->info.internal; skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL); if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) { nir_print_shader(nir, stdout); } ctx->allocated_vec = _mesa_hash_table_u64_create(ctx); nir_foreach_function(func, nir) { if (!func->impl) continue; ctx->ssa_alloc += func->impl->ssa_alloc; ctx->reg_alloc += func->impl->reg_alloc; emit_cf_list(ctx, &func->impl->body); break; /* TODO: Multi-function shaders */ } /* Index blocks now that we're done emitting */ bi_foreach_block(ctx, block) { block->index = ctx->num_blocks++; } bi_validate(ctx, "NIR -> BIR"); /* If the shader doesn't write any colour or depth outputs, it may * still need an ATEST at the very end! */ bool need_dummy_atest = (ctx->stage == MESA_SHADER_FRAGMENT) && !ctx->emitted_atest && !bi_skip_atest(ctx, false); if (need_dummy_atest) { bi_block *end = list_last_entry(&ctx->blocks, bi_block, link); bi_builder b = bi_init_builder(ctx, bi_after_block(end)); bi_emit_atest(&b, bi_zero()); } bool optimize = !(bifrost_debug & BIFROST_DBG_NOOPT); /* Runs before constant folding */ bi_lower_swizzle(ctx); bi_validate(ctx, "Early lowering"); /* Runs before copy prop */ if (optimize && !ctx->inputs->no_ubo_to_push) { bi_opt_push_ubo(ctx); } if (likely(optimize)) { bi_opt_copy_prop(ctx); while (bi_opt_constant_fold(ctx)) bi_opt_copy_prop(ctx); bi_opt_mod_prop_forward(ctx); bi_opt_mod_prop_backward(ctx); /* Push LD_VAR_IMM/VAR_TEX instructions. Must run after * mod_prop_backward to fuse VAR_TEX */ if (ctx->arch == 7 && ctx->stage == MESA_SHADER_FRAGMENT && !(bifrost_debug & BIFROST_DBG_NOPRELOAD)) { bi_opt_dead_code_eliminate(ctx); bi_opt_message_preload(ctx); bi_opt_copy_prop(ctx); } bi_opt_dead_code_eliminate(ctx); bi_opt_cse(ctx); bi_opt_dead_code_eliminate(ctx); if (!ctx->inputs->no_ubo_to_push) bi_opt_reorder_push(ctx); bi_validate(ctx, "Optimization passes"); } bi_foreach_instr_global(ctx, I) { bi_lower_opt_instruction(I); } if (ctx->arch >= 9) { va_optimize(ctx); bi_foreach_instr_global_safe(ctx, I) { va_lower_isel(I); va_lower_constants(ctx, I); bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); va_repair_fau(&b, I); } /* We need to clean up after constant lowering */ if (likely(optimize)) { bi_opt_cse(ctx); bi_opt_dead_code_eliminate(ctx); } bi_validate(ctx, "Valhall passes"); } bi_foreach_block(ctx, block) { bi_lower_branch(ctx, block); } if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) bi_print_shader(ctx, stdout); /* Analyze before register allocation to avoid false dependencies. The * skip bit is a function of only the data flow graph and is invariant * under valid scheduling. Helpers are only defined for fragment * shaders, so this analysis is only required in fragment shaders. */ if (ctx->stage == MESA_SHADER_FRAGMENT) bi_analyze_helper_requirements(ctx); /* Fuse TEXC after analyzing helper requirements so the analysis * doesn't have to know about dual textures */ if (likely(optimize)) { bi_opt_fuse_dual_texture(ctx); } /* Lower FAU after fusing dual texture, because fusing dual texture * creates new immediates that themselves may need lowering. */ if (ctx->arch <= 8) { bi_lower_fau(ctx); } /* Lowering FAU can create redundant moves. Run CSE+DCE to clean up. */ if (likely(optimize)) { bi_opt_cse(ctx); bi_opt_dead_code_eliminate(ctx); } if (likely(!(bifrost_debug & BIFROST_DBG_NOPSCHED))) bi_pressure_schedule(ctx); bi_validate(ctx, "Late lowering"); bi_register_allocate(ctx); if (likely(optimize)) bi_opt_post_ra(ctx); if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) bi_print_shader(ctx, stdout); if (ctx->arch >= 9) { va_assign_slots(ctx); va_insert_flow_control_nops(ctx); va_merge_flow(ctx); va_mark_last(ctx); } else { bi_schedule(ctx); bi_assign_scoreboard(ctx); /* Analyze after scheduling since we depend on instruction * order. Valhall calls as part of va_insert_flow_control_nops, * as the handling for clauses differs from instructions. */ bi_analyze_helper_terminate(ctx); bi_mark_clauses_td(ctx); } if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) bi_print_shader(ctx, stdout); if (ctx->arch <= 8) { bi_pack_clauses(ctx, binary, offset); } else { bi_pack_valhall(ctx, binary); } if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) { if (ctx->arch <= 8) { disassemble_bifrost(stdout, binary->data + offset, binary->size - offset, bifrost_debug & BIFROST_DBG_VERBOSE); } else { disassemble_valhall(stdout, binary->data + offset, binary->size - offset, bifrost_debug & BIFROST_DBG_VERBOSE); } fflush(stdout); } if ((bifrost_debug & BIFROST_DBG_SHADERDB || inputs->shaderdb) && !skip_internal) { if (ctx->arch >= 9) { va_print_stats(ctx, binary->size - offset, stderr); } else { bi_print_stats(ctx, binary->size - offset, stderr); } } return ctx; } static void bi_compile_variant(nir_shader *nir, const struct panfrost_compile_inputs *inputs, struct util_dynarray *binary, struct hash_table_u64 *sysval_to_id, struct pan_shader_info *info, enum bi_idvs_mode idvs) { struct bi_shader_info local_info = { .push = &info->push, .bifrost = &info->bifrost, .tls_size = info->tls_size, .sysvals = &info->sysvals, .push_offset = info->push.count }; unsigned offset = binary->size; /* If there is no position shader (gl_Position is not written), then * there is no need to build a varying shader either. This case is hit * for transform feedback only vertex shaders which only make sense with * rasterizer discard. */ if ((offset == 0) && (idvs == BI_IDVS_VARYING)) return; /* Software invariant: Only a secondary shader can appear at a nonzero * offset, to keep the ABI simple. */ assert((offset == 0) ^ (idvs == BI_IDVS_VARYING)); bi_context *ctx = bi_compile_variant_nir(nir, inputs, binary, sysval_to_id, local_info, idvs); /* A register is preloaded <==> it is live before the first block */ bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link); uint64_t preload = first_block->reg_live_in; /* If multisampling is used with a blend shader, the blend shader needs * to access the sample coverage mask in r60 and the sample ID in r61. * Blend shaders run in the same context as fragment shaders, so if a * blend shader could run, we need to preload these registers * conservatively. There is believed to be little cost to doing so, so * do so always to avoid variants of the preload descriptor. * * We only do this on Valhall, as Bifrost has to update the RSD for * multisampling w/ blend shader anyway, so this is handled in the * driver. We could unify the paths if the cost is acceptable. */ if (nir->info.stage == MESA_SHADER_FRAGMENT && ctx->arch >= 9) preload |= BITFIELD64_BIT(60) | BITFIELD64_BIT(61); info->ubo_mask |= ctx->ubo_mask; info->tls_size = MAX2(info->tls_size, ctx->info.tls_size); if (idvs == BI_IDVS_VARYING) { info->vs.secondary_enable = (binary->size > offset); info->vs.secondary_offset = offset; info->vs.secondary_preload = preload; info->vs.secondary_work_reg_count = ctx->info.work_reg_count; } else { info->preload = preload; info->work_reg_count = ctx->info.work_reg_count; } if (idvs == BI_IDVS_POSITION && !nir->info.internal && nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ)) { /* Find the psiz write */ bi_instr *write = NULL; bi_foreach_instr_global(ctx, I) { if (I->op == BI_OPCODE_STORE_I16 && I->seg == BI_SEG_POS) { write = I; break; } } assert(write != NULL); /* NOP it out, preserving its flow control. TODO: maybe DCE */ if (write->flow) { bi_builder b = bi_init_builder(ctx, bi_before_instr(write)); bi_instr *nop = bi_nop(&b); nop->flow = write->flow; } bi_remove_instruction(write); info->vs.no_psiz_offset = binary->size; bi_pack_valhall(ctx, binary); } ralloc_free(ctx); } /* Decide if Index-Driven Vertex Shading should be used for a given shader */ static bool bi_should_idvs(nir_shader *nir, const struct panfrost_compile_inputs *inputs) { /* Opt-out */ if (inputs->no_idvs || bifrost_debug & BIFROST_DBG_NOIDVS) return false; /* IDVS splits up vertex shaders, not defined on other shader stages */ if (nir->info.stage != MESA_SHADER_VERTEX) return false; /* Bifrost cannot write gl_PointSize during IDVS */ if ((inputs->gpu_id < 0x9000) && nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ)) return false; /* Otherwise, IDVS is usually better */ return true; } void bifrost_compile_shader_nir(nir_shader *nir, const struct panfrost_compile_inputs *inputs, struct util_dynarray *binary, struct pan_shader_info *info) { bifrost_debug = debug_get_option_bifrost_debug(); bi_finalize_nir(nir, inputs->gpu_id, inputs->is_blend); struct hash_table_u64 *sysval_to_id = panfrost_init_sysvals(&info->sysvals, inputs->fixed_sysval_layout, NULL); info->tls_size = nir->scratch_size; info->vs.idvs = bi_should_idvs(nir, inputs); if (info->vs.idvs) { bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_POSITION); bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_VARYING); } else { bi_compile_variant(nir, inputs, binary, sysval_to_id, info, BI_IDVS_NONE); } if (gl_shader_stage_is_compute(nir->info.stage)) { /* Workgroups may be merged if the structure of the workgroup is * not software visible. This is true if neither shared memory * nor barriers are used. The hardware may be able to optimize * compute shaders that set this flag. */ info->cs.allow_merging_workgroups = (nir->info.shared_size == 0) && !nir->info.uses_control_barrier && !nir->info.uses_memory_barrier; } info->ubo_mask &= (1 << nir->info.num_ubos) - 1; _mesa_hash_table_u64_destroy(sysval_to_id); }