1 /*
2 * Copyright 2022 Alyssa Rosenzweig
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include <stdint.h>
7 #include "compiler/glsl_types.h"
8 #include "util/format/u_format.h"
9 #include "util/macros.h"
10 #include "agx_nir_format_helpers.h"
11 #include "agx_tilebuffer.h"
12 #include "nir.h"
13 #include "nir_builder.h"
14 #include "nir_builder_opcodes.h"
15
/* NOTE(review): not referenced anywhere in the visible part of this file;
 * presumably the number of texture state registers -- confirm before use.
 */
#define AGX_NUM_TEXTURE_STATE_REGS 16
/* Sample-mask operand with the low 8 bits set. Passed to the local pixel
 * load, and to the local pixel store when the caller supplies no mask.
 */
#define ALL_SAMPLES 0xFF
18
/* State shared between agx_nir_lower_tilebuffer and its per-instruction
 * callback (tib_impl).
 */
struct ctx {
   /* Tilebuffer layout the shader is lowered against */
   struct agx_tilebuffer_layout *tib;

   /* Optional colour masks, indexed by render target; NULL means no masking */
   uint8_t *colormasks;

   /* Optional out-flag, set to true when the lowering finds the shader needs
    * a translucent pass. The store and spilled-load paths dereference it.
    */
   bool *translucent;

   /* First descriptor index used for spilled render targets */
   unsigned bindless_base;

   /* Set once any store to a spilled render target has been emitted */
   bool any_memory_stores;

   /* Bit i is set when a store_output to render target i was seen */
   uint8_t outputs_written;

   /* Optional sample mask applied to every store; NULL when unused */
   nir_def *write_samples;
};
28
29 static bool
tib_filter(const nir_instr * instr,UNUSED const void * _)30 tib_filter(const nir_instr *instr, UNUSED const void *_)
31 {
32 if (instr->type != nir_instr_type_intrinsic)
33 return false;
34
35 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
36 if (intr->intrinsic != nir_intrinsic_store_output &&
37 intr->intrinsic != nir_intrinsic_load_output)
38 return false;
39
40 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
41 assert(sem.dual_source_blend_index == 0 && "dual source blending lowered");
42 return (sem.location >= FRAG_RESULT_DATA0);
43 }
44
45 static void
store_tilebuffer(nir_builder * b,struct agx_tilebuffer_layout * tib,enum pipe_format format,enum pipe_format logical_format,unsigned rt,nir_def * value,nir_def * samples,unsigned write_mask)46 store_tilebuffer(nir_builder *b, struct agx_tilebuffer_layout *tib,
47 enum pipe_format format, enum pipe_format logical_format,
48 unsigned rt, nir_def *value, nir_def *samples,
49 unsigned write_mask)
50 {
51 /* The hardware cannot extend for a 32-bit format. Extend ourselves. */
52 if (format == PIPE_FORMAT_R32_UINT && value->bit_size == 16) {
53 if (util_format_is_pure_sint(logical_format))
54 value = nir_i2i32(b, value);
55 else if (util_format_is_pure_uint(logical_format))
56 value = nir_u2u32(b, value);
57 else
58 value = nir_f2f32(b, value);
59 }
60
61 /* Pure integer formatss need to be clamped in software, at least in some
62 * cases. We do so on store. Piglit gl-3.0-render-integer checks this, as
63 * does KHR-GL33.packed_pixels.*.
64 */
65 const struct util_format_description *desc =
66 util_format_description(logical_format);
67 unsigned c = util_format_get_first_non_void_channel(logical_format);
68
69 if (desc->channel[c].size <= 16 &&
70 util_format_is_pure_integer(logical_format)) {
71
72 unsigned bits[4] = {
73 desc->channel[0].size,
74 desc->channel[1].size,
75 desc->channel[2].size,
76 desc->channel[3].size,
77 };
78
79 if (util_format_is_pure_sint(logical_format))
80 value = nir_format_clamp_sint(b, value, bits);
81 else
82 value = nir_format_clamp_uint(b, value, bits);
83
84 value = nir_u2u16(b, value);
85 }
86
87 if (!samples)
88 samples = nir_imm_intN_t(b, ALL_SAMPLES, 16);
89
90 uint8_t offset_B = agx_tilebuffer_offset_B(tib, rt);
91 nir_store_local_pixel_agx(b, value, samples, nir_undef(b, 2, 16),
92 .base = offset_B, .write_mask = write_mask,
93 .format = format);
94 }
95
96 static nir_def *
nir_build_fsat_signed(nir_builder * b,nir_def * x)97 nir_build_fsat_signed(nir_builder *b, nir_def *x)
98 {
99 return nir_fclamp(b, x, nir_imm_floatN_t(b, -1.0, x->bit_size),
100 nir_imm_floatN_t(b, +1.0, x->bit_size));
101 }
102
103 static nir_def *
nir_fsat_to_format(nir_builder * b,nir_def * x,enum pipe_format format)104 nir_fsat_to_format(nir_builder *b, nir_def *x, enum pipe_format format)
105 {
106 if (util_format_is_unorm(format))
107 return nir_fsat(b, x);
108 else if (util_format_is_snorm(format))
109 return nir_build_fsat_signed(b, x);
110 else
111 return x;
112 }
113
114 static nir_def *
load_tilebuffer(nir_builder * b,struct agx_tilebuffer_layout * tib,uint8_t load_comps,uint8_t bit_size,unsigned rt,enum pipe_format format,enum pipe_format logical_format)115 load_tilebuffer(nir_builder *b, struct agx_tilebuffer_layout *tib,
116 uint8_t load_comps, uint8_t bit_size, unsigned rt,
117 enum pipe_format format, enum pipe_format logical_format)
118 {
119 unsigned comps = util_format_get_nr_components(logical_format);
120 bool f16 = (format == PIPE_FORMAT_R16_FLOAT);
121
122 /* Don't load with F16 */
123 if (f16)
124 format = PIPE_FORMAT_R16_UINT;
125
126 uint8_t offset_B = agx_tilebuffer_offset_B(tib, rt);
127 nir_def *res = nir_load_local_pixel_agx(
128 b, MIN2(load_comps, comps), f16 ? 16 : bit_size,
129 nir_imm_intN_t(b, ALL_SAMPLES, 16), .base = offset_B, .format = format);
130
131 /* Extend floats */
132 if (f16 && bit_size != 16) {
133 assert(bit_size == 32);
134 res = nir_f2f32(b, res);
135 }
136
137 /* Some formats like RGB565 are float in the tilebuffer but logically
138 * normalized. We need to clamp on load to get proper blending semantics, as
139 * the APIs require clamping here and nir_lower_blend (correctly) assumes
140 * load_output is clamped. The spilled path is unaffected as the clamping
141 * implicitly happens when roundtripping to memory.
142 */
143 if (f16)
144 res = nir_fsat_to_format(b, res, logical_format);
145
146 res = nir_sign_extend_if_sint(b, res, logical_format);
147 return nir_pad_vector(b, res, load_comps);
148 }
149
150 /*
151 * As a simple implementation, we use image load/store instructions to access
152 * spilled render targets. The driver will supply corresponding texture and PBE
153 * descriptors for each render target, accessed bindlessly
154 *
155 * Note that this lower happens after driver bindings are lowered, so the
156 * bindless handle is in the AGX-specific format.
157 */
158 static nir_def *
handle_for_rt(nir_builder * b,unsigned base,unsigned rt,bool pbe)159 handle_for_rt(nir_builder *b, unsigned base, unsigned rt, bool pbe)
160 {
161 unsigned index = base + (2 * rt) + (pbe ? 1 : 0);
162 return nir_load_texture_handle_agx(b, nir_imm_int(b, index));
163 }
164
165 static enum glsl_sampler_dim
dim_for_rt(nir_builder * b,unsigned nr_samples,nir_def ** sample)166 dim_for_rt(nir_builder *b, unsigned nr_samples, nir_def **sample)
167 {
168 if (nr_samples == 1) {
169 *sample = nir_imm_intN_t(b, 0, 16);
170 return GLSL_SAMPLER_DIM_2D;
171 } else {
172 *sample = nir_u2u16(b, nir_load_sample_id(b));
173 b->shader->info.fs.uses_sample_shading = true;
174 return GLSL_SAMPLER_DIM_MS;
175 }
176 }
177
178 static nir_def *
image_coords(nir_builder * b)179 image_coords(nir_builder *b)
180 {
181 nir_def *xy__ = nir_pad_vec4(b, nir_u2u32(b, nir_load_pixel_coord(b)));
182 return nir_vector_insert_imm(b, xy__, nir_load_layer_id(b), 2);
183 }
184
185 static void
store_memory(nir_builder * b,unsigned bindless_base,unsigned nr_samples,enum pipe_format format,unsigned rt,nir_def * value,nir_def * samples)186 store_memory(nir_builder *b, unsigned bindless_base, unsigned nr_samples,
187 enum pipe_format format, unsigned rt, nir_def *value,
188 nir_def *samples)
189 {
190 nir_def *image = handle_for_rt(b, bindless_base, rt, true);
191 nir_def *tex_image = handle_for_rt(b, bindless_base, rt, false);
192 nir_def *zero = nir_imm_intN_t(b, 0, 16);
193 nir_def *lod = zero;
194
195 nir_def *sample;
196 enum glsl_sampler_dim dim = dim_for_rt(b, nr_samples, &sample);
197 nir_def *coords = image_coords(b);
198
199 nir_def *size =
200 nir_bindless_image_size(b, 3, 32, tex_image, nir_imm_int(b, 0),
201 .image_array = true, .image_dim = dim);
202
203 nir_begin_invocation_interlock(b);
204
205 /* XXX: We should not get out-of-bounds image coords. Yet here we are :-/
206 *
207 * Fixes faults in:
208 *
209 * dEQP-VK.pipeline.monolithic.multisample.misc.dynamic_rendering.multi_renderpass.r8g8b8a8_unorm_r16g16b16a16_sfloat_r32g32b32a32_uint_d16_unorm.random_68
210 *
211 * which hits eMRT with multisampled image stores on an odd framebuffer size,
212 * and we get coordinates that go all the way up to align((width,height),
213 * (32,32)) despite setting scissor and such.
214 *
215 * XXX: needs more investigation, macOS seems to not choke on this so what
216 * are we doing wrong?
217 */
218 nir_def *cond = nir_ball(b, nir_ult(b, nir_trim_vector(b, coords, 2),
219 nir_trim_vector(b, size, 2)));
220
221 if (nr_samples > 1) {
222 nir_def *coverage = nir_load_sample_mask(b);
223
224 if (samples != NULL)
225 coverage = nir_iand(b, coverage, nir_u2u32(b, samples));
226
227 nir_def *covered = nir_ubitfield_extract(
228 b, coverage, nir_u2u32(b, sample), nir_imm_int(b, 1));
229
230 cond = nir_iand(b, cond, nir_ine_imm(b, covered, 0));
231 } else if (samples != NULL) {
232 cond = nir_iand(b, cond, nir_ine_imm(b, samples, 0));
233 }
234
235 nir_push_if(b, cond);
236 {
237 nir_bindless_image_store(b, image, coords, sample, value, lod,
238 .image_dim = dim, .image_array = true,
239 .format = format);
240 }
241 nir_pop_if(b, NULL);
242 }
243
244 static nir_def *
load_memory(nir_builder * b,unsigned bindless_base,unsigned nr_samples,uint8_t comps,uint8_t bit_size,unsigned rt,enum pipe_format format)245 load_memory(nir_builder *b, unsigned bindless_base, unsigned nr_samples,
246 uint8_t comps, uint8_t bit_size, unsigned rt,
247 enum pipe_format format)
248 {
249 nir_def *image = handle_for_rt(b, bindless_base, rt, false);
250 nir_def *zero = nir_imm_intN_t(b, 0, 16);
251 nir_def *lod = zero;
252
253 nir_def *sample;
254 enum glsl_sampler_dim dim = dim_for_rt(b, nr_samples, &sample);
255 nir_def *coords = image_coords(b);
256
257 /* Ensure pixels below this one have written out their results */
258 nir_begin_invocation_interlock(b);
259
260 return nir_bindless_image_load(
261 b, comps, bit_size, image, coords, sample, lod, .image_dim = dim,
262 .image_array = true, .format = format, .access = ACCESS_IN_BOUNDS_AGX);
263 }
264
265 static nir_def *
tib_impl(nir_builder * b,nir_instr * instr,void * data)266 tib_impl(nir_builder *b, nir_instr *instr, void *data)
267 {
268 struct ctx *ctx = data;
269 struct agx_tilebuffer_layout *tib = ctx->tib;
270 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
271
272 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
273 unsigned rt = sem.location - FRAG_RESULT_DATA0;
274 assert(rt < ARRAY_SIZE(tib->logical_format));
275
276 enum pipe_format logical_format = tib->logical_format[rt];
277 enum pipe_format format = agx_tilebuffer_physical_format(tib, rt);
278 unsigned comps = util_format_get_nr_components(logical_format);
279
280 if (intr->intrinsic == nir_intrinsic_store_output) {
281 ctx->outputs_written |= BITFIELD_BIT(rt);
282
283 /* Only write components that actually exist */
284 uint16_t write_mask = (uint16_t)BITFIELD_MASK(comps);
285
286 /* Delete stores to nonexistent render targets */
287 if (logical_format == PIPE_FORMAT_NONE)
288 return NIR_LOWER_INSTR_PROGRESS_REPLACE;
289
290 /* Only write colours masked by the blend state */
291 if (ctx->colormasks)
292 write_mask &= ctx->colormasks[rt];
293
294 /* Masked stores require a translucent pass type */
295 if (write_mask != BITFIELD_MASK(comps)) {
296 assert(ctx->translucent != NULL &&
297 "colour masking requires translucency");
298
299 assert(agx_tilebuffer_supports_mask(tib, rt));
300 *(ctx->translucent) = true;
301 }
302
303 if (ctx->write_samples) {
304 assert(ctx->translucent != NULL &&
305 "sample masking requires translucency");
306
307 *(ctx->translucent) = true;
308 }
309
310 /* But we ignore the NIR write mask for that, since it's basically an
311 * optimization hint.
312 */
313 if (agx_tilebuffer_supports_mask(tib, rt))
314 write_mask &= nir_intrinsic_write_mask(intr);
315
316 /* Delete stores that are entirely masked out */
317 if (!write_mask)
318 return NIR_LOWER_INSTR_PROGRESS_REPLACE;
319
320 nir_def *value = intr->src[0].ssa;
321
322 /* Trim to format as required by hardware */
323 value = nir_trim_vector(b, intr->src[0].ssa, comps);
324
325 if (tib->spilled[rt]) {
326 store_memory(b, ctx->bindless_base, tib->nr_samples, logical_format,
327 rt, value, ctx->write_samples);
328 ctx->any_memory_stores = true;
329 } else {
330 store_tilebuffer(b, tib, format, logical_format, rt, value,
331 ctx->write_samples, write_mask);
332 }
333
334 return NIR_LOWER_INSTR_PROGRESS_REPLACE;
335 } else {
336 uint8_t bit_size = intr->def.bit_size;
337
338 /* Loads from non-existent render targets are undefined in NIR but not
339 * possible to encode in the hardware, delete them.
340 */
341 if (logical_format == PIPE_FORMAT_NONE) {
342 return nir_undef(b, intr->num_components, bit_size);
343 } else if (tib->spilled[rt]) {
344 *(ctx->translucent) = true;
345
346 return load_memory(b, ctx->bindless_base, tib->nr_samples,
347 intr->num_components, bit_size, rt, logical_format);
348 } else {
349 return load_tilebuffer(b, tib, intr->num_components, bit_size, rt,
350 format, logical_format);
351 }
352 }
353 }
354
355 bool
agx_nir_lower_tilebuffer(nir_shader * shader,struct agx_tilebuffer_layout * tib,uint8_t * colormasks,unsigned * bindless_base,nir_def * write_samples,bool * translucent)356 agx_nir_lower_tilebuffer(nir_shader *shader, struct agx_tilebuffer_layout *tib,
357 uint8_t *colormasks, unsigned *bindless_base,
358 nir_def *write_samples, bool *translucent)
359 {
360 assert(shader->info.stage == MESA_SHADER_FRAGMENT);
361
362 struct ctx ctx = {
363 .tib = tib,
364 .colormasks = colormasks,
365 .translucent = translucent,
366 .write_samples = write_samples,
367 };
368
369 /* Allocate 1 texture + 1 PBE descriptor for each spilled descriptor */
370 if (agx_tilebuffer_spills(tib)) {
371 assert(bindless_base != NULL && "must be specified if spilling");
372 ctx.bindless_base = *bindless_base;
373 *bindless_base += (AGX_MAX_RENDER_TARGETS * 2);
374 }
375
376 bool progress =
377 nir_shader_lower_instructions(shader, tib_filter, tib_impl, &ctx);
378
379 /* Flush at end */
380 if (ctx.any_memory_stores) {
381 nir_function_impl *impl = nir_shader_get_entrypoint(shader);
382 nir_builder b = nir_builder_at(nir_after_impl(impl));
383 nir_fence_pbe_to_tex_pixel_agx(&b);
384 }
385
386 /* If there are any render targets bound to the framebuffer that aren't
387 * statically written by the fragment shader, that acts as an implicit mask
388 * and requires translucency.
389 *
390 * XXX: Could be optimized.
391 */
392 for (unsigned i = 0; i < ARRAY_SIZE(tib->logical_format); ++i) {
393 bool exists = tib->logical_format[i] != PIPE_FORMAT_NONE;
394 bool written = ctx.outputs_written & BITFIELD_BIT(i);
395
396 if (translucent)
397 *translucent |= (exists && !written);
398 }
399
400 return progress;
401 }
402