/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "gallium/auxiliary/nir/pipe_nir.h"
#define AC_SURFACE_INCLUDE_NIR
#include "ac_surface.h"
#include "si_pipe.h"
#include "si_query.h"

#include "nir_format_convert.h"

static void *create_shader_state(struct si_context *sctx, nir_shader *nir)
{
   sctx->b.screen->finalize_nir(sctx->b.screen, (void*)nir);
   return pipe_shader_from_nir(&sctx->b, nir);
}

static nir_def *get_global_ids(nir_builder *b, unsigned num_components)
{
   unsigned mask = BITFIELD_MASK(num_components);

   nir_def *local_ids = nir_channels(b, nir_load_local_invocation_id(b), mask);
   nir_def *block_ids = nir_channels(b, nir_load_workgroup_id(b), mask);
   nir_def *block_size = nir_channels(b, nir_load_workgroup_size(b), mask);
   return nir_iadd(b, nir_imul(b, block_ids, block_size), local_ids);
}

/* unpack_2x16(src, x, y): x = src & 0xffff; y = src >> 16; */
static void unpack_2x16(nir_builder *b, nir_def *src, nir_def **x, nir_def **y)
{
   *x = nir_iand_imm(b, src, 0xffff);
   *y = nir_ushr_imm(b, src, 16);
}
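
/* Example: unpack_2x16(0x00200010) gives x = 0x10, y = 0x20. Callers pack two
 * 16-bit values per user-data dword with the inverse, (y << 16) | x. */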

/* unpack_2x16_signed(src, x, y): x = (int32_t)(int16_t)src; y = (int32_t)src >> 16; */
static void unpack_2x16_signed(nir_builder *b, nir_def *src, nir_def **x, nir_def **y)
{
   *x = nir_i2i32(b, nir_u2u16(b, src));
   *y = nir_ishr_imm(b, src, 16);
}
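
/* Example: unpack_2x16_signed(0xfffe0003) gives x = 3, y = -2. The blit shader
 * uses this for box coordinates, which can be negative for flipped blits. */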

static nir_def *
deref_ssa(nir_builder *b, nir_variable *var)
{
   return &nir_build_deref_var(b, var)->def;
}

/* Create a NIR compute shader implementing copy_image.
 *
 * This shader can handle 1D and 2D, linear and non-linear images.
 * It expects the source and destination (x,y,z) coords as user_data_amd,
 * packed into 3 SGPRs as 2x16bits per component.
 */
void *si_create_copy_image_cs(struct si_context *sctx, unsigned wg_dim,
                              bool src_is_1d_array, bool dst_is_1d_array)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "copy_image_cs");
   b.shader->info.num_images = 2;

   /* The workgroup size is either 8x8 for normal (non-linear) 2D images,
    * or 64x1 for 1D and linear-2D images.
    */
   b.shader->info.workgroup_size_variable = true;

   b.shader->info.cs.user_data_components_amd = 3;
   nir_def *ids = nir_pad_vector_imm_int(&b, get_global_ids(&b, wg_dim), 0, 3);

   nir_def *coord_src = NULL, *coord_dst = NULL;
   unpack_2x16(&b, nir_trim_vector(&b, nir_load_user_data_amd(&b), 3),
               &coord_src, &coord_dst);

   coord_src = nir_iadd(&b, coord_src, ids);
   coord_dst = nir_iadd(&b, coord_dst, ids);

   /* Coordinates must have 4 channels in NIR. */
   coord_src = nir_pad_vector(&b, coord_src, 4);
   coord_dst = nir_pad_vector(&b, coord_dst, 4);

   static unsigned swizzle_xz[] = {0, 2, 0, 0};

   if (src_is_1d_array)
      coord_src = nir_swizzle(&b, coord_src, swizzle_xz, 4);
   if (dst_is_1d_array)
      coord_dst = nir_swizzle(&b, coord_dst, swizzle_xz, 4);

   const struct glsl_type *src_img_type = glsl_image_type(src_is_1d_array ? GLSL_SAMPLER_DIM_1D
                                                                          : GLSL_SAMPLER_DIM_2D,
                                                          /*is_array*/ true, GLSL_TYPE_FLOAT);
   const struct glsl_type *dst_img_type = glsl_image_type(dst_is_1d_array ? GLSL_SAMPLER_DIM_1D
                                                                          : GLSL_SAMPLER_DIM_2D,
                                                          /*is_array*/ true, GLSL_TYPE_FLOAT);

   nir_variable *img_src = nir_variable_create(b.shader, nir_var_image, src_img_type, "img_src");
   img_src->data.binding = 0;

   nir_variable *img_dst = nir_variable_create(b.shader, nir_var_image, dst_img_type, "img_dst");
   img_dst->data.binding = 1;

   nir_def *undef32 = nir_undef(&b, 1, 32);
   nir_def *zero = nir_imm_int(&b, 0);

   nir_def *data = nir_image_deref_load(&b, /*num_components*/ 4, /*bit_size*/ 32,
                                        deref_ssa(&b, img_src), coord_src, undef32, zero);

   nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, undef32, data, zero);

   return create_shader_state(sctx, b.shader);
}

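/* Create a compute shader that retiles DCC. Each thread copies one DCC byte:
 * judging from the addressing below, the source address is computed with the
 * non-displayable DCC equation and the destination address with the
 * displayable one. */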
void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "dcc_retile");
   b.shader->info.workgroup_size[0] = 8;
   b.shader->info.workgroup_size[1] = 8;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.cs.user_data_components_amd = 3;
   b.shader->info.num_ssbos = 1;

   /* Get user data SGPRs. */
   nir_def *user_sgprs = nir_load_user_data_amd(&b);

   /* Relative offset from the displayable DCC to the non-displayable DCC in the same buffer. */
   nir_def *src_dcc_offset = nir_channel(&b, user_sgprs, 0);

   nir_def *src_dcc_pitch, *dst_dcc_pitch, *src_dcc_height, *dst_dcc_height;
   unpack_2x16(&b, nir_channel(&b, user_sgprs, 1), &src_dcc_pitch, &src_dcc_height);
   unpack_2x16(&b, nir_channel(&b, user_sgprs, 2), &dst_dcc_pitch, &dst_dcc_height);

   /* Get the 2D coordinates. */
   nir_def *coord = get_global_ids(&b, 2);
   nir_def *zero = nir_imm_int(&b, 0);

   /* Multiply the coordinates by the DCC block size (they are DCC block coordinates). */
   coord = nir_imul(&b, coord, nir_imm_ivec2(&b, surf->u.gfx9.color.dcc_block_width,
                                             surf->u.gfx9.color.dcc_block_height));

   nir_def *src_offset =
      ac_nir_dcc_addr_from_coord(&b, &sctx->screen->info, surf->bpe, &surf->u.gfx9.color.dcc_equation,
                                 src_dcc_pitch, src_dcc_height, zero, /* DCC slice size */
                                 nir_channel(&b, coord, 0), nir_channel(&b, coord, 1), /* x, y */
                                 zero, zero, zero); /* z, sample, pipe_xor */
   src_offset = nir_iadd(&b, src_offset, src_dcc_offset);
   nir_def *value = nir_load_ssbo(&b, 1, 8, zero, src_offset, .align_mul = 1);

   nir_def *dst_offset =
      ac_nir_dcc_addr_from_coord(&b, &sctx->screen->info, surf->bpe, &surf->u.gfx9.color.display_dcc_equation,
                                 dst_dcc_pitch, dst_dcc_height, zero, /* DCC slice size */
                                 nir_channel(&b, coord, 0), nir_channel(&b, coord, 1), /* x, y */
                                 zero, zero, zero); /* z, sample, pipe_xor */
   nir_store_ssbo(&b, value, zero, dst_offset, .write_mask = 0x1, .align_mul = 1);

   return create_shader_state(sctx, b.shader);
}

void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "clear_dcc_msaa");
   b.shader->info.workgroup_size[0] = 8;
   b.shader->info.workgroup_size[1] = 8;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.cs.user_data_components_amd = 2;
   b.shader->info.num_ssbos = 1;

   /* Get user data SGPRs. */
   nir_def *user_sgprs = nir_load_user_data_amd(&b);
   nir_def *dcc_pitch, *dcc_height, *clear_value, *pipe_xor;
   unpack_2x16(&b, nir_channel(&b, user_sgprs, 0), &dcc_pitch, &dcc_height);
   unpack_2x16(&b, nir_channel(&b, user_sgprs, 1), &clear_value, &pipe_xor);
   clear_value = nir_u2u16(&b, clear_value);

   /* Get the 3D coordinates. */
   nir_def *coord = get_global_ids(&b, 3);
   nir_def *zero = nir_imm_int(&b, 0);

   /* Multiply the coordinates by the DCC block size (they are DCC block coordinates). */
   coord = nir_imul(&b, coord,
                    nir_imm_ivec3(&b, tex->surface.u.gfx9.color.dcc_block_width,
                                  tex->surface.u.gfx9.color.dcc_block_height,
                                  tex->surface.u.gfx9.color.dcc_block_depth));

   nir_def *offset =
      ac_nir_dcc_addr_from_coord(&b, &sctx->screen->info, tex->surface.bpe,
                                 &tex->surface.u.gfx9.color.dcc_equation,
                                 dcc_pitch, dcc_height, zero, /* DCC slice size */
                                 nir_channel(&b, coord, 0), nir_channel(&b, coord, 1), /* x, y */
                                 tex->buffer.b.b.array_size > 1 ? nir_channel(&b, coord, 2) : zero, /* z */
                                 zero, pipe_xor); /* sample, pipe_xor */

   /* The trick here is that DCC elements for an even and the next odd sample are next to each other
    * in memory, so we only need to compute the address for sample 0 and the next DCC byte is always
    * sample 1. That's why the clear value has 2 bytes - we're clearing 2 samples at the same time.
    */
   nir_store_ssbo(&b, clear_value, zero, offset, .write_mask = 0x1, .align_mul = 2);

   return create_shader_state(sctx, b.shader);
}

/* Create a compute shader that clears a buffer with a read-modify-write
 * (masked) store. */
void *si_create_clear_buffer_rmw_cs(struct si_context *sctx)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "clear_buffer_rmw_cs");
   b.shader->info.workgroup_size[0] = 64;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.cs.user_data_components_amd = 2;
   b.shader->info.num_ssbos = 1;

   /* address = blockID * 64 + threadID; */
   nir_def *address = get_global_ids(&b, 1);

   /* address = address * 16; (byte offset, loading one vec4 per thread) */
   address = nir_ishl_imm(&b, address, 4);

   nir_def *zero = nir_imm_int(&b, 0);
   nir_def *data = nir_load_ssbo(&b, 4, 32, zero, address, .align_mul = 4);

   /* Get user data SGPRs. */
   nir_def *user_sgprs = nir_load_user_data_amd(&b);

   /* data &= inverted_writemask; */
   data = nir_iand(&b, data, nir_channel(&b, user_sgprs, 1));
   /* data |= clear_value_masked; */
   data = nir_ior(&b, data, nir_channel(&b, user_sgprs, 0));
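
   /* For example, with clear_value_masked = 0x0000ff00 and inverted_writemask =
    * 0xffff00ff, each dword becomes (old & 0xffff00ff) | 0x0000ff00, i.e. only
    * byte 1 of every dword is overwritten. */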

   nir_store_ssbo(&b, data, zero, address,
                  .access = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ACCESS_NON_TEMPORAL : 0,
                  .align_mul = 4);

   return create_shader_state(sctx, b.shader);
}

/* This is used when TCS is NULL in the VS->TCS->TES chain. In this case,
 * VS passes its outputs to TES directly, so the fixed-function shader only
 * has to write TESSOUTER and TESSINNER.
 */
void *si_create_passthrough_tcs(struct si_context *sctx)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR,
                                           PIPE_SHADER_TESS_CTRL);

   unsigned locations[PIPE_MAX_SHADER_OUTPUTS];

   struct si_shader_info *info = &sctx->shader.vs.cso->info;
   for (unsigned i = 0; i < info->num_outputs; i++) {
      locations[i] = info->output_semantic[i];
   }

   nir_shader *tcs =
      nir_create_passthrough_tcs_impl(options, locations, info->num_outputs,
                                      sctx->patch_vertices);

   return create_shader_state(sctx, tcs);
}

static nir_def *convert_linear_to_srgb(nir_builder *b, nir_def *input)
{
   /* There are small precision differences compared to CB, so the gfx blit will return slightly
    * different results.
    */

   nir_def *comp[4];
   for (unsigned i = 0; i < 3; i++)
      comp[i] = nir_format_linear_to_srgb(b, nir_channel(b, input, i));
   comp[3] = nir_channel(b, input, 3);

   return nir_vec(b, comp, 4);
}

static nir_def *average_samples(nir_builder *b, nir_def **samples, unsigned num_samples)
{
   /* This works like add-reduce by computing the sum of each pair independently, and then
    * computing the sum of each pair of sums, and so on, to get better instruction-level
    * parallelism.
    */
   if (num_samples == 16) {
      for (unsigned i = 0; i < 8; i++)
         samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]);
   }
   if (num_samples >= 8) {
      for (unsigned i = 0; i < 4; i++)
         samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]);
   }
   if (num_samples >= 4) {
      for (unsigned i = 0; i < 2; i++)
         samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]);
   }
   if (num_samples >= 2)
      samples[0] = nir_fadd(b, samples[0], samples[1]);

   return nir_fmul_imm(b, samples[0], 1.0 / num_samples); /* average the sum */
}
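
/* For 8 samples the reduction tree built above is:
 *    pass 1: s01 = s0+s1, s23 = s2+s3, s45 = s4+s5, s67 = s6+s7
 *    pass 2: s0123 = s01+s23, s4567 = s45+s67
 *    pass 3: sum = s0123+s4567, followed by one multiply by 1/8.
 */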

static nir_def *image_resolve_msaa(struct si_screen *sscreen, nir_builder *b, nir_variable *img,
                                   unsigned num_samples, nir_def *coord)
{
   nir_def *zero = nir_imm_int(b, 0);
   nir_def *result = NULL;
   nir_variable *var = NULL;

   /* Gfx11 doesn't support samples_identical, so we can't use it. */
   if (sscreen->info.gfx_level < GFX11) {
      /* We need a local variable to get the result out of conditional branches in SSA. */
      var = nir_local_variable_create(b->impl, glsl_vec4_type(), NULL);

      /* If all samples are identical, load only sample 0. */
      nir_push_if(b, nir_image_deref_samples_identical(b, 1, deref_ssa(b, img), coord));
      result = nir_image_deref_load(b, 4, 32, deref_ssa(b, img), coord, zero, zero);
      nir_store_var(b, var, result, 0xf);

      nir_push_else(b, NULL);
   }

   nir_def *sample_index[16];
   for (unsigned i = 0; i < num_samples; i++)
      sample_index[i] = nir_imm_int(b, i);

   /* We need to hide the constant sample indices behind the optimization barrier, otherwise
    * LLVM doesn't put loads into the same clause.
    *
    * TODO: nir_group_loads could do this.
    */
   if (!sscreen->use_aco) {
      for (unsigned i = 0; i < num_samples; i++)
         sample_index[i] = nir_optimization_barrier_vgpr_amd(b, 32, sample_index[i]);
   }

   /* Load all samples. */
   nir_def *samples[16];
   for (unsigned i = 0; i < num_samples; i++) {
      samples[i] = nir_image_deref_load(b, 4, 32, deref_ssa(b, img),
                                        coord, sample_index[i], zero);
   }

   result = average_samples(b, samples, num_samples);

   if (sscreen->info.gfx_level < GFX11) {
      /* Exit the conditional branch and get the result out of the branch. */
      nir_store_var(b, var, result, 0xf);
      nir_pop_if(b, NULL);
      result = nir_load_var(b, var);
   }

   return result;
}

static nir_def *apply_blit_output_modifiers(nir_builder *b, nir_def *color,
                                            const union si_compute_blit_shader_key *options)
{
   if (options->sint_to_uint)
      color = nir_imax(b, color, nir_imm_int(b, 0));

   if (options->uint_to_sint)
      color = nir_umin(b, color, nir_imm_int(b, INT32_MAX));

   if (options->dst_is_srgb)
      color = convert_linear_to_srgb(b, color);

   nir_def *zero = nir_imm_int(b, 0);
   nir_def *one = options->use_integer_one ? nir_imm_int(b, 1) : nir_imm_float(b, 1);

   /* Set channels not present in src to 0 or 1. This will eliminate code loading and resolving
    * those channels.
    */
   for (unsigned chan = options->last_src_channel + 1; chan <= options->last_dst_channel; chan++)
      color = nir_vector_insert_imm(b, color, chan == 3 ? one : zero, chan);

   /* Discard channels not present in dst. The hardware fills unstored channels with 0. */
   if (options->last_dst_channel < 3)
      color = nir_trim_vector(b, color, options->last_dst_channel + 1);

   /* Convert to FP16 with rtz to match the pixel shader. Not necessary, but it helps verify
    * the behavior of the whole shader by comparing it to the gfx blit.
    */
   if (options->fp16_rtz)
      color = nir_f2f16_rtz(b, color);

   return color;
}
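
/* Example of the channel fill-in above: a single-channel source blitted to a
 * four-channel destination has last_src_channel = 0 and last_dst_channel = 3,
 * so the shader stores (r, 0, 0, 1) without loading the missing channels. */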

/* The compute blit shader.
 *
 * Differences compared to u_blitter (the gfx blit):
 * - u_blitter doesn't preserve NaNs, but the compute blit does
 * - u_blitter has lower linear->SRGB precision because the CB block doesn't
 *   use FP32, but the compute blit does.
 *
 * Other than that, non-scaled blits are identical to u_blitter.
 *
 * Implementation details:
 * - Out-of-bounds dst coordinates are not clamped at all. The hw drops
 *   out-of-bounds stores for us.
 * - Out-of-bounds src coordinates are clamped by emulating CLAMP_TO_EDGE using
 *   the image_size NIR intrinsic.
 * - X/Y flipping just does this in the shader: -threadIDs - 1
 * - MSAA copies are implemented but disabled because MSAA image stores don't
 *   work.
 */
void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_shader_key *options)
{
   const nir_shader_compiler_options *nir_options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, nir_options,
                                                  "blit_non_scaled_cs");
   b.shader->info.num_images = 2;
   if (options->src_is_msaa)
      BITSET_SET(b.shader->info.msaa_images, 0);
   if (options->dst_is_msaa)
      BITSET_SET(b.shader->info.msaa_images, 1);
   /* TODO: 1D blits are 8x slower because the workgroup size is 8x8 */
   b.shader->info.workgroup_size[0] = 8;
   b.shader->info.workgroup_size[1] = 8;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.cs.user_data_components_amd = 3;

   const struct glsl_type *img_type[2] = {
      glsl_image_type(options->src_is_1d ? GLSL_SAMPLER_DIM_1D :
                      options->src_is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D,
                      /*is_array*/ true, GLSL_TYPE_FLOAT),
      glsl_image_type(options->dst_is_1d ? GLSL_SAMPLER_DIM_1D :
                      options->dst_is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D,
                      /*is_array*/ true, GLSL_TYPE_FLOAT),
   };

   nir_variable *img_src = nir_variable_create(b.shader, nir_var_uniform, img_type[0], "img0");
   img_src->data.binding = 0;

   nir_variable *img_dst = nir_variable_create(b.shader, nir_var_uniform, img_type[1], "img1");
   img_dst->data.binding = 1;

   nir_def *zero = nir_imm_int(&b, 0);

   /* Instructions. */
   /* Let's work with 0-based src and dst coordinates (thread IDs) first. */
   nir_def *dst_xyz = nir_pad_vector_imm_int(&b, get_global_ids(&b, options->wg_dim), 0, 3);
   nir_def *src_xyz = dst_xyz;

   /* Flip src coordinates. */
   for (unsigned i = 0; i < 2; i++) {
      if (i ? options->flip_y : options->flip_x) {
         /* x goes from 0 to (dim - 1).
          * The flipped blit should load from -dim to -1.
          * Therefore do: x = -x - 1;
          */
         nir_def *comp = nir_channel(&b, src_xyz, i);
         comp = nir_iadd_imm(&b, nir_ineg(&b, comp), -1);
         src_xyz = nir_vector_insert_imm(&b, src_xyz, comp, i);
      }
   }
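
   /* E.g. for an 8-wide flipped blit, x = 0..7 maps to -1..-8; adding the
    * source box origin below shifts the loads into [origin - 8, origin - 1]. */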

   /* Add box.xyz. */
   nir_def *coord_src = NULL, *coord_dst = NULL;
   unpack_2x16_signed(&b, nir_trim_vector(&b, nir_load_user_data_amd(&b), 3),
                      &coord_src, &coord_dst);
   coord_dst = nir_iadd(&b, coord_dst, dst_xyz);
   coord_src = nir_iadd(&b, coord_src, src_xyz);

   /* Clamp to edge for src, only X and Y because Z can't be out of bounds. */
   if (options->xy_clamp_to_edge) {
      unsigned src_clamp_channels = options->src_is_1d ? 0x1 : 0x3;
      nir_def *dim = nir_image_deref_size(&b, 4, 32, deref_ssa(&b, img_src), zero);
      dim = nir_channels(&b, dim, src_clamp_channels);

      nir_def *coord_src_clamped = nir_channels(&b, coord_src, src_clamp_channels);
      coord_src_clamped = nir_imax(&b, coord_src_clamped, nir_imm_int(&b, 0));
      coord_src_clamped = nir_imin(&b, coord_src_clamped, nir_iadd_imm(&b, dim, -1));

      for (unsigned i = 0; i < util_bitcount(src_clamp_channels); i++)
         coord_src = nir_vector_insert_imm(&b, coord_src, nir_channel(&b, coord_src_clamped, i), i);
   }

   /* Swizzle coordinates for 1D_ARRAY. */
   static unsigned swizzle_xz[] = {0, 2, 0, 0};

   if (options->src_is_1d)
      coord_src = nir_swizzle(&b, coord_src, swizzle_xz, 4);
   if (options->dst_is_1d)
      coord_dst = nir_swizzle(&b, coord_dst, swizzle_xz, 4);

   /* Coordinates must have 4 channels in NIR. */
   coord_src = nir_pad_vector(&b, coord_src, 4);
   coord_dst = nir_pad_vector(&b, coord_dst, 4);

   /* TODO: out-of-bounds image stores have no effect, but we could jump over them for better perf */

   /* Execute the image loads and stores. */
   unsigned num_samples = 1 << options->log2_samples;
   nir_def *color;

   if (options->src_is_msaa && !options->dst_is_msaa && !options->sample0_only) {
      /* MSAA resolving (downsampling). */
      assert(num_samples > 1);
      color = image_resolve_msaa(sctx->screen, &b, img_src, num_samples, coord_src);
      color = apply_blit_output_modifiers(&b, color, options);
      nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, zero, color, zero);

   } else if (options->src_is_msaa && options->dst_is_msaa) {
      /* MSAA copy. */
      nir_def *color[16];
      assert(num_samples > 1);
      /* Group loads together and then stores. */
      for (unsigned i = 0; i < num_samples; i++) {
         color[i] = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src,
                                         nir_imm_int(&b, i), zero);
      }
      for (unsigned i = 0; i < num_samples; i++)
         color[i] = apply_blit_output_modifiers(&b, color[i], options);
      for (unsigned i = 0; i < num_samples; i++) {
         nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst,
                               nir_imm_int(&b, i), color[i], zero);
      }
   } else if (!options->src_is_msaa && options->dst_is_msaa) {
      /* MSAA upsampling. */
      assert(num_samples > 1);
      color = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src, zero, zero);
      color = apply_blit_output_modifiers(&b, color, options);
      for (unsigned i = 0; i < num_samples; i++) {
         nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst,
                               nir_imm_int(&b, i), color, zero);
      }
   } else {
      /* Non-MSAA copy or read sample 0 only. */
      /* src2 = sample_index (zero), src3 = lod (zero) */
      assert(num_samples == 1);
      color = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src, zero, zero);
      color = apply_blit_output_modifiers(&b, color, options);
      nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, zero, color, zero);
   }

   return create_shader_state(sctx, b.shader);
}

void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type)
{
   nir_def *address;
   enum glsl_sampler_dim sampler_type;

   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "clear_render_target");
   b.shader->info.num_ubos = 1;
   b.shader->info.num_images = 1;
   b.shader->num_uniforms = 2;

   switch (type) {
   case PIPE_TEXTURE_1D_ARRAY:
      b.shader->info.workgroup_size[0] = 64;
      b.shader->info.workgroup_size[1] = 1;
      b.shader->info.workgroup_size[2] = 1;
      sampler_type = GLSL_SAMPLER_DIM_1D;
      address = get_global_ids(&b, 2);
      break;
   case PIPE_TEXTURE_2D_ARRAY:
      b.shader->info.workgroup_size[0] = 8;
      b.shader->info.workgroup_size[1] = 8;
      b.shader->info.workgroup_size[2] = 1;
      sampler_type = GLSL_SAMPLER_DIM_2D;
      address = get_global_ids(&b, 3);
      break;
   default:
      unreachable("unsupported texture target type");
   }

   const struct glsl_type *img_type = glsl_image_type(sampler_type, true, GLSL_TYPE_FLOAT);
   nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "image");
   output_img->data.image.format = PIPE_FORMAT_R32G32B32A32_FLOAT;

   nir_def *zero = nir_imm_int(&b, 0);
   nir_def *ubo = nir_load_ubo(&b, 4, 32, zero, zero, .range_base = 0, .range = 16);

   /* TODO: No GL CTS tests for 1D arrays, relying on OpenCL CTS for now.
    * As a sanity check, "OpenCL-CTS/test_conformance/images/clFillImage" tests should pass.
    */
   if (type == PIPE_TEXTURE_1D_ARRAY) {
      unsigned swizzle[4] = {0, 2, 0, 0};
      ubo = nir_swizzle(&b, ubo, swizzle, 4);
   }

   address = nir_iadd(&b, address, ubo);
   nir_def *coord = nir_pad_vector(&b, address, 4);

   nir_def *data = nir_load_ubo(&b, 4, 32, zero, nir_imm_int(&b, 16), .range_base = 16, .range = 16);

   nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->def, coord, zero, data, zero,
                         .image_dim = sampler_type, .image_array = true);

   return create_shader_state(sctx, b.shader);
}

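/* Clear a buffer with a 12-byte (3-dword) element size: each thread stores
 * the 3-dword clear value from the user SGPRs at byte offset thread_id * 12. */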
void *si_clear_12bytes_buffer_shader(struct si_context *sctx)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "clear_12bytes_buffer");
   b.shader->info.workgroup_size[0] = 64;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.cs.user_data_components_amd = 3;

   nir_def *offset = nir_imul_imm(&b, get_global_ids(&b, 1), 12);
   nir_def *value = nir_trim_vector(&b, nir_load_user_data_amd(&b), 3);

   nir_store_ssbo(&b, value, nir_imm_int(&b, 0), offset,
                  .access = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ACCESS_NON_TEMPORAL : 0);

   return create_shader_state(sctx, b.shader);
}

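/* Expand a buffer of ubytes into a buffer of ushorts: thread i loads byte i
 * from SSBO 1 and stores it zero-extended at byte offset i * 2 in SSBO 0. */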
void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;

   /* Don't cache loads, because there is no reuse. */
   unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;

   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "ubyte_to_ushort");

   unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);

   b.shader->info.workgroup_size[0] = default_wave_size;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.num_ssbos = 2;

   nir_def *load_address = get_global_ids(&b, 1);
   nir_def *store_address = nir_imul_imm(&b, load_address, 2);

   nir_def *ubyte_value = nir_load_ssbo(&b, 1, 8, nir_imm_int(&b, 1),
                                        load_address, .access = load_qualifier);
   nir_store_ssbo(&b, nir_u2uN(&b, ubyte_value, 16), nir_imm_int(&b, 0),
                  store_address, .access = store_qualifier);

   return create_shader_state(sctx, b.shader);
}

/* Create a compute shader implementing clear_buffer or copy_buffer. */
void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
                                   bool dst_stream_cache_policy, bool is_copy)
{
   assert(util_is_power_of_two_nonzero(num_dwords_per_thread));

   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;
   if (dst_stream_cache_policy)
      store_qualifier |= ACCESS_NON_TEMPORAL;

   /* Don't cache loads, because there is no reuse. */
   unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;

   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "create_dma_compute");

   unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);

   b.shader->info.workgroup_size[0] = default_wave_size;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.num_ssbos = 1;

   unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
   unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));

   for (unsigned i = 0; i < num_mem_ops; i++) {
      if (i * 4 < num_dwords_per_thread)
         inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4);
   }

   /* If there are multiple stores,
    * the first store writes into 0 * wavesize + tid,
    * the 2nd store writes into 1 * wavesize + tid,
    * the 3rd store writes into 2 * wavesize + tid, etc.
    */
   nir_def *store_address = get_global_ids(&b, 1);

   /* Convert from a "store size unit" into bytes. */
   store_address = nir_imul_imm(&b, store_address, 4 * inst_dwords[0]);

   nir_def *load_address = store_address, *value, *values[num_mem_ops];
   value = nir_undef(&b, 1, 32);

   if (is_copy) {
      b.shader->info.num_ssbos++;
   } else {
      b.shader->info.cs.user_data_components_amd = inst_dwords[0];
      value = nir_trim_vector(&b, nir_load_user_data_amd(&b), inst_dwords[0]);
   }

   /* Distance between a load and a store for latency hiding. */
   unsigned load_store_distance = is_copy ? 8 : 0;
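
   /* E.g. with num_mem_ops = 16 and distance 8, the loop below emits loads
    * 0..7 first, then alternates (load 8, store 0), (load 9, store 1), ...,
    * and finishes with stores 8..15, keeping 8 loads in flight. */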

   for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
      int d = i - load_store_distance;

      if (is_copy && i < num_mem_ops) {
         if (i) {
            load_address = nir_iadd(&b, load_address,
                                    nir_imm_int(&b, 4 * inst_dwords[i] * default_wave_size));
         }
         values[i] = nir_load_ssbo(&b, 4, 32, nir_imm_int(&b, 1), load_address,
                                   .access = load_qualifier);
      }

      if (d >= 0) {
         if (d) {
            store_address = nir_iadd(&b, store_address,
                                     nir_imm_int(&b, 4 * inst_dwords[d] * default_wave_size));
         }
         nir_store_ssbo(&b, is_copy ? values[d] : value, nir_imm_int(&b, 0), store_address,
                        .access = store_qualifier);
      }
   }

   return create_shader_state(sctx, b.shader);
}

/* Load samples from the image, and copy them to the same image. This looks like
 * a no-op, but it's not. Loads use FMASK, while stores don't, so samples are
 * reordered to match expanded FMASK.
 *
 * After the shader finishes, FMASK should be cleared to identity.
 */
void *si_create_fmask_expand_cs(struct si_context *sctx, unsigned num_samples, bool is_array)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "create_fmask_expand_cs");
   b.shader->info.workgroup_size[0] = 8;
   b.shader->info.workgroup_size[1] = 8;
   b.shader->info.workgroup_size[2] = 1;

   /* Return an empty compute shader */
   if (num_samples == 0)
      return create_shader_state(sctx, b.shader);

   b.shader->info.num_images = 1;

   const struct glsl_type *img_type = glsl_image_type(GLSL_SAMPLER_DIM_MS, is_array, GLSL_TYPE_FLOAT);
   nir_variable *img = nir_variable_create(b.shader, nir_var_image, img_type, "image");
   img->data.access = ACCESS_RESTRICT;

   nir_def *z = nir_undef(&b, 1, 32);
   if (is_array) {
      z = nir_channel(&b, nir_load_workgroup_id(&b), 2);
   }

   nir_def *zero = nir_imm_int(&b, 0);
   nir_def *address = get_global_ids(&b, 2);

   nir_def *sample[8], *addresses[8];
   assert(num_samples <= ARRAY_SIZE(sample));

   nir_def *img_def = &nir_build_deref_var(&b, img)->def;

   /* Load samples, resolving FMASK. */
   for (unsigned i = 0; i < num_samples; i++) {
      nir_def *it = nir_imm_int(&b, i);
      sample[i] = nir_vec4(&b, nir_channel(&b, address, 0), nir_channel(&b, address, 1), z, it);
      addresses[i] = nir_image_deref_load(&b, 4, 32, img_def, sample[i], it, zero,
                                          .access = ACCESS_RESTRICT,
                                          .image_dim = GLSL_SAMPLER_DIM_2D,
                                          .image_array = is_array);
   }

   /* Store samples, ignoring FMASK. */
   for (unsigned i = 0; i < num_samples; i++) {
      nir_image_deref_store(&b, img_def, sample[i], nir_imm_int(&b, i), addresses[i], zero,
                            .access = ACCESS_RESTRICT,
                            .image_dim = GLSL_SAMPLER_DIM_2D,
                            .image_array = is_array);
   }

   return create_shader_state(sctx, b.shader);
}

/* This is just a pass-through shader with 1-3 MOV instructions. */
void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, unsigned num_layers)
{
   unsigned vs_blit_property;
   void **vs;

   switch (type) {
   case UTIL_BLITTER_ATTRIB_NONE:
      vs = num_layers > 1 ? &sctx->vs_blit_pos_layered : &sctx->vs_blit_pos;
      vs_blit_property = SI_VS_BLIT_SGPRS_POS;
      break;
   case UTIL_BLITTER_ATTRIB_COLOR:
      vs = num_layers > 1 ? &sctx->vs_blit_color_layered : &sctx->vs_blit_color;
      vs_blit_property = SI_VS_BLIT_SGPRS_POS_COLOR;
      break;
   case UTIL_BLITTER_ATTRIB_TEXCOORD_XY:
   case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW:
      assert(num_layers == 1);
      vs = &sctx->vs_blit_texcoord;
      vs_blit_property = SI_VS_BLIT_SGPRS_POS_TEXCOORD;
      break;
   default:
      assert(0);
      return NULL;
   }

   if (*vs)
      return *vs;

   /* Add 1 for the attribute ring address. */
   if (sctx->gfx_level >= GFX11 && type != UTIL_BLITTER_ATTRIB_NONE)
      vs_blit_property++;

   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_VERTEX);

   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, options, "get_blitter_vs");

   /* Tell the shader to load VS inputs from SGPRs: */
   b.shader->info.vs.blit_sgprs_amd = vs_blit_property;
   b.shader->info.vs.window_space_position = true;

   const struct glsl_type *vec4 = glsl_vec4_type();

   nir_copy_var(&b,
                nir_create_variable_with_location(b.shader, nir_var_shader_out,
                                                  VARYING_SLOT_POS, vec4),
                nir_create_variable_with_location(b.shader, nir_var_shader_in,
                                                  VERT_ATTRIB_GENERIC0, vec4));

   if (type != UTIL_BLITTER_ATTRIB_NONE) {
      nir_copy_var(&b,
                   nir_create_variable_with_location(b.shader, nir_var_shader_out,
                                                     VARYING_SLOT_VAR0, vec4),
                   nir_create_variable_with_location(b.shader, nir_var_shader_in,
                                                     VERT_ATTRIB_GENERIC1, vec4));
   }

   if (num_layers > 1) {
      nir_variable *out_layer =
         nir_create_variable_with_location(b.shader, nir_var_shader_out,
                                           VARYING_SLOT_LAYER, glsl_int_type());
      out_layer->data.interpolation = INTERP_MODE_NONE;

      nir_copy_var(&b, out_layer,
                   nir_create_variable_with_location(b.shader, nir_var_system_value,
                                                     SYSTEM_VALUE_INSTANCE_ID, glsl_int_type()));
   }

   *vs = create_shader_state(sctx, b.shader);
   return *vs;
}

/* Create the compute shader that is used to collect the results.
 *
 * One compute grid with a single thread is launched for every query result
 * buffer. The thread (optionally) reads a previous summary buffer, then
 * accumulates data from the query result buffer, and writes the result either
 * to a summary buffer to be consumed by the next grid invocation or to the
 * user-supplied buffer.
 *
 * Data layout:
 *
 * CONST
 *  0.x = end_offset
 *  0.y = result_stride
 *  0.z = result_count
 *  0.w = bit field:
 *     1: read previously accumulated values
 *     2: write accumulated values for chaining
 *     4: write result available
 *     8: convert result to boolean (0/1)
 *     16: only read one dword and use that as result
 *     32: apply timestamp conversion
 *     64: store full 64 bits result
 *     128: store signed 32 bits result
 *     256: SO_OVERFLOW mode: take the difference of two successive half-pairs
 *  1.x = fence_offset
 *  1.y = pair_stride
 *  1.z = pair_count
 *
 */
void *si_create_query_result_cs(struct si_context *sctx)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "create_query_result_cs");
   b.shader->info.workgroup_size[0] = 1;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.num_ubos = 1;
   b.shader->info.num_ssbos = 3;
   b.shader->num_uniforms = 2;

   nir_def *var_undef = nir_undef(&b, 1, 32);
   nir_def *zero = nir_imm_int(&b, 0);
   nir_def *one = nir_imm_int(&b, 1);
   nir_def *two = nir_imm_int(&b, 2);
   nir_def *four = nir_imm_int(&b, 4);
   nir_def *eight = nir_imm_int(&b, 8);
   nir_def *sixteen = nir_imm_int(&b, 16);
   nir_def *thirty_one = nir_imm_int(&b, 31);
   nir_def *sixty_four = nir_imm_int(&b, 64);

   /* uint32_t x, y, z = 0; */
   nir_function_impl *e = nir_shader_get_entrypoint(b.shader);
   nir_variable *x = nir_local_variable_create(e, glsl_uint_type(), "x");
   nir_store_var(&b, x, var_undef, 0x1);
   nir_variable *y = nir_local_variable_create(e, glsl_uint_type(), "y");
   nir_store_var(&b, y, var_undef, 0x1);
   nir_variable *z = nir_local_variable_create(e, glsl_uint_type(), "z");
   nir_store_var(&b, z, zero, 0x1);

   /* uint32_t buff_0[4] = load_ubo(0, 0); */
   nir_def *buff_0 = nir_load_ubo(&b, 4, 32, zero, zero, .range_base = 0, .range = 16);
   /* uint32_t buff_1[4] = load_ubo(1, 16); */
   nir_def *buff_1 = nir_load_ubo(&b, 4, 32, zero, sixteen, .range_base = 16, .range = 16);

   /* uint32_t b0_bitfield = buff_0.w; */
   nir_def *b0_bitfield = nir_channel(&b, buff_0, 3);

   /* Check result availability.
    * if (b0_bitfield & (1u << 4)) {
    *    ...
    */
   nir_def *is_one_dword_result = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixteen));
   nir_if *if_one_dword_result = nir_push_if(&b, is_one_dword_result); {

      /* int32_t value = load_ssbo(0, fence_offset);
       * z = ~(value >> 31);
       */
      nir_def *value = nir_load_ssbo(&b, 1, 32, zero, nir_channel(&b, buff_1, 0));
      nir_def *bitmask = nir_inot(&b, nir_ishr(&b, value, thirty_one));
      nir_store_var(&b, z, bitmask, 0x1);

      /* Load result if available.
       * if (value < 0) {
       *    uint32_t result[2] = load_ssbo(0, 0);
       *    x = result[0];
       *    y = result[1];
       * }
       */
      nir_if *if_negative = nir_push_if(&b, nir_ilt(&b, value, zero)); {
         nir_def *result = nir_load_ssbo(&b, 2, 32, zero, zero);
         nir_store_var(&b, x, nir_channel(&b, result, 0), 0x1);
         nir_store_var(&b, y, nir_channel(&b, result, 1), 0x1);
      }
      nir_pop_if(&b, if_negative);
   } nir_push_else(&b, if_one_dword_result); {

      /* } else {
       *    x = 0; y = 0;
       */
      nir_store_var(&b, x, zero, 0x1);
      nir_store_var(&b, y, zero, 0x1);

      /* Load previously accumulated result if requested.
       * if (b0_bitfield & (1u << 0)) {
       *    uint32_t result[3] = load_ssbo(1, 0);
       *    x = result[0];
       *    y = result[1];
       *    z = result[2];
       * }
       */
      nir_def *is_prev_acc_result = nir_i2b(&b, nir_iand(&b, b0_bitfield, one));
      nir_if *if_prev_acc_result = nir_push_if(&b, is_prev_acc_result); {
         nir_def *result = nir_load_ssbo(&b, 3, 32, one, zero);
         nir_store_var(&b, x, nir_channel(&b, result, 0), 0x1);
         nir_store_var(&b, y, nir_channel(&b, result, 1), 0x1);
         nir_store_var(&b, z, nir_channel(&b, result, 2), 0x1);
      }
      nir_pop_if(&b, if_prev_acc_result);

      /* if (!z) {
       *    uint32_t result_index = 0;
       *    uint32_t pitch = 0;
       *    ...
       */
      nir_def *z_value = nir_load_var(&b, z);
      nir_if *if_not_z = nir_push_if(&b, nir_ieq(&b, z_value, zero)); {
         nir_variable *outer_loop_iter =
            nir_local_variable_create(e, glsl_uint_type(), "outer_loop_iter");
         nir_store_var(&b, outer_loop_iter, zero, 0x1);
         nir_variable *pitch = nir_local_variable_create(e, glsl_uint_type(), "pitch");
         nir_store_var(&b, pitch, zero, 0x1);

         /* Outer loop.
          * while (result_index < result_count) {
          *    ...
          */
         nir_loop *loop_outer = nir_push_loop(&b); {
            nir_def *result_index = nir_load_var(&b, outer_loop_iter);
            nir_def *is_result_index_out_of_bound =
               nir_uge(&b, result_index, nir_channel(&b, buff_0, 2));
            nir_if *if_out_of_bound = nir_push_if(&b, is_result_index_out_of_bound); {
               nir_jump(&b, nir_jump_break);
            }
            nir_pop_if(&b, if_out_of_bound);

            /* Load fence and check result availability.
             * pitch = result_index * result_stride;
             * uint32_t address = fence_offset + pitch;
             * int32_t value = load_ssbo(0, address);
             * z = ~(value >> 31);
             */
            nir_def *pitch_outer_loop = nir_imul(&b, result_index, nir_channel(&b, buff_0, 1));
            nir_store_var(&b, pitch, pitch_outer_loop, 0x1);
            nir_def *address = nir_iadd(&b, pitch_outer_loop, nir_channel(&b, buff_1, 0));
            nir_def *value = nir_load_ssbo(&b, 1, 32, zero, address);
            nir_def *bitmask = nir_inot(&b, nir_ishr(&b, value, thirty_one));
            nir_store_var(&b, z, bitmask, 0x1);

            /* if (z) {
             *    break;
             * }
             */
            nir_if *if_result_available = nir_push_if(&b, nir_i2b(&b, bitmask)); {
               nir_jump(&b, nir_jump_break);
            }
            nir_pop_if(&b, if_result_available);

            /* Inner loop iterator.
             * uint32_t i = 0;
             */
            nir_variable *inner_loop_iter =
               nir_local_variable_create(e, glsl_uint_type(), "inner_loop_iter");
            nir_store_var(&b, inner_loop_iter, zero, 0x1);

            /* Inner loop.
             * do {
             *    ...
             */
            nir_loop *loop_inner = nir_push_loop(&b); {
               nir_def *pitch_inner_loop = nir_load_var(&b, pitch);
               nir_def *i = nir_load_var(&b, inner_loop_iter);

               /* Load start and end.
                * uint64_t first = load_ssbo(0, pitch);
                * uint64_t second = load_ssbo(0, pitch + end_offset);
                * uint64_t start_half_pair = second - first;
                */
               nir_def *first = nir_load_ssbo(&b, 1, 64, zero, pitch_inner_loop);
               nir_def *new_pitch = nir_iadd(&b, pitch_inner_loop, nir_channel(&b, buff_0, 0));
               nir_def *second = nir_load_ssbo(&b, 1, 64, zero, new_pitch);
               nir_def *start_half_pair = nir_isub(&b, second, first);

               /* Load second start/end half-pair and take the difference.
                * if (b0_bitfield & (1u << 8)) {
                *    uint64_t first = load_ssbo(0, pitch + 8);
                *    uint64_t second = load_ssbo(0, pitch + end_offset + 8);
                *    uint64_t end_half_pair = second - first;
                *    uint64_t difference = start_half_pair - end_half_pair;
                * }
                */
               nir_def *difference;
               nir_def *is_so_overflow_mode = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 256));
               nir_if *if_so_overflow_mode = nir_push_if(&b, is_so_overflow_mode); {
                  first = nir_load_ssbo(&b, 1, 64, zero, nir_iadd(&b, pitch_inner_loop, eight));
                  second = nir_load_ssbo(&b, 1, 64, zero, nir_iadd(&b, new_pitch, eight));
                  nir_def *end_half_pair = nir_isub(&b, second, first);
                  difference = nir_isub(&b, start_half_pair, end_half_pair);
               }
               nir_pop_if(&b, if_so_overflow_mode);

               /* uint64_t sum = (x | (uint64_t) y << 32) + difference; */
               nir_def *sum = nir_iadd(&b,
                                       nir_pack_64_2x32_split(&b,
                                                              nir_load_var(&b, x),
                                                              nir_load_var(&b, y)),
                                       nir_if_phi(&b, difference, start_half_pair));
               sum = nir_unpack_64_2x32(&b, sum);

               /* Increment inner loop iterator.
                * i++;
                */
               i = nir_iadd(&b, i, one);
               nir_store_var(&b, inner_loop_iter, i, 0x1);

               /* Update pitch value.
                * pitch = i * pair_stride + pitch_outer_loop;
                */
               nir_def *incremented_pitch = nir_iadd(&b,
                                                     nir_imul(&b, i, nir_channel(&b, buff_1, 1)),
                                                     pitch_outer_loop);
               nir_store_var(&b, pitch, incremented_pitch, 0x1);

               /* Update x and y.
                * x = sum.x;
                * y = sum.y;
                */
               nir_store_var(&b, x, nir_channel(&b, sum, 0), 0x1);
               nir_store_var(&b, y, nir_channel(&b, sum, 1), 0x1);

               /* } while (i < pair_count);
                */
               nir_def *is_pair_count_exceeded = nir_uge(&b, i, nir_channel(&b, buff_1, 2));
               nir_if *if_pair_count_exceeded = nir_push_if(&b, is_pair_count_exceeded); {
                  nir_jump(&b, nir_jump_break);
               }
               nir_pop_if(&b, if_pair_count_exceeded);
            }
            nir_pop_loop(&b, loop_inner);

            /* Increment pair iterator.
             * result_index++;
             */
            nir_store_var(&b, outer_loop_iter, nir_iadd(&b, result_index, one), 0x1);
         }
         nir_pop_loop(&b, loop_outer);
      }
      nir_pop_if(&b, if_not_z);
   }
   nir_pop_if(&b, if_one_dword_result);

   nir_def *x_value = nir_load_var(&b, x);
   nir_def *y_value = nir_load_var(&b, y);
   nir_def *z_value = nir_load_var(&b, z);

   /* Store accumulated data for chaining.
    * if (b0_bitfield & (1u << 1)) {
    *    store_ssbo(<x, y, z>, 2, 0);
    */
   nir_def *is_acc_chaining = nir_i2b(&b, nir_iand(&b, b0_bitfield, two));
   nir_if *if_acc_chaining = nir_push_if(&b, is_acc_chaining); {
      nir_store_ssbo(&b, nir_vec3(&b, x_value, y_value, z_value), two, zero);
   } nir_push_else(&b, if_acc_chaining); {

      /* Store result availability.
       * } else {
       *    if (b0_bitfield & (1u << 2)) {
       *       store_ssbo((~z & 1), 2, 0);
       *       ...
       */
      nir_def *is_result_available = nir_i2b(&b, nir_iand(&b, b0_bitfield, four));
      nir_if *if_result_available = nir_push_if(&b, is_result_available); {
         nir_store_ssbo(&b, nir_iand(&b, nir_inot(&b, z_value), one), two, zero);

         /* Store full 64 bits result.
          * if (b0_bitfield & (1u << 6)) {
          *    store_ssbo(<0, 0>, 2, 0);
          * }
          */
         nir_def *is_result_64_bits = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixty_four));
         nir_if *if_result_64_bits = nir_push_if(&b, is_result_64_bits); {
            nir_store_ssbo(&b, nir_imm_ivec2(&b, 0, 0), two, zero,
                           .write_mask = (1u << 1));
         }
         nir_pop_if(&b, if_result_64_bits);
      } nir_push_else(&b, if_result_available); {

         /* } else {
          *    if (~z) {
          *       ...
          */
         nir_def *is_bitwise_not_z = nir_i2b(&b, nir_inot(&b, z_value));
         nir_if *if_bitwise_not_z = nir_push_if(&b, is_bitwise_not_z); {
            nir_def *ts_x, *ts_y;

            /* Apply timestamp conversion.
             * if (b0_bitfield & (1u << 5)) {
             *    uint64_t xy_million = (x | (uint64_t) y << 32) * (uint64_t) 1000000;
             *    uint64_t ts_converted = xy_million / (uint64_t) clock_crystal_frequency;
             *    x = ts_converted.x;
             *    y = ts_converted.y;
             * }
             */
            nir_def *is_apply_timestamp = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 32));
            nir_if *if_apply_timestamp = nir_push_if(&b, is_apply_timestamp); {
               /* Add the frequency into the shader for timestamp conversion
                * so that the backend can use the full range of optimizations
                * for divide-by-constant.
                */
               nir_def *clock_crystal_frequency =
                  nir_imm_int64(&b, sctx->screen->info.clock_crystal_freq);

               nir_def *xy_million = nir_imul(&b,
                                              nir_pack_64_2x32_split(&b, x_value, y_value),
                                              nir_imm_int64(&b, 1000000));
               nir_def *ts_converted = nir_udiv(&b, xy_million, clock_crystal_frequency);
               ts_converted = nir_unpack_64_2x32(&b, ts_converted);
               ts_x = nir_channel(&b, ts_converted, 0);
               ts_y = nir_channel(&b, ts_converted, 1);
            }
            nir_pop_if(&b, if_apply_timestamp);

            nir_def *nx = nir_if_phi(&b, ts_x, x_value);
            nir_def *ny = nir_if_phi(&b, ts_y, y_value);

            /* x = b0_bitfield & (1u << 3) ? ((x | (uint64_t) y << 32) != 0) : x;
             * y = b0_bitfield & (1u << 3) ? 0 : y;
             */
            nir_def *is_convert_to_bool = nir_i2b(&b, nir_iand(&b, b0_bitfield, eight));
            nir_def *xy = nir_pack_64_2x32_split(&b, nx, ny);
            nir_def *is_xy = nir_b2i32(&b, nir_ine(&b, xy, nir_imm_int64(&b, 0)));
            nx = nir_bcsel(&b, is_convert_to_bool, is_xy, nx);
            ny = nir_bcsel(&b, is_convert_to_bool, zero, ny);

            /* if (b0_bitfield & (1u << 6)) {
             *    store_ssbo(<x, y>, 2, 0);
             * }
             */
            nir_def *is_result_64_bits = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixty_four));
            nir_if *if_result_64_bits = nir_push_if(&b, is_result_64_bits); {
               nir_store_ssbo(&b, nir_vec2(&b, nx, ny), two, zero);
            } nir_push_else(&b, if_result_64_bits); {

               /* Clamping.
                * } else {
                *    x = y ? UINT32_MAX : x;
                *    x = b0_bitfield & (1u << 7) ? min(x, INT_MAX) : x;
                *    store_ssbo(x, 2, 0);
                * }
                */
               nir_def *is_y = nir_ine(&b, ny, zero);
               nx = nir_bcsel(&b, is_y, nir_imm_int(&b, UINT32_MAX), nx);
               nir_def *is_signed_32bit_result = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 128));
               nir_def *min = nir_umin(&b, nx, nir_imm_int(&b, INT_MAX));
               nx = nir_bcsel(&b, is_signed_32bit_result, min, nx);
               nir_store_ssbo(&b, nx, two, zero);
            }
            nir_pop_if(&b, if_result_64_bits);
         }
         nir_pop_if(&b, if_bitwise_not_z);
      }
      nir_pop_if(&b, if_result_available);
   }
   nir_pop_if(&b, if_acc_chaining);

   return create_shader_state(sctx, b.shader);
}

/* Create the compute shader that is used to collect the results of gfx10+
 * shader queries.
 *
 * One compute grid with a single thread is launched for every query result
 * buffer. The thread (optionally) reads a previous summary buffer, then
 * accumulates data from the query result buffer, and writes the result either
 * to a summary buffer to be consumed by the next grid invocation or to the
 * user-supplied buffer.
 *
 * Data layout:
 *
 * CONST
 *  0.x = config;
 *     [0:2] the low 3 bits indicate the mode:
 *        0: sum up counts
 *        1: determine result availability and write it as a boolean
 *        2: SO_OVERFLOW
 *        3: SO_ANY_OVERFLOW
 *     the remaining bits form a bitfield:
 *        8: write result as a 64-bit value
 *  0.y = offset in bytes to counts or stream for SO_OVERFLOW mode
 *  0.z = chain bit field:
 *     1: have previous summary buffer
 *     2: write next summary buffer
 *  0.w = result_count
 */
void *gfx11_create_sh_query_result_cs(struct si_context *sctx)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "gfx11_create_sh_query_result_cs");
   b.shader->info.workgroup_size[0] = 1;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.num_ubos = 1;
   b.shader->info.num_ssbos = 3;
   b.shader->num_uniforms = 1;

   nir_def *zero = nir_imm_int(&b, 0);
   nir_def *one = nir_imm_int(&b, 1);
   nir_def *two = nir_imm_int(&b, 2);
   nir_def *four = nir_imm_int(&b, 4);
   nir_def *minus_one = nir_imm_int(&b, 0xffffffff);

   /* uint32_t acc_result = 0, acc_missing = 0; */
   nir_function_impl *e = nir_shader_get_entrypoint(b.shader);
   nir_variable *acc_result = nir_local_variable_create(e, glsl_uint_type(), "acc_result");
   nir_store_var(&b, acc_result, zero, 0x1);
   nir_variable *acc_missing = nir_local_variable_create(e, glsl_uint_type(), "acc_missing");
   nir_store_var(&b, acc_missing, zero, 0x1);

   /* uint32_t buff_0[4] = load_ubo(0, 0); */
   nir_def *buff_0 = nir_load_ubo(&b, 4, 32, zero, zero, .range_base = 0, .range = 16);

   /* if (chain & 1) {
    *    uint32_t result[2] = load_ssbo(1, 0);
    *    acc_result = result[0];
    *    acc_missing = result[1];
    * }
    */
   nir_def *is_prev_summary_buffer = nir_i2b(&b, nir_iand(&b, nir_channel(&b, buff_0, 2), one));
   nir_if *if_prev_summary_buffer = nir_push_if(&b, is_prev_summary_buffer); {
      nir_def *result = nir_load_ssbo(&b, 2, 32, one, zero);
      nir_store_var(&b, acc_result, nir_channel(&b, result, 0), 0x1);
      nir_store_var(&b, acc_missing, nir_channel(&b, result, 1), 0x1);
   }
   nir_pop_if(&b, if_prev_summary_buffer);

   /* uint32_t mode = config & 0b111;
    * bool is_overflow = mode >= 2;
    */
   nir_def *mode = nir_iand_imm(&b, nir_channel(&b, buff_0, 0), 0b111);
   nir_def *is_overflow = nir_uge(&b, mode, two);

   /* uint32_t result_remaining = (is_overflow && acc_result) ? 0 : result_count; */
   nir_variable *result_remaining = nir_local_variable_create(e, glsl_uint_type(), "result_remaining");
   nir_variable *base_offset = nir_local_variable_create(e, glsl_uint_type(), "base_offset");
   nir_def *state = nir_iand(&b,
                             nir_isub(&b, zero, nir_b2i32(&b, is_overflow)),
                             nir_load_var(&b, acc_result));
   nir_def *value = nir_bcsel(&b, nir_i2b(&b, state), zero, nir_channel(&b, buff_0, 3));
   nir_store_var(&b, result_remaining, value, 0x1);

   /* uint32_t base_offset = 0; */
   nir_store_var(&b, base_offset, zero, 0x1);

   /* Outer loop begin.
    * while (result_remaining) {
    *    ...
    */
   nir_loop *loop_outer = nir_push_loop(&b); {
      nir_def *condition = nir_load_var(&b, result_remaining);
      nir_if *if_not_condition = nir_push_if(&b, nir_ieq(&b, condition, zero)); {
         nir_jump(&b, nir_jump_break);
      }
      nir_pop_if(&b, if_not_condition);

      /* result_remaining--; */
      condition = nir_iadd(&b, condition, minus_one);
      nir_store_var(&b, result_remaining, condition, 0x1);

      /* uint32_t fence = load_ssbo(0, base_offset + sizeof(gfx11_sh_query_buffer_mem.stream)); */
      nir_def *b_offset = nir_load_var(&b, base_offset);
      uint64_t buffer_mem_stream_size = sizeof(((struct gfx11_sh_query_buffer_mem*)0)->stream);
      nir_def *fence = nir_load_ssbo(&b, 1, 32, zero,
                                     nir_iadd_imm(&b, b_offset, buffer_mem_stream_size));

      /* if (!fence) {
       *    acc_missing = ~0u;
       *    break;
       * }
       */
      nir_def *is_zero = nir_ieq(&b, fence, zero);
      nir_def *y_value = nir_isub(&b, zero, nir_b2i32(&b, is_zero));
      nir_store_var(&b, acc_missing, y_value, 0x1);
      nir_if *if_ssbo_zero = nir_push_if(&b, is_zero); {
         nir_jump(&b, nir_jump_break);
      }
      nir_pop_if(&b, if_ssbo_zero);

      /* stream_offset = base_offset + offset; */
      nir_def *s_offset = nir_iadd(&b, b_offset, nir_channel(&b, buff_0, 1));

      /* if (mode == 0) {
       *    acc_result += load_ssbo(0, stream_offset);
       * }
       */
      nir_if *if_sum_up_counts = nir_push_if(&b, nir_ieq(&b, mode, zero)); {
         nir_def *x_value = nir_load_ssbo(&b, 1, 32, zero, s_offset);
         x_value = nir_iadd(&b, nir_load_var(&b, acc_result), x_value);
         nir_store_var(&b, acc_result, x_value, 0x1);
      }
      nir_pop_if(&b, if_sum_up_counts);

      /* if (is_overflow) {
       *    uint32_t count = (config & 1) ? 4 : 1;
       *    ...
       */
      nir_if *if_overflow = nir_push_if(&b, is_overflow); {
         nir_def *is_result_available = nir_i2b(&b, nir_iand(&b, mode, one));
         nir_def *initial_count = nir_bcsel(&b, is_result_available, four, one);

         nir_variable *count =
            nir_local_variable_create(e, glsl_uint_type(), "count");
         nir_store_var(&b, count, initial_count, 0x1);

         nir_variable *stream_offset =
            nir_local_variable_create(e, glsl_uint_type(), "stream_offset");
         nir_store_var(&b, stream_offset, s_offset, 0x1);

         /* Inner loop begin.
          * do {
          *    ...
          */
         nir_loop *loop_inner = nir_push_loop(&b); {
            /* uint32_t buffer[4] = load_ssbo(0, stream_offset + 2 * sizeof(uint64_t)); */
            nir_def *stream_offset_value = nir_load_var(&b, stream_offset);
            nir_def *buffer =
               nir_load_ssbo(&b, 4, 32, zero,
                             nir_iadd_imm(&b, stream_offset_value, 2 * sizeof(uint64_t)));

            /* if (generated != emitted) {
             *    acc_result = 1;
             *    base_offset = 0;
             *    break;
             * }
             */
            nir_def *generated = nir_channel(&b, buffer, 0);
            nir_def *emitted = nir_channel(&b, buffer, 2);
            nir_if *if_not_equal = nir_push_if(&b, nir_ine(&b, generated, emitted)); {
               nir_store_var(&b, acc_result, one, 0x1);
               nir_store_var(&b, base_offset, zero, 0x1);
               nir_jump(&b, nir_jump_break);
            }
            nir_pop_if(&b, if_not_equal);

            /* stream_offset += sizeof(gfx11_sh_query_buffer_mem.stream[0]); */
            uint64_t buffer_mem_stream0_size =
               sizeof(((struct gfx11_sh_query_buffer_mem*)0)->stream[0]);
            stream_offset_value = nir_iadd_imm(&b, stream_offset_value, buffer_mem_stream0_size);
            nir_store_var(&b, stream_offset, stream_offset_value, 0x1);

            /* } while (--count); */
            nir_def *loop_count = nir_load_var(&b, count);
            loop_count = nir_iadd(&b, loop_count, minus_one);
            nir_store_var(&b, count, loop_count, 0x1);

            nir_if *if_zero = nir_push_if(&b, nir_ieq(&b, loop_count, zero)); {
               nir_jump(&b, nir_jump_break);
            }
            nir_pop_if(&b, if_zero);
         }
         nir_pop_loop(&b, loop_inner); /* Inner loop end */
      }
      nir_pop_if(&b, if_overflow);

      /* base_offset += sizeof(gfx11_sh_query_buffer_mem); */
      nir_def *buffer_mem_size = nir_imm_int(&b, sizeof(struct gfx11_sh_query_buffer_mem));
      nir_store_var(&b, base_offset, nir_iadd(&b, nir_load_var(&b, base_offset), buffer_mem_size), 0x1);
   }
   nir_pop_loop(&b, loop_outer); /* Outer loop end */

   nir_def *acc_result_value = nir_load_var(&b, acc_result);
   nir_def *y_value = nir_load_var(&b, acc_missing);

   /* if (chain & 2) {
    *    store_ssbo(<acc_result, acc_missing>, 2, 0);
    *    ...
    */
   nir_def *is_write_summary_buffer = nir_i2b(&b, nir_iand(&b, nir_channel(&b, buff_0, 2), two));
   nir_if *if_write_summary_buffer = nir_push_if(&b, is_write_summary_buffer); {
      nir_store_ssbo(&b, nir_vec2(&b, acc_result_value, y_value), two, zero);
   } nir_push_else(&b, if_write_summary_buffer); {

      /* } else {
       *    if ((config & 7) == 1) {
       *       acc_result = acc_missing ? 0 : 1;
       *       acc_missing = 0;
       *    }
       *    ...
       */
      nir_def *is_result_available = nir_ieq(&b, mode, one);
      nir_def *is_zero = nir_ieq(&b, y_value, zero);
      acc_result_value = nir_bcsel(&b, is_result_available, nir_b2i32(&b, is_zero), acc_result_value);
      nir_def *ny = nir_bcsel(&b, is_result_available, zero, y_value);

      /* if (!acc_missing) {
       *    store_ssbo(acc_result, 2, 0);
       *    if (config & 8) {
       *       store_ssbo(0, 2, 4);
       *    }
       * }
       */
      nir_if *if_zero = nir_push_if(&b, nir_ieq(&b, ny, zero)); {
         nir_store_ssbo(&b, acc_result_value, two, zero);

         nir_def *is_so_any_overflow = nir_i2b(&b, nir_iand_imm(&b, nir_channel(&b, buff_0, 0), 8));
         nir_if *if_so_any_overflow = nir_push_if(&b, is_so_any_overflow); {
            nir_store_ssbo(&b, zero, two, four);
         }
         nir_pop_if(&b, if_so_any_overflow);
      }
      nir_pop_if(&b, if_zero);
   }
   nir_pop_if(&b, if_write_summary_buffer);

   return create_shader_state(sctx, b.shader);
}