/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "gallium/auxiliary/nir/pipe_nir.h"
#define AC_SURFACE_INCLUDE_NIR
#include "ac_surface.h"
#include "si_pipe.h"
#include "si_query.h"

#include "nir_format_convert.h"

static void *create_shader_state(struct si_context *sctx, nir_shader *nir)
{
   sctx->b.screen->finalize_nir(sctx->b.screen, (void*)nir);
   return pipe_shader_from_nir(&sctx->b, nir);
}

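/* Return the global thread IDs: workgroup_id * workgroup_size + local_invocation_id,
 * one component per requested dimension.
 */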
static nir_def *get_global_ids(nir_builder *b, unsigned num_components)
{
   unsigned mask = BITFIELD_MASK(num_components);

   nir_def *local_ids = nir_channels(b, nir_load_local_invocation_id(b), mask);
   nir_def *block_ids = nir_channels(b, nir_load_workgroup_id(b), mask);
   nir_def *block_size = nir_channels(b, nir_load_workgroup_size(b), mask);
   return nir_iadd(b, nir_imul(b, block_ids, block_size), local_ids);
}

/* unpack_2x16(src, x, y): x = src & 0xffff; y = src >> 16; */
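/* E.g. src = 0x00200010 yields x = 0x10 and y = 0x20. */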
static void unpack_2x16(nir_builder *b, nir_def *src, nir_def **x, nir_def **y)
{
   *x = nir_iand_imm(b, src, 0xffff);
   *y = nir_ushr_imm(b, src, 16);
}

/* unpack_2x16_signed(src, x, y): x = (int32_t)(int16_t)src; y = (int32_t)src >> 16; */
static void unpack_2x16_signed(nir_builder *b, nir_def *src, nir_def **x, nir_def **y)
{
   *x = nir_i2i32(b, nir_u2u16(b, src));
   *y = nir_ishr_imm(b, src, 16);
}

static nir_def *
deref_ssa(nir_builder *b, nir_variable *var)
{
   return &nir_build_deref_var(b, var)->def;
}

/* Create a NIR compute shader implementing copy_image.
 *
 * This shader can handle 1D and 2D, linear and non-linear images.
 * It expects the source and destination (x,y,z) coords as user_data_amd,
 * packed into 3 SGPRs as 2x16bits per component.
 */
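/* Given how unpack_2x16 splits each SGPR, the implied layout is
 * user_data[i] = (dst_coord[i] << 16) | src_coord[i] for i in {x, y, z}.
 */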
void *si_create_copy_image_cs(struct si_context *sctx, unsigned wg_dim,
                              bool src_is_1d_array, bool dst_is_1d_array)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "copy_image_cs");
   b.shader->info.num_images = 2;

   /* The workgroup size is either 8x8 for normal (non-linear) 2D images,
    * or 64x1 for 1D and linear-2D images.
    */
   b.shader->info.workgroup_size_variable = true;

   b.shader->info.cs.user_data_components_amd = 3;
   nir_def *ids = nir_pad_vector_imm_int(&b, get_global_ids(&b, wg_dim), 0, 3);

   nir_def *coord_src = NULL, *coord_dst = NULL;
   unpack_2x16(&b, nir_trim_vector(&b, nir_load_user_data_amd(&b), 3),
               &coord_src, &coord_dst);

   coord_src = nir_iadd(&b, coord_src, ids);
   coord_dst = nir_iadd(&b, coord_dst, ids);

   /* Coordinates must have 4 channels in NIR. */
   coord_src = nir_pad_vector(&b, coord_src, 4);
   coord_dst = nir_pad_vector(&b, coord_dst, 4);

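   /* For 1D arrays, NIR image ops take (x, layer) coordinates, so move the
    * layer from .z into .y.
    */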
   static unsigned swizzle_xz[] = {0, 2, 0, 0};

   if (src_is_1d_array)
      coord_src = nir_swizzle(&b, coord_src, swizzle_xz, 4);
   if (dst_is_1d_array)
      coord_dst = nir_swizzle(&b, coord_dst, swizzle_xz, 4);

   const struct glsl_type *src_img_type = glsl_image_type(src_is_1d_array ? GLSL_SAMPLER_DIM_1D
                                                                          : GLSL_SAMPLER_DIM_2D,
                                                          /*is_array*/ true, GLSL_TYPE_FLOAT);
   const struct glsl_type *dst_img_type = glsl_image_type(dst_is_1d_array ? GLSL_SAMPLER_DIM_1D
                                                                          : GLSL_SAMPLER_DIM_2D,
                                                          /*is_array*/ true, GLSL_TYPE_FLOAT);

   nir_variable *img_src = nir_variable_create(b.shader, nir_var_image, src_img_type, "img_src");
   img_src->data.binding = 0;

   nir_variable *img_dst = nir_variable_create(b.shader, nir_var_image, dst_img_type, "img_dst");
   img_dst->data.binding = 1;

   nir_def *undef32 = nir_undef(&b, 1, 32);
   nir_def *zero = nir_imm_int(&b, 0);

   nir_def *data = nir_image_deref_load(&b, /*num_components*/ 4, /*bit_size*/ 32,
      deref_ssa(&b, img_src), coord_src, undef32, zero);

   nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, undef32, data, zero);

   return create_shader_state(sctx, b.shader);
}

void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "dcc_retile");
   b.shader->info.workgroup_size[0] = 8;
   b.shader->info.workgroup_size[1] = 8;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.cs.user_data_components_amd = 3;
   b.shader->info.num_ssbos = 1;

   /* Get user data SGPRs. */
   nir_def *user_sgprs = nir_load_user_data_amd(&b);

   /* Relative offset from the displayable DCC to the non-displayable DCC in the same buffer. */
   nir_def *src_dcc_offset = nir_channel(&b, user_sgprs, 0);

   nir_def *src_dcc_pitch, *dst_dcc_pitch, *src_dcc_height, *dst_dcc_height;
   unpack_2x16(&b, nir_channel(&b, user_sgprs, 1), &src_dcc_pitch, &src_dcc_height);
   unpack_2x16(&b, nir_channel(&b, user_sgprs, 2), &dst_dcc_pitch, &dst_dcc_height);

   /* Get the 2D coordinates. */
   nir_def *coord = get_global_ids(&b, 2);
   nir_def *zero = nir_imm_int(&b, 0);

   /* Multiply the coordinates by the DCC block size (they are DCC block coordinates). */
   coord = nir_imul(&b, coord, nir_imm_ivec2(&b, surf->u.gfx9.color.dcc_block_width,
                                             surf->u.gfx9.color.dcc_block_height));

   nir_def *src_offset =
      ac_nir_dcc_addr_from_coord(&b, &sctx->screen->info, surf->bpe, &surf->u.gfx9.color.dcc_equation,
                                 src_dcc_pitch, src_dcc_height, zero, /* DCC slice size */
                                 nir_channel(&b, coord, 0), nir_channel(&b, coord, 1), /* x, y */
                                 zero, zero, zero); /* z, sample, pipe_xor */
   src_offset = nir_iadd(&b, src_offset, src_dcc_offset);
   nir_def *value = nir_load_ssbo(&b, 1, 8, zero, src_offset, .align_mul=1);

   nir_def *dst_offset =
      ac_nir_dcc_addr_from_coord(&b, &sctx->screen->info, surf->bpe, &surf->u.gfx9.color.display_dcc_equation,
                                 dst_dcc_pitch, dst_dcc_height, zero, /* DCC slice size */
                                 nir_channel(&b, coord, 0), nir_channel(&b, coord, 1), /* x, y */
                                 zero, zero, zero); /* z, sample, pipe_xor */
   nir_store_ssbo(&b, value, zero, dst_offset, .write_mask=0x1, .align_mul=1);

   return create_shader_state(sctx, b.shader);
}

void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "clear_dcc_msaa");
   b.shader->info.workgroup_size[0] = 8;
   b.shader->info.workgroup_size[1] = 8;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.cs.user_data_components_amd = 2;
   b.shader->info.num_ssbos = 1;

   /* Get user data SGPRs. */
   nir_def *user_sgprs = nir_load_user_data_amd(&b);
   nir_def *dcc_pitch, *dcc_height, *clear_value, *pipe_xor;
   unpack_2x16(&b, nir_channel(&b, user_sgprs, 0), &dcc_pitch, &dcc_height);
   unpack_2x16(&b, nir_channel(&b, user_sgprs, 1), &clear_value, &pipe_xor);
   clear_value = nir_u2u16(&b, clear_value);

   /* Get the 3D coordinates. */
   nir_def *coord = get_global_ids(&b, 3);
   nir_def *zero = nir_imm_int(&b, 0);

   /* Multiply the coordinates by the DCC block size (they are DCC block coordinates). */
   coord = nir_imul(&b, coord,
                    nir_imm_ivec3(&b, tex->surface.u.gfx9.color.dcc_block_width,
                                      tex->surface.u.gfx9.color.dcc_block_height,
                                      tex->surface.u.gfx9.color.dcc_block_depth));

   nir_def *offset =
      ac_nir_dcc_addr_from_coord(&b, &sctx->screen->info, tex->surface.bpe,
                                 &tex->surface.u.gfx9.color.dcc_equation,
                                 dcc_pitch, dcc_height, zero, /* DCC slice size */
                                 nir_channel(&b, coord, 0), nir_channel(&b, coord, 1), /* x, y */
                                 tex->buffer.b.b.array_size > 1 ? nir_channel(&b, coord, 2) : zero, /* z */
                                 zero, pipe_xor); /* sample, pipe_xor */

   /* The trick here is that DCC elements for an even and the next odd sample are next to each other
    * in memory, so we only need to compute the address for sample 0 and the next DCC byte is always
    * sample 1. That's why the clear value has 2 bytes - we're clearing 2 samples at the same time.
    */
   nir_store_ssbo(&b, clear_value, zero, offset, .write_mask=0x1, .align_mul=2);

   return create_shader_state(sctx, b.shader);
}

/* Create a compute shader implementing a masked (read-modify-write) buffer clear. */
void *si_create_clear_buffer_rmw_cs(struct si_context *sctx)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "clear_buffer_rmw_cs");
   b.shader->info.workgroup_size[0] = 64;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.cs.user_data_components_amd = 2;
   b.shader->info.num_ssbos = 1;

   /* address = blockID * 64 + threadID; */
   nir_def *address = get_global_ids(&b, 1);

   /* address = address * 16; (byte offset, loading one vec4 per thread) */
   address = nir_ishl_imm(&b, address, 4);

   nir_def *zero = nir_imm_int(&b, 0);
   nir_def *data = nir_load_ssbo(&b, 4, 32, zero, address, .align_mul = 4);

   /* Get user data SGPRs. */
   nir_def *user_sgprs = nir_load_user_data_amd(&b);

   /* data &= inverted_writemask; */
   data = nir_iand(&b, data, nir_channel(&b, user_sgprs, 1));
   /* data |= clear_value_masked; */
   data = nir_ior(&b, data, nir_channel(&b, user_sgprs, 0));
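   /* E.g. to clear only byte 1 of every dword to 0x55, the caller would pass
    * user_sgprs.x = 0x00005500 (pre-masked clear value) and
    * user_sgprs.y = 0xffff00ff (inverted write mask); untouched bytes keep
    * their loaded value.
    */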

   nir_store_ssbo(&b, data, zero, address,
      .access = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ACCESS_NON_TEMPORAL : 0,
      .align_mul = 4);

   return create_shader_state(sctx, b.shader);
}

/* This is used when TCS is NULL in the VS->TCS->TES chain. In this case,
 * VS passes its outputs to TES directly, so the fixed-function shader only
 * has to write TESSOUTER and TESSINNER.
 */
void *si_create_passthrough_tcs(struct si_context *sctx)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR,
                                           PIPE_SHADER_TESS_CTRL);

   unsigned locations[PIPE_MAX_SHADER_OUTPUTS];

   struct si_shader_info *info = &sctx->shader.vs.cso->info;
   for (unsigned i = 0; i < info->num_outputs; i++) {
      locations[i] = info->output_semantic[i];
   }

   nir_shader *tcs =
         nir_create_passthrough_tcs_impl(options, locations, info->num_outputs,
                                         sctx->patch_vertices);

   return create_shader_state(sctx, tcs);
}

static nir_def *convert_linear_to_srgb(nir_builder *b, nir_def *input)
{
   /* There are small precision differences compared to CB, so the gfx blit will return slightly
    * different results.
    */

   nir_def *comp[4];
   for (unsigned i = 0; i < 3; i++)
      comp[i] = nir_format_linear_to_srgb(b, nir_channel(b, input, i));
   comp[3] = nir_channel(b, input, 3);

   return nir_vec(b, comp, 4);
}

static nir_def *average_samples(nir_builder *b, nir_def **samples, unsigned num_samples)
{
   /* This works like add-reduce by computing the sum of each pair independently, and then
    * computing the sum of each pair of sums, and so on, to get better instruction-level
    * parallelism.
    */
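   /* E.g. for 4 samples: s0+s1 and s2+s3 are computed independently, the two
    * partial sums are added, and the total is multiplied by 1/4.
    */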
   if (num_samples == 16) {
      for (unsigned i = 0; i < 8; i++)
         samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]);
   }
   if (num_samples >= 8) {
      for (unsigned i = 0; i < 4; i++)
         samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]);
   }
   if (num_samples >= 4) {
      for (unsigned i = 0; i < 2; i++)
         samples[i] = nir_fadd(b, samples[i * 2], samples[i * 2 + 1]);
   }
   if (num_samples >= 2)
      samples[0] = nir_fadd(b, samples[0], samples[1]);

   return nir_fmul_imm(b, samples[0], 1.0 / num_samples); /* average the sum */
}

static nir_def *image_resolve_msaa(struct si_screen *sscreen, nir_builder *b, nir_variable *img,
                                   unsigned num_samples, nir_def *coord)
{
   nir_def *zero = nir_imm_int(b, 0);
   nir_def *result = NULL;
   nir_variable *var = NULL;

   /* Gfx11 doesn't support samples_identical, so we can't use it. */
   if (sscreen->info.gfx_level < GFX11) {
      /* We need a local variable to get the result out of conditional branches in SSA. */
      var = nir_local_variable_create(b->impl, glsl_vec4_type(), NULL);

      /* If all samples are identical, load only sample 0. */
      nir_push_if(b, nir_image_deref_samples_identical(b, 1, deref_ssa(b, img), coord));
      result = nir_image_deref_load(b, 4, 32, deref_ssa(b, img), coord, zero, zero);
      nir_store_var(b, var, result, 0xf);

      nir_push_else(b, NULL);
   }

   nir_def *sample_index[16];
   for (unsigned i = 0; i < num_samples; i++)
      sample_index[i] = nir_imm_int(b, i);

   /* We need to hide the constant sample indices behind the optimization barrier, otherwise
    * LLVM doesn't put loads into the same clause.
    *
    * TODO: nir_group_loads could do this.
    */
   if (!sscreen->use_aco) {
      for (unsigned i = 0; i < num_samples; i++)
         sample_index[i] = nir_optimization_barrier_vgpr_amd(b, 32, sample_index[i]);
   }

   /* Load all samples. */
   nir_def *samples[16];
   for (unsigned i = 0; i < num_samples; i++) {
      samples[i] = nir_image_deref_load(b, 4, 32, deref_ssa(b, img),
                                        coord, sample_index[i], zero);
   }

   result = average_samples(b, samples, num_samples);

   if (sscreen->info.gfx_level < GFX11) {
      /* Exit the conditional branch and get the result out of the branch. */
      nir_store_var(b, var, result, 0xf);
      nir_pop_if(b, NULL);
      result = nir_load_var(b, var);
   }

   return result;
}

static nir_def *apply_blit_output_modifiers(nir_builder *b, nir_def *color,
                                            const union si_compute_blit_shader_key *options)
{
   if (options->sint_to_uint)
      color = nir_imax(b, color, nir_imm_int(b, 0));

   if (options->uint_to_sint)
      color = nir_umin(b, color, nir_imm_int(b, INT32_MAX));

   if (options->dst_is_srgb)
      color = convert_linear_to_srgb(b, color);

   nir_def *zero = nir_imm_int(b, 0);
   nir_def *one = options->use_integer_one ? nir_imm_int(b, 1) : nir_imm_float(b, 1);

   /* Set channels not present in src to 0 or 1. This will eliminate code loading and resolving
    * those channels.
    */
   for (unsigned chan = options->last_src_channel + 1; chan <= options->last_dst_channel; chan++)
      color = nir_vector_insert_imm(b, color, chan == 3 ? one : zero, chan);

   /* Discard channels not present in dst. The hardware fills unstored channels with 0. */
   if (options->last_dst_channel < 3)
      color = nir_trim_vector(b, color, options->last_dst_channel + 1);

   /* Convert to FP16 with rtz to match the pixel shader. Not necessary, but it helps verify
    * the behavior of the whole shader by comparing it to the gfx blit.
    */
   if (options->fp16_rtz)
      color = nir_f2f16_rtz(b, color);

   return color;
}

/* The compute blit shader.
 *
 * Differences compared to u_blitter (the gfx blit):
 * - u_blitter doesn't preserve NaNs, but the compute blit does
 * - u_blitter has lower linear->SRGB precision because the CB block doesn't
 *   use FP32, but the compute blit does.
 *
 * Other than that, non-scaled blits are identical to u_blitter.
 *
 * Implementation details:
 * - Out-of-bounds dst coordinates are not clamped at all. The hw drops
 *   out-of-bounds stores for us.
 * - Out-of-bounds src coordinates are clamped by emulating CLAMP_TO_EDGE using
 *   the image_size NIR intrinsic.
 * - X/Y flipping just does this in the shader: -threadIDs - 1
 * - MSAA copies are implemented but disabled because MSAA image stores don't
 *   work.
 */
void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_shader_key *options)
{
   const nir_shader_compiler_options *nir_options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, nir_options,
                                                  "blit_non_scaled_cs");
   b.shader->info.num_images = 2;
   if (options->src_is_msaa)
      BITSET_SET(b.shader->info.msaa_images, 0);
   if (options->dst_is_msaa)
      BITSET_SET(b.shader->info.msaa_images, 1);
   /* TODO: 1D blits are 8x slower because the workgroup size is 8x8 */
   b.shader->info.workgroup_size[0] = 8;
   b.shader->info.workgroup_size[1] = 8;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.cs.user_data_components_amd = 3;

   const struct glsl_type *img_type[2] = {
      glsl_image_type(options->src_is_1d ? GLSL_SAMPLER_DIM_1D :
                      options->src_is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D,
                      /*is_array*/ true, GLSL_TYPE_FLOAT),
      glsl_image_type(options->dst_is_1d ? GLSL_SAMPLER_DIM_1D :
                      options->dst_is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D,
                      /*is_array*/ true, GLSL_TYPE_FLOAT),
   };

   nir_variable *img_src = nir_variable_create(b.shader, nir_var_uniform, img_type[0], "img0");
   img_src->data.binding = 0;

   nir_variable *img_dst = nir_variable_create(b.shader, nir_var_uniform, img_type[1], "img1");
   img_dst->data.binding = 1;

   nir_def *zero = nir_imm_int(&b, 0);

   /* Instructions. */
   /* Let's work with 0-based src and dst coordinates (thread IDs) first. */
   nir_def *dst_xyz = nir_pad_vector_imm_int(&b, get_global_ids(&b, options->wg_dim), 0, 3);
   nir_def *src_xyz = dst_xyz;

   /* Flip src coordinates. */
   for (unsigned i = 0; i < 2; i++) {
      if (i ? options->flip_y : options->flip_x) {
         /* x goes from 0 to (dim - 1).
          * The flipped blit should load from -dim to -1.
          * Therefore do: x = -x - 1;
          */
         nir_def *comp = nir_channel(&b, src_xyz, i);
         comp = nir_iadd_imm(&b, nir_ineg(&b, comp), -1);
         src_xyz = nir_vector_insert_imm(&b, src_xyz, comp, i);
      }
   }

   /* Add box.xyz. */
   nir_def *coord_src = NULL, *coord_dst = NULL;
   unpack_2x16_signed(&b, nir_trim_vector(&b, nir_load_user_data_amd(&b), 3),
                      &coord_src, &coord_dst);
   coord_dst = nir_iadd(&b, coord_dst, dst_xyz);
   coord_src = nir_iadd(&b, coord_src, src_xyz);

   /* Clamp to edge for src, only X and Y because Z can't be out of bounds. */
   if (options->xy_clamp_to_edge) {
      unsigned src_clamp_channels = options->src_is_1d ? 0x1 : 0x3;
      nir_def *dim = nir_image_deref_size(&b, 4, 32, deref_ssa(&b, img_src), zero);
      dim = nir_channels(&b, dim, src_clamp_channels);

      nir_def *coord_src_clamped = nir_channels(&b, coord_src, src_clamp_channels);
      coord_src_clamped = nir_imax(&b, coord_src_clamped, nir_imm_int(&b, 0));
      coord_src_clamped = nir_imin(&b, coord_src_clamped, nir_iadd_imm(&b, dim, -1));

      for (unsigned i = 0; i < util_bitcount(src_clamp_channels); i++)
         coord_src = nir_vector_insert_imm(&b, coord_src, nir_channel(&b, coord_src_clamped, i), i);
   }

   /* Swizzle coordinates for 1D_ARRAY. */
   static unsigned swizzle_xz[] = {0, 2, 0, 0};

   if (options->src_is_1d)
      coord_src = nir_swizzle(&b, coord_src, swizzle_xz, 4);
   if (options->dst_is_1d)
      coord_dst = nir_swizzle(&b, coord_dst, swizzle_xz, 4);

   /* Coordinates must have 4 channels in NIR. */
   coord_src = nir_pad_vector(&b, coord_src, 4);
   coord_dst = nir_pad_vector(&b, coord_dst, 4);

   /* TODO: out-of-bounds image stores have no effect, but we could jump over them for better perf */

   /* Execute the image loads and stores. */
   unsigned num_samples = 1 << options->log2_samples;
   nir_def *color;

   if (options->src_is_msaa && !options->dst_is_msaa && !options->sample0_only) {
      /* MSAA resolving (downsampling). */
      assert(num_samples > 1);
      color = image_resolve_msaa(sctx->screen, &b, img_src, num_samples, coord_src);
      color = apply_blit_output_modifiers(&b, color, options);
      nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, zero, color, zero);

   } else if (options->src_is_msaa && options->dst_is_msaa) {
      /* MSAA copy. */
      nir_def *color[16];
      assert(num_samples > 1);
      /* Group loads together and then stores. */
      for (unsigned i = 0; i < num_samples; i++) {
         color[i] = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src,
                                         nir_imm_int(&b, i), zero);
      }
      for (unsigned i = 0; i < num_samples; i++)
         color[i] = apply_blit_output_modifiers(&b, color[i], options);
      for (unsigned i = 0; i < num_samples; i++) {
         nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst,
                               nir_imm_int(&b, i), color[i], zero);
      }
   } else if (!options->src_is_msaa && options->dst_is_msaa) {
      /* MSAA upsampling. */
      assert(num_samples > 1);
      color = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src, zero, zero);
      color = apply_blit_output_modifiers(&b, color, options);
      for (unsigned i = 0; i < num_samples; i++) {
         nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst,
                               nir_imm_int(&b, i), color, zero);
      }
   } else {
      /* Non-MSAA copy or read sample 0 only. */
      /* src2 = sample_index (zero), src3 = lod (zero) */
      assert(num_samples == 1);
      color = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src, zero, zero);
      color = apply_blit_output_modifiers(&b, color, options);
      nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, zero, color, zero);
   }

   return create_shader_state(sctx, b.shader);
}

void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type)
{
   nir_def *address;
   enum glsl_sampler_dim sampler_type;

   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "clear_render_target");
   b.shader->info.num_ubos = 1;
   b.shader->info.num_images = 1;
   b.shader->num_uniforms = 2;

   switch (type) {
      case PIPE_TEXTURE_1D_ARRAY:
         b.shader->info.workgroup_size[0] = 64;
         b.shader->info.workgroup_size[1] = 1;
         b.shader->info.workgroup_size[2] = 1;
         sampler_type = GLSL_SAMPLER_DIM_1D;
         address = get_global_ids(&b, 2);
         break;
      case PIPE_TEXTURE_2D_ARRAY:
         b.shader->info.workgroup_size[0] = 8;
         b.shader->info.workgroup_size[1] = 8;
         b.shader->info.workgroup_size[2] = 1;
         sampler_type = GLSL_SAMPLER_DIM_2D;
         address = get_global_ids(&b, 3);
         break;
      default:
         unreachable("unsupported texture target type");
   }

   const struct glsl_type *img_type = glsl_image_type(sampler_type, true, GLSL_TYPE_FLOAT);
   nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "image");
   output_img->data.image.format = PIPE_FORMAT_R32G32B32A32_FLOAT;

   nir_def *zero = nir_imm_int(&b, 0);
   nir_def *ubo = nir_load_ubo(&b, 4, 32, zero, zero, .range_base = 0, .range = 16);

   /* TODO: No GL CTS tests for 1D arrays, relying on OpenCL CTS for now.
    * As a sanity check, "OpenCL-CTS/test_conformance/images/clFillImage" tests should pass.
    */
   if (type == PIPE_TEXTURE_1D_ARRAY) {
      unsigned swizzle[4] = {0, 2, 0, 0};
      ubo = nir_swizzle(&b, ubo, swizzle, 4);
   }

   address = nir_iadd(&b, address, ubo);
   nir_def *coord = nir_pad_vector(&b, address, 4);

   nir_def *data = nir_load_ubo(&b, 4, 32, zero, nir_imm_int(&b, 16), .range_base = 16, .range = 16);

   nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->def, coord, zero, data, zero,
                         .image_dim = sampler_type, .image_array = true);

   return create_shader_state(sctx, b.shader);
}

void *si_clear_12bytes_buffer_shader(struct si_context *sctx)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "clear_12bytes_buffer");
   b.shader->info.workgroup_size[0] = 64;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.cs.user_data_components_amd = 3;

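   /* Each thread stores one 12-byte (3-dword) slot with the value from user SGPRs. */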
   nir_def *offset = nir_imul_imm(&b, get_global_ids(&b, 1), 12);
   nir_def *value = nir_trim_vector(&b, nir_load_user_data_amd(&b), 3);

   nir_store_ssbo(&b, value, nir_imm_int(&b, 0), offset,
      .access = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ACCESS_NON_TEMPORAL : 0);

   return create_shader_state(sctx, b.shader);
}

void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;

   /* Don't cache loads, because there is no reuse. */
   unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;

   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "ubyte_to_ushort");

   unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);

   b.shader->info.workgroup_size[0] = default_wave_size;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.num_ssbos = 2;

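   /* Thread i loads byte i from SSBO binding 1 and stores it zero-extended to
    * 16 bits at byte offset i * 2 in SSBO binding 0.
    */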
   nir_def *load_address = get_global_ids(&b, 1);
   nir_def *store_address = nir_imul_imm(&b, load_address, 2);

   nir_def *ubyte_value = nir_load_ssbo(&b, 1, 8, nir_imm_int(&b, 1),
                                        load_address, .access = load_qualifier);
   nir_store_ssbo(&b, nir_u2uN(&b, ubyte_value, 16), nir_imm_int(&b, 0),
                  store_address, .access = store_qualifier);

   return create_shader_state(sctx, b.shader);
}

/* Create a compute shader implementing clear_buffer or copy_buffer. */
void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
                                   bool dst_stream_cache_policy, bool is_copy)
{
   assert(util_is_power_of_two_nonzero(num_dwords_per_thread));

   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;
   if (dst_stream_cache_policy)
      store_qualifier |= ACCESS_NON_TEMPORAL;

   /* Don't cache loads, because there is no reuse. */
   unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;

   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "create_dma_compute");

   unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);

   b.shader->info.workgroup_size[0] = default_wave_size;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.num_ssbos = 1;

   unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
   unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));

   for (unsigned i = 0; i < num_mem_ops; i++) {
      if (i * 4 < num_dwords_per_thread)
         inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4);
   }

   /* If there are multiple stores,
    * the first store writes into 0 * wavesize + tid,
    * the 2nd store writes into 1 * wavesize + tid,
    * the 3rd store writes into 2 * wavesize + tid, etc.
    */
   nir_def *store_address = get_global_ids(&b, 1);

   /* Convert from a "store size unit" into bytes. */
   store_address = nir_imul_imm(&b, store_address, 4 * inst_dwords[0]);

   nir_def *load_address = store_address, *value, *values[num_mem_ops];
   value = nir_undef(&b, 1, 32);

   if (is_copy) {
      b.shader->info.num_ssbos++;
   } else {
      b.shader->info.cs.user_data_components_amd = inst_dwords[0];
      value = nir_trim_vector(&b, nir_load_user_data_amd(&b), inst_dwords[0]);
   }

   /* Distance between a load and a store for latency hiding. */
   unsigned load_store_distance = is_copy ? 8 : 0;
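   /* For copies (distance 8), iterations 0..7 of the loop below issue only
    * loads; from iteration 8 on, each iteration issues load i and store i - 8,
    * so several loads are in flight before the first store waits on a result.
    */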

   for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
      int d = i - load_store_distance;

      if (is_copy && i < num_mem_ops) {
         if (i) {
            load_address = nir_iadd(&b, load_address,
                                    nir_imm_int(&b, 4 * inst_dwords[i] * default_wave_size));
         }
         values[i] = nir_load_ssbo(&b, 4, 32, nir_imm_int(&b, 1), load_address,
                                   .access = load_qualifier);
      }

      if (d >= 0) {
         if (d) {
            store_address = nir_iadd(&b, store_address,
                                     nir_imm_int(&b, 4 * inst_dwords[d] * default_wave_size));
         }
         nir_store_ssbo(&b, is_copy ? values[d] : value, nir_imm_int(&b, 0), store_address,
                        .access = store_qualifier);
      }
   }

   return create_shader_state(sctx, b.shader);
}

/* Load samples from the image, and copy them to the same image. This looks like
 * a no-op, but it's not. Loads use FMASK, while stores don't, so samples are
 * reordered to match expanded FMASK.
 *
 * After the shader finishes, FMASK should be cleared to identity.
 */
void *si_create_fmask_expand_cs(struct si_context *sctx, unsigned num_samples, bool is_array)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "create_fmask_expand_cs");
   b.shader->info.workgroup_size[0] = 8;
   b.shader->info.workgroup_size[1] = 8;
   b.shader->info.workgroup_size[2] = 1;

   /* Return an empty compute shader */
   if (num_samples == 0)
      return create_shader_state(sctx, b.shader);

   b.shader->info.num_images = 1;

   const struct glsl_type *img_type = glsl_image_type(GLSL_SAMPLER_DIM_MS, is_array, GLSL_TYPE_FLOAT);
   nir_variable *img = nir_variable_create(b.shader, nir_var_image, img_type, "image");
   img->data.access = ACCESS_RESTRICT;

   nir_def *z = nir_undef(&b, 1, 32);
   if (is_array) {
      z = nir_channel(&b, nir_load_workgroup_id(&b), 2);
   }

   nir_def *zero = nir_imm_int(&b, 0);
   nir_def *address = get_global_ids(&b, 2);

   nir_def *sample[8], *addresses[8];
   assert(num_samples <= ARRAY_SIZE(sample));

   nir_def *img_def = &nir_build_deref_var(&b, img)->def;

   /* Load samples, resolving FMASK. */
   for (unsigned i = 0; i < num_samples; i++) {
      nir_def *it = nir_imm_int(&b, i);
      sample[i] = nir_vec4(&b, nir_channel(&b, address, 0), nir_channel(&b, address, 1), z, it);
      addresses[i] = nir_image_deref_load(&b, 4, 32, img_def, sample[i], it, zero,
                                          .access = ACCESS_RESTRICT,
                                          .image_dim = GLSL_SAMPLER_DIM_2D,
                                          .image_array = is_array);
   }

   /* Store samples, ignoring FMASK. */
   for (unsigned i = 0; i < num_samples; i++) {
      nir_image_deref_store(&b, img_def, sample[i], nir_imm_int(&b, i), addresses[i], zero,
                            .access = ACCESS_RESTRICT,
                            .image_dim = GLSL_SAMPLER_DIM_2D,
                            .image_array = is_array);
   }

   return create_shader_state(sctx, b.shader);
}

/* This is just a pass-through shader with 1-3 MOV instructions. */
void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, unsigned num_layers)
{
   unsigned vs_blit_property;
   void **vs;

   switch (type) {
   case UTIL_BLITTER_ATTRIB_NONE:
      vs = num_layers > 1 ? &sctx->vs_blit_pos_layered : &sctx->vs_blit_pos;
      vs_blit_property = SI_VS_BLIT_SGPRS_POS;
      break;
   case UTIL_BLITTER_ATTRIB_COLOR:
      vs = num_layers > 1 ? &sctx->vs_blit_color_layered : &sctx->vs_blit_color;
      vs_blit_property = SI_VS_BLIT_SGPRS_POS_COLOR;
      break;
   case UTIL_BLITTER_ATTRIB_TEXCOORD_XY:
   case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW:
      assert(num_layers == 1);
      vs = &sctx->vs_blit_texcoord;
      vs_blit_property = SI_VS_BLIT_SGPRS_POS_TEXCOORD;
      break;
   default:
      assert(0);
      return NULL;
   }

   if (*vs)
      return *vs;

   /* Add 1 for the attribute ring address. */
   if (sctx->gfx_level >= GFX11 && type != UTIL_BLITTER_ATTRIB_NONE)
      vs_blit_property++;

   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_VERTEX);

   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, options, "get_blitter_vs");

   /* Tell the shader to load VS inputs from SGPRs: */
   b.shader->info.vs.blit_sgprs_amd = vs_blit_property;
   b.shader->info.vs.window_space_position = true;

   const struct glsl_type *vec4 = glsl_vec4_type();

   nir_copy_var(&b,
                nir_create_variable_with_location(b.shader, nir_var_shader_out,
                                                  VARYING_SLOT_POS, vec4),
                nir_create_variable_with_location(b.shader, nir_var_shader_in,
                                                  VERT_ATTRIB_GENERIC0, vec4));

   if (type != UTIL_BLITTER_ATTRIB_NONE) {
      nir_copy_var(&b,
                   nir_create_variable_with_location(b.shader, nir_var_shader_out,
                                                     VARYING_SLOT_VAR0, vec4),
                   nir_create_variable_with_location(b.shader, nir_var_shader_in,
                                                     VERT_ATTRIB_GENERIC1, vec4));
   }

   if (num_layers > 1) {
      nir_variable *out_layer =
         nir_create_variable_with_location(b.shader, nir_var_shader_out,
                                           VARYING_SLOT_LAYER, glsl_int_type());
      out_layer->data.interpolation = INTERP_MODE_NONE;

      nir_copy_var(&b, out_layer,
                   nir_create_variable_with_location(b.shader, nir_var_system_value,
                                                     SYSTEM_VALUE_INSTANCE_ID, glsl_int_type()));
   }

   *vs = create_shader_state(sctx, b.shader);
   return *vs;
}

/* Create the compute shader that is used to collect the results.
 *
 * One compute grid with a single thread is launched for every query result
 * buffer. The thread (optionally) reads a previous summary buffer, then
 * accumulates data from the query result buffer, and writes the result either
 * to a summary buffer to be consumed by the next grid invocation or to the
 * user-supplied buffer.
 *
 * Data layout:
 *
 * CONST
 *  0.x = end_offset
 *  0.y = result_stride
 *  0.z = result_count
 *  0.w = bit field:
 *          1: read previously accumulated values
 *          2: write accumulated values for chaining
 *          4: write result available
 *          8: convert result to boolean (0/1)
 *         16: only read one dword and use that as result
 *         32: apply timestamp conversion
 *         64: store full 64 bits result
 *        128: store signed 32 bits result
 *        256: SO_OVERFLOW mode: take the difference of two successive half-pairs
 *  1.x = fence_offset
 *  1.y = pair_stride
 *  1.z = pair_count
 */
void *si_create_query_result_cs(struct si_context *sctx)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "create_query_result_cs");
   b.shader->info.workgroup_size[0] = 1;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.num_ubos = 1;
   b.shader->info.num_ssbos = 3;
   b.shader->num_uniforms = 2;

   nir_def *var_undef = nir_undef(&b, 1, 32);
   nir_def *zero = nir_imm_int(&b, 0);
   nir_def *one = nir_imm_int(&b, 1);
   nir_def *two = nir_imm_int(&b, 2);
   nir_def *four = nir_imm_int(&b, 4);
   nir_def *eight = nir_imm_int(&b, 8);
   nir_def *sixteen = nir_imm_int(&b, 16);
   nir_def *thirty_one = nir_imm_int(&b, 31);
   nir_def *sixty_four = nir_imm_int(&b, 64);

   /* uint32_t x, y, z = 0; */
   nir_function_impl *e = nir_shader_get_entrypoint(b.shader);
   nir_variable *x = nir_local_variable_create(e, glsl_uint_type(), "x");
   nir_store_var(&b, x, var_undef, 0x1);
   nir_variable *y = nir_local_variable_create(e, glsl_uint_type(), "y");
   nir_store_var(&b, y, var_undef, 0x1);
   nir_variable *z = nir_local_variable_create(e, glsl_uint_type(), "z");
   nir_store_var(&b, z, zero, 0x1);

   /* uint32_t buff_0[4] = load_ubo(0, 0); */
   nir_def *buff_0 = nir_load_ubo(&b, 4, 32, zero, zero, .range_base = 0, .range = 16);
   /* uint32_t buff_1[4] = load_ubo(1, 16); */
   nir_def *buff_1 = nir_load_ubo(&b, 4, 32, zero, sixteen, .range_base = 16, .range = 16);

   /* uint32_t b0_bitfield = buff_0.w; */
   nir_def *b0_bitfield = nir_channel(&b, buff_0, 3);

   /* Check result availability.
    *    if (b0_bitfield & (1u << 4)) {
    *       ...
    */
   nir_def *is_one_dword_result = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixteen));
   nir_if *if_one_dword_result = nir_push_if(&b, is_one_dword_result); {

      /*   int32_t value = load_ssbo(0, fence_offset);
       *   z = ~(value >> 31);
       */
      nir_def *value = nir_load_ssbo(&b, 1, 32, zero, nir_channel(&b, buff_1, 0));
      nir_def *bitmask = nir_inot(&b, nir_ishr(&b, value, thirty_one));
      nir_store_var(&b, z, bitmask, 0x1);
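      /* The fence dword has its top bit set once the GPU has written the result,
       * so z is 0 when the result is available and ~0u when it is not.
       */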

      /* Load result if available.
       *    if (value < 0) {
       *       uint32_t result[2] = load_ssbo(0, 0);
       *       x = result[0];
       *       y = result[1];
       *    }
       */
      nir_if *if_negative = nir_push_if(&b, nir_ilt(&b, value, zero)); {
         nir_def *result = nir_load_ssbo(&b, 2, 32, zero, zero);
         nir_store_var(&b, x, nir_channel(&b, result, 0), 0x1);
         nir_store_var(&b, y, nir_channel(&b, result, 1), 0x1);
      }
      nir_pop_if(&b, if_negative);
   } nir_push_else(&b, if_one_dword_result); {

      /* } else {
       *    x = 0; y = 0;
       */
      nir_store_var(&b, x, zero, 0x1);
      nir_store_var(&b, y, zero, 0x1);

      /* Load previously accumulated result if requested.
       *    if (b0_bitfield & (1u << 0)) {
       *       uint32_t result[3] = load_ssbo(1, 0);
       *       x = result[0];
       *       y = result[1];
       *       z = result[2];
       *    }
       */
      nir_def *is_prev_acc_result = nir_i2b(&b, nir_iand(&b, b0_bitfield, one));
      nir_if *if_prev_acc_result = nir_push_if(&b, is_prev_acc_result); {
         nir_def *result = nir_load_ssbo(&b, 3, 32, one, zero);
         nir_store_var(&b, x, nir_channel(&b, result, 0), 0x1);
         nir_store_var(&b, y, nir_channel(&b, result, 1), 0x1);
         nir_store_var(&b, z, nir_channel(&b, result, 2), 0x1);
      }
      nir_pop_if(&b, if_prev_acc_result);

      /* if (!z) {
       *    uint32_t result_index = 0;
       *    uint32_t pitch = 0;
       *    ...
       */
      nir_def *z_value = nir_load_var(&b, z);
      nir_if *if_not_z = nir_push_if(&b, nir_ieq(&b, z_value, zero)); {
         nir_variable *outer_loop_iter =
            nir_local_variable_create(e, glsl_uint_type(), "outer_loop_iter");
         nir_store_var(&b, outer_loop_iter, zero, 0x1);
         nir_variable *pitch = nir_local_variable_create(e, glsl_uint_type(), "pitch");
         nir_store_var(&b, pitch, zero, 0x1);

         /* Outer loop.
          *   while (result_index < result_count) {
          *      ...
          */
         nir_loop *loop_outer = nir_push_loop(&b); {
            nir_def *result_index = nir_load_var(&b, outer_loop_iter);
            nir_def *is_result_index_out_of_bound =
               nir_uge(&b, result_index, nir_channel(&b, buff_0, 2));
            nir_if *if_out_of_bound = nir_push_if(&b, is_result_index_out_of_bound); {
               nir_jump(&b, nir_jump_break);
            }
            nir_pop_if(&b, if_out_of_bound);

            /* Load fence and check result availability.
             *    pitch = result_index * result_stride;
             *    uint32_t address = fence_offset + pitch;
             *    int32_t value = load_ssbo(0, address);
             *    z = ~(value >> 31);
             */
            nir_def *pitch_outer_loop = nir_imul(&b, result_index, nir_channel(&b, buff_0, 1));
            nir_store_var(&b, pitch, pitch_outer_loop, 0x1);
            nir_def *address = nir_iadd(&b, pitch_outer_loop, nir_channel(&b, buff_1, 0));
            nir_def *value = nir_load_ssbo(&b, 1, 32, zero, address);
            nir_def *bitmask = nir_inot(&b, nir_ishr(&b, value, thirty_one));
            nir_store_var(&b, z, bitmask, 0x1);

            /*    if (z) {
             *       break;
             *    }
             */
            nir_if *if_result_available = nir_push_if(&b, nir_i2b(&b, bitmask)); {
               nir_jump(&b, nir_jump_break);
            }
            nir_pop_if(&b, if_result_available);

            /* Inner loop iterator.
             *    uint32_t i = 0;
             */
            nir_variable *inner_loop_iter =
               nir_local_variable_create(e, glsl_uint_type(), "inner_loop_iter");
            nir_store_var(&b, inner_loop_iter, zero, 0x1);

            /* Inner loop.
             *    do {
             *       ...
             */
            nir_loop *loop_inner = nir_push_loop(&b); {
               nir_def *pitch_inner_loop = nir_load_var(&b, pitch);
               nir_def *i = nir_load_var(&b, inner_loop_iter);

               /* Load start and end.
                *    uint64_t first = load_ssbo(0, pitch);
                *    uint64_t second = load_ssbo(0, pitch + end_offset);
                *    uint64_t start_half_pair = second - first;
                */
               nir_def *first = nir_load_ssbo(&b, 1, 64, zero, pitch_inner_loop);
               nir_def *new_pitch = nir_iadd(&b, pitch_inner_loop, nir_channel(&b, buff_0, 0));
               nir_def *second = nir_load_ssbo(&b, 1, 64, zero, new_pitch);
               nir_def *start_half_pair = nir_isub(&b, second, first);

               /* Load the second start/end half-pair and take the difference.
                *    if (b0_bitfield & (1u << 8)) {
                *       uint64_t first = load_ssbo(0, pitch + 8);
                *       uint64_t second = load_ssbo(0, pitch + end_offset + 8);
                *       uint64_t end_half_pair = second - first;
                *       uint64_t difference = start_half_pair - end_half_pair;
                *    }
                */
               nir_def *difference;
               nir_def *is_so_overflow_mode = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 256));
               nir_if *if_so_overflow_mode = nir_push_if(&b, is_so_overflow_mode); {
                  first = nir_load_ssbo(&b, 1, 64, zero, nir_iadd(&b, pitch_inner_loop, eight));
                  second = nir_load_ssbo(&b, 1, 64, zero, nir_iadd(&b, new_pitch, eight));
                  nir_def *end_half_pair = nir_isub(&b, second, first);
                  difference = nir_isub(&b, start_half_pair, end_half_pair);
               }
               nir_pop_if(&b, if_so_overflow_mode);

               /* uint64_t sum = (x | (uint64_t) y << 32) + difference; */
               nir_def *sum = nir_iadd(&b,
                                       nir_pack_64_2x32_split(&b,
                                                              nir_load_var(&b, x),
                                                              nir_load_var(&b, y)),
                                       nir_if_phi(&b, difference, start_half_pair));
               sum = nir_unpack_64_2x32(&b, sum);

               /* Increment inner loop iterator.
                *    i++;
                */
               i = nir_iadd(&b, i, one);
               nir_store_var(&b, inner_loop_iter, i, 0x1);

               /* Update pitch value.
                *    pitch = i * pair_stride + outer_pitch;
                */
               nir_def *incremented_pitch = nir_iadd(&b,
                                             nir_imul(&b, i, nir_channel(&b, buff_1, 1)),
                                             pitch_outer_loop);
               nir_store_var(&b, pitch, incremented_pitch, 0x1);

               /* Update x and y.
                *    x = sum.x;
                *    y = sum.y;
                */
               nir_store_var(&b, x, nir_channel(&b, sum, 0), 0x1);
               nir_store_var(&b, y, nir_channel(&b, sum, 1), 0x1);

               /* } while (i < pair_count); */
               nir_def *is_pair_count_exceeded = nir_uge(&b, i, nir_channel(&b, buff_1, 2));
               nir_if *if_pair_count_exceeded = nir_push_if(&b, is_pair_count_exceeded); {
                  nir_jump(&b, nir_jump_break);
               }
               nir_pop_if(&b, if_pair_count_exceeded);
            }
            nir_pop_loop(&b, loop_inner);

            /* Increment the result index.
             *    result_index++;
             */
            nir_store_var(&b, outer_loop_iter, nir_iadd(&b, result_index, one), 0x1);
         }
         nir_pop_loop(&b, loop_outer);
      }
      nir_pop_if(&b, if_not_z);
   }
   nir_pop_if(&b, if_one_dword_result);

   nir_def *x_value = nir_load_var(&b, x);
   nir_def *y_value = nir_load_var(&b, y);
   nir_def *z_value = nir_load_var(&b, z);

   /* Store accumulated data for chaining.
    *    if (b0_bitfield & (1u << 1)) {
    *       store_ssbo(<x, y, z>, 2, 0);
    */
   nir_def *is_acc_chaining = nir_i2b(&b, nir_iand(&b, b0_bitfield, two));
   nir_if *if_acc_chaining = nir_push_if(&b, is_acc_chaining); {
      nir_store_ssbo(&b, nir_vec3(&b, x_value, y_value, z_value), two, zero);
   } nir_push_else(&b, if_acc_chaining); {

      /* Store result availability.
       *    } else {
       *       if (b0_bitfield & (1u << 2)) {
       *          store_ssbo((~z & 1), 2, 0);
       *          ...
       */
      nir_def *is_result_available = nir_i2b(&b, nir_iand(&b, b0_bitfield, four));
      nir_if *if_result_available = nir_push_if(&b, is_result_available); {
         nir_store_ssbo(&b, nir_iand(&b, nir_inot(&b, z_value), one), two, zero);

         /* For a full 64 bits result, also zero the high dword of the availability value.
          *    if (b0_bitfield & (1u << 6)) {
          *       store_ssbo(<0, 0>, 2, 0, writemask=y);
          *    }
          */
         nir_def *is_result_64_bits = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixty_four));
         nir_if *if_result_64_bits = nir_push_if(&b, is_result_64_bits); {
            nir_store_ssbo(&b, nir_imm_ivec2(&b, 0, 0), two, zero,
                           .write_mask = (1u << 1));
         }
         nir_pop_if(&b, if_result_64_bits);
      } nir_push_else(&b, if_result_available); {

         /* } else {
          *    if (~z) {
          *       ...
          */
         nir_def *is_bitwise_not_z = nir_i2b(&b, nir_inot(&b, z_value));
         nir_if *if_bitwise_not_z = nir_push_if(&b, is_bitwise_not_z); {
            nir_def *ts_x, *ts_y;

            /* Apply timestamp conversion.
             *    if (b0_bitfield & (1u << 5)) {
             *       uint64_t xy_million = (x | (uint64_t) y << 32) * (uint64_t) 1000000;
             *       uint64_t ts_converted = xy_million / (uint64_t) clock_crystal_frequency;
             *       x = ts_converted.x;
             *       y = ts_converted.y;
             *    }
             */
            nir_def *is_apply_timestamp = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 32));
            nir_if *if_apply_timestamp = nir_push_if(&b, is_apply_timestamp); {
               /* Add the frequency into the shader for timestamp conversion
                * so that the backend can use the full range of optimizations
                * for divide-by-constant.
                */
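               /* This computes ns = ticks * 1000000 / clock_crystal_freq
                * (assuming the crystal frequency is reported in kHz, so the
                * quotient is in nanoseconds).
                */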
               nir_def *clock_crystal_frequency =
                  nir_imm_int64(&b, sctx->screen->info.clock_crystal_freq);

               nir_def *xy_million = nir_imul(&b,
                                              nir_pack_64_2x32_split(&b, x_value, y_value),
                                              nir_imm_int64(&b, 1000000));
               nir_def *ts_converted = nir_udiv(&b, xy_million, clock_crystal_frequency);
               ts_converted = nir_unpack_64_2x32(&b, ts_converted);
               ts_x = nir_channel(&b, ts_converted, 0);
               ts_y = nir_channel(&b, ts_converted, 1);
            }
            nir_pop_if(&b, if_apply_timestamp);

            nir_def *nx = nir_if_phi(&b, ts_x, x_value);
            nir_def *ny = nir_if_phi(&b, ts_y, y_value);

            /* x = b0_bitfield & (1u << 3) ? ((x | (uint64_t) y << 32) != 0) : x;
             * y = b0_bitfield & (1u << 3) ? 0 : y;
             */
            nir_def *is_convert_to_bool = nir_i2b(&b, nir_iand(&b, b0_bitfield, eight));
            nir_def *xy = nir_pack_64_2x32_split(&b, nx, ny);
            nir_def *is_xy = nir_b2i32(&b, nir_ine(&b, xy, nir_imm_int64(&b, 0)));
            nx = nir_bcsel(&b, is_convert_to_bool, is_xy, nx);
            ny = nir_bcsel(&b, is_convert_to_bool, zero, ny);

            /* if (b0_bitfield & (1u << 6)) {
             *    store_ssbo(<x, y>, 2, 0);
             * }
             */
            nir_def *is_result_64_bits = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixty_four));
            nir_if *if_result_64_bits = nir_push_if(&b, is_result_64_bits); {
               nir_store_ssbo(&b, nir_vec2(&b, nx, ny), two, zero);
            } nir_push_else(&b, if_result_64_bits); {

               /* Clamping.
                *    } else {
                *       x = y ? UINT32_MAX : x;
                *       x = b0_bitfield & (1u << 7) ? min(x, INT_MAX) : x;
                *       store_ssbo(x, 2, 0);
                *    }
                */
               nir_def *is_y = nir_ine(&b, ny, zero);
               nx = nir_bcsel(&b, is_y, nir_imm_int(&b, UINT32_MAX), nx);
               nir_def *is_signed_32bit_result = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 128));
               nir_def *min = nir_umin(&b, nx, nir_imm_int(&b, INT_MAX));
               nx = nir_bcsel(&b, is_signed_32bit_result, min, nx);
               nir_store_ssbo(&b, nx, two, zero);
            }
            nir_pop_if(&b, if_result_64_bits);
         }
         nir_pop_if(&b, if_bitwise_not_z);
      }
      nir_pop_if(&b, if_result_available);
   }
   nir_pop_if(&b, if_acc_chaining);

   return create_shader_state(sctx, b.shader);
}

/* Create the compute shader that is used to collect the results of gfx10+
 * shader queries.
 *
 * One compute grid with a single thread is launched for every query result
 * buffer. The thread (optionally) reads a previous summary buffer, then
 * accumulates data from the query result buffer, and writes the result either
 * to a summary buffer to be consumed by the next grid invocation or to the
 * user-supplied buffer.
 *
 * Data layout:
 *
 * CONST
 *  0.x = config;
 *          [0:2] the low 3 bits indicate the mode:
 *             0: sum up counts
 *             1: determine result availability and write it as a boolean
 *             2: SO_OVERFLOW
 *             3: SO_ANY_OVERFLOW
 *        the remaining bits form a bitfield:
 *          8: write result as a 64-bit value
 *  0.y = offset in bytes to counts or stream for SO_OVERFLOW mode
 *  0.z = chain bit field:
 *          1: have previous summary buffer
 *          2: write next summary buffer
 *  0.w = result_count
 */
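/* A rough C sketch of the shader built below (names are illustrative):
 *
 *    acc_result = acc_missing = 0;
 *    if (chain & 1)
 *       (acc_result, acc_missing) = load summary from ssbo 1;
 *    while (result_remaining--) {
 *       if (!fence) { acc_missing = ~0u; break; }
 *       sum up counts or test streamout overflow;
 *       advance to the next query result buffer;
 *    }
 *    if (chain & 2)
 *       store <acc_result, acc_missing> to ssbo 2;
 *    else
 *       store the final result (boolean and/or 64-bit) to ssbo 2;
 */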
void *gfx11_create_sh_query_result_cs(struct si_context *sctx)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "gfx11_create_sh_query_result_cs");
   b.shader->info.workgroup_size[0] = 1;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.num_ubos = 1;
   b.shader->info.num_ssbos = 3;
   b.shader->num_uniforms = 1;

   nir_def *zero = nir_imm_int(&b, 0);
   nir_def *one = nir_imm_int(&b, 1);
   nir_def *two = nir_imm_int(&b, 2);
   nir_def *four = nir_imm_int(&b, 4);
   nir_def *minus_one = nir_imm_int(&b, 0xffffffff);

   /* uint32_t acc_result = 0, acc_missing = 0; */
   nir_function_impl *e = nir_shader_get_entrypoint(b.shader);
   nir_variable *acc_result = nir_local_variable_create(e, glsl_uint_type(), "acc_result");
   nir_store_var(&b, acc_result, zero, 0x1);
   nir_variable *acc_missing = nir_local_variable_create(e, glsl_uint_type(), "acc_missing");
   nir_store_var(&b, acc_missing, zero, 0x1);

   /* uint32_t buff_0[4] = load_ubo(0, 0); */
   nir_def *buff_0 = nir_load_ubo(&b, 4, 32, zero, zero, .range_base = 0, .range = 16);
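
   /* The channels of buff_0 map to the CONST layout above:
    * x = config, y = offset, z = chain bit field, w = result_count.
    */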

   /* if (chain & 1) {
    *    uint32_t result[2] = load_ssbo(1, 0);
    *    acc_result = result[0];
    *    acc_missing = result[1];
    * }
    */
   nir_def *is_prev_summary_buffer = nir_i2b(&b, nir_iand(&b, nir_channel(&b, buff_0, 2), one));
   nir_if *if_prev_summary_buffer = nir_push_if(&b, is_prev_summary_buffer); {
      nir_def *result = nir_load_ssbo(&b, 2, 32, one, zero);
      nir_store_var(&b, acc_result, nir_channel(&b, result, 0), 0x1);
      nir_store_var(&b, acc_missing, nir_channel(&b, result, 1), 0x1);
   }
   nir_pop_if(&b, if_prev_summary_buffer);

   /* uint32_t mode = config & 0b111;
    * bool is_overflow = mode >= 2;
    */
   nir_def *mode = nir_iand_imm(&b, nir_channel(&b, buff_0, 0), 0b111);
   nir_def *is_overflow = nir_uge(&b, mode, two);

   /* uint32_t result_remaining = (is_overflow && acc_result) ? 0 : result_count; */
   nir_variable *result_remaining = nir_local_variable_create(e, glsl_uint_type(), "result_remaining");
   nir_variable *base_offset = nir_local_variable_create(e, glsl_uint_type(), "base_offset");
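   /* is_overflow is materialized as a 0/~0 mask (0 - (int)is_overflow)
    * and ANDed with acc_result: a branchless form of the && above.
    */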
   nir_def *state = nir_iand(&b,
                             nir_isub(&b, zero, nir_b2i32(&b, is_overflow)),
                             nir_load_var(&b, acc_result));
   nir_def *value = nir_bcsel(&b, nir_i2b(&b, state), zero, nir_channel(&b, buff_0, 3));
   nir_store_var(&b, result_remaining, value, 0x1);

   /* uint32_t base_offset = 0; */
   nir_store_var(&b, base_offset, zero, 0x1);

   /* Outer loop begin.
    *   while (result_remaining) {
    *      ...
    */
   nir_loop *loop_outer = nir_push_loop(&b); {
      nir_def *condition = nir_load_var(&b, result_remaining);
      nir_if *if_not_condition = nir_push_if(&b, nir_ieq(&b, condition, zero)); {
         nir_jump(&b, nir_jump_break);
      }
      nir_pop_if(&b, if_not_condition);

      /* result_remaining--; */
      condition = nir_iadd(&b, condition, minus_one);
      nir_store_var(&b, result_remaining, condition, 0x1);

      /* uint32_t fence = load_ssbo(0, base_offset + sizeof(gfx11_sh_query_buffer_mem.stream)); */
      nir_def *b_offset = nir_load_var(&b, base_offset);
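      /* C's sizeof cannot name a struct member through its type alone,
       * hence the null-pointer access idiom to get the size of the
       * stream array.
       */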
      uint64_t buffer_mem_stream_size = sizeof(((struct gfx11_sh_query_buffer_mem*)0)->stream);
      nir_def *fence = nir_load_ssbo(&b, 1, 32, zero,
                                     nir_iadd_imm(&b, b_offset, buffer_mem_stream_size));

      /* if (!fence) {
       *    acc_missing = ~0u;
       *    break;
       * }
       */
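      /* The store is branchless: 0 - (fence == 0) is ~0u exactly when the
       * fence has not been written yet, and 0 otherwise.
       */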
      nir_def *is_zero = nir_ieq(&b, fence, zero);
      nir_def *y_value = nir_isub(&b, zero, nir_b2i32(&b, is_zero));
      nir_store_var(&b, acc_missing, y_value, 0x1);
      nir_if *if_ssbo_zero = nir_push_if(&b, is_zero); {
         nir_jump(&b, nir_jump_break);
      }
      nir_pop_if(&b, if_ssbo_zero);

      /* stream_offset = base_offset + offset; */
      nir_def *s_offset = nir_iadd(&b, b_offset, nir_channel(&b, buff_0, 1));

      /* if (!(config & 7)) {
       *    acc_result += load_ssbo(0, stream_offset);
       * }
       */
      nir_if *if_sum_up_counts = nir_push_if(&b, nir_ieq(&b, mode, zero)); {
         nir_def *x_value = nir_load_ssbo(&b, 1, 32, zero, s_offset);
         x_value = nir_iadd(&b, nir_load_var(&b, acc_result), x_value);
         nir_store_var(&b, acc_result, x_value, 0x1);
      }
      nir_pop_if(&b, if_sum_up_counts);

      /* if (is_overflow) {
       *    uint32_t count = (config & 1) ? 4 : 1;
       *    ...
       */
      nir_if *if_overflow = nir_push_if(&b, is_overflow); {
         nir_def *is_any_overflow = nir_i2b(&b, nir_iand(&b, mode, one));
         nir_def *initial_count = nir_bcsel(&b, is_any_overflow, four, one);
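         /* SO_ANY_OVERFLOW (mode 3) examines all four streamout streams,
          * so the inner loop below runs four times; SO_OVERFLOW checks
          * only the single stream selected by the offset in 0.y.
          */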

         nir_variable *count =
            nir_local_variable_create(e, glsl_uint_type(), "count");
         nir_store_var(&b, count, initial_count, 0x1);

         nir_variable *stream_offset =
            nir_local_variable_create(e, glsl_uint_type(), "stream_offset");
         nir_store_var(&b, stream_offset, s_offset, 0x1);

         /* Inner loop begin.
          *    do {
          *       ...
          */
         nir_loop *loop_inner = nir_push_loop(&b); {
            /* uint32_t buffer[4] = load_ssbo(0, stream_offset + 2 * sizeof(uint64_t)); */
            nir_def *stream_offset_value = nir_load_var(&b, stream_offset);
            nir_def *buffer =
               nir_load_ssbo(&b, 4, 32, zero,
                             nir_iadd_imm(&b, stream_offset_value, 2 * sizeof(uint64_t)));

            /* if (generated != emitted) {
             *    acc_result = 1;
             *    base_offset = 0;
             *    break;
             * }
             */
            nir_def *generated = nir_channel(&b, buffer, 0);
            nir_def *emitted = nir_channel(&b, buffer, 2);
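            /* buffer holds the 64-bit generated and emitted primitive
             * counters (channels 0-1 and 2-3); only their low dwords are
             * compared here.
             */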
            nir_if *if_not_equal = nir_push_if(&b, nir_ine(&b, generated, emitted)); {
               nir_store_var(&b, acc_result, one, 0x1);
               nir_store_var(&b, base_offset, zero, 0x1);
               nir_jump(&b, nir_jump_break);
            }
            nir_pop_if(&b, if_not_equal);

            /* stream_offset += sizeof(gfx11_sh_query_buffer_mem.stream[0]); */
            uint64_t buffer_mem_stream0_size =
               sizeof(((struct gfx11_sh_query_buffer_mem*)0)->stream[0]);
            stream_offset_value = nir_iadd_imm(&b, stream_offset_value, buffer_mem_stream0_size);
            nir_store_var(&b, stream_offset, stream_offset_value, 0x1);

            /* } while (--count); */
            nir_def *loop_count = nir_load_var(&b, count);
            loop_count = nir_iadd(&b, loop_count, minus_one);
            nir_store_var(&b, count, loop_count, 0x1);

            nir_if *if_zero = nir_push_if(&b, nir_ieq(&b, loop_count, zero)); {
               nir_jump(&b, nir_jump_break);
            }
            nir_pop_if(&b, if_zero);
         }
         nir_pop_loop(&b, loop_inner); /* Inner loop end */
      }
      nir_pop_if(&b, if_overflow);

      /* base_offset += sizeof(gfx11_sh_query_buffer_mem); */
      nir_def *buffer_mem_size = nir_imm_int(&b, sizeof(struct gfx11_sh_query_buffer_mem));
      nir_store_var(&b, base_offset, nir_iadd(&b, nir_load_var(&b, base_offset), buffer_mem_size), 0x1);
   }
   nir_pop_loop(&b, loop_outer); /* Outer loop end */

   nir_def *acc_result_value = nir_load_var(&b, acc_result);
   nir_def *y_value = nir_load_var(&b, acc_missing);

   /* if (chain & 2) {
    *    store_ssbo(<acc_result, acc_missing>, 2, 0);
    *    ...
    */
   nir_def *is_write_summary_buffer = nir_i2b(&b, nir_iand(&b, nir_channel(&b, buff_0, 2), two));
   nir_if *if_write_summary_buffer = nir_push_if(&b, is_write_summary_buffer); {
      nir_store_ssbo(&b, nir_vec2(&b, acc_result_value, y_value), two, zero);
   } nir_push_else(&b, if_write_summary_buffer); {

      /* } else {
       *    if ((config & 7) == 1) {
       *       acc_result = acc_missing ? 0 : 1;
       *       acc_missing = 0;
       *    }
       *    ...
       */
      nir_def *is_result_available = nir_ieq(&b, mode, one);
      nir_def *is_zero = nir_ieq(&b, y_value, zero);
      acc_result_value = nir_bcsel(&b, is_result_available, nir_b2i32(&b, is_zero), acc_result_value);
      nir_def *ny = nir_bcsel(&b, is_result_available, zero, y_value);

      /* if (!acc_missing) {
       *    store_ssbo(acc_result, 2, 0);
       *    if (config & 8) {
       *       store_ssbo(0, 2, 4);
       *    }
       * }
       */
      nir_if *if_zero = nir_push_if(&b, nir_ieq(&b, ny, zero)); {
         nir_store_ssbo(&b, acc_result_value, two, zero);

         nir_def *is_result_64_bits = nir_i2b(&b, nir_iand_imm(&b, nir_channel(&b, buff_0, 0), 8));
         nir_if *if_result_64_bits = nir_push_if(&b, is_result_64_bits); {
            nir_store_ssbo(&b, zero, two, four);
         }
         nir_pop_if(&b, if_result_64_bits);
      }
      nir_pop_if(&b, if_zero);
   }
   nir_pop_if(&b, if_write_summary_buffer);

   return create_shader_state(sctx, b.shader);
}