1 /*
2 * Copyright 2018 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "gallium/auxiliary/nir/pipe_nir.h"
8 #define AC_SURFACE_INCLUDE_NIR
9 #include "ac_surface.h"
10 #include "si_pipe.h"
11 #include "si_query.h"
12 #include "aco_interface.h"
13 #include "nir_format_convert.h"
14 #include "ac_nir_helpers.h"
15
si_create_shader_state(struct si_context * sctx,nir_shader * nir)16 void *si_create_shader_state(struct si_context *sctx, nir_shader *nir)
17 {
18 sctx->b.screen->finalize_nir(sctx->b.screen, nir);
19 return pipe_shader_from_nir(&sctx->b, nir);
20 }
21
22 /* unpack_2x16(src, x, y): x = src & 0xffff; y = src >> 16; */
unpack_2x16(nir_builder * b,nir_def * src,nir_def ** x,nir_def ** y)23 static void unpack_2x16(nir_builder *b, nir_def *src, nir_def **x, nir_def **y)
24 {
25 *x = nir_iand_imm(b, src, 0xffff);
26 *y = nir_ushr_imm(b, src, 16);
27 }
28
si_create_dcc_retile_cs(struct si_context * sctx,struct radeon_surf * surf)29 void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf)
30 {
31 nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
32 "dcc_retile");
33 b.shader->info.workgroup_size[0] = 8;
34 b.shader->info.workgroup_size[1] = 8;
35 b.shader->info.workgroup_size[2] = 1;
36 b.shader->info.cs.user_data_components_amd = 3;
37 b.shader->info.num_ssbos = 1;
38
39 /* Get user data SGPRs. */
40 nir_def *user_sgprs = nir_load_user_data_amd(&b);
41
42 /* Relative offset from the displayable DCC to the non-displayable DCC in the same buffer. */
43 nir_def *src_dcc_offset = nir_channel(&b, user_sgprs, 0);
44
45 nir_def *src_dcc_pitch, *dst_dcc_pitch, *src_dcc_height, *dst_dcc_height;
46 unpack_2x16(&b, nir_channel(&b, user_sgprs, 1), &src_dcc_pitch, &src_dcc_height);
47 unpack_2x16(&b, nir_channel(&b, user_sgprs, 2), &dst_dcc_pitch, &dst_dcc_height);
48
49 /* Get the 2D coordinates. */
50 nir_def *coord = ac_get_global_ids(&b, 2, 32);
51 nir_def *zero = nir_imm_int(&b, 0);
52
53 /* Multiply the coordinates by the DCC block size (they are DCC block coordinates). */
54 coord = nir_imul(&b, coord, nir_imm_ivec2(&b, surf->u.gfx9.color.dcc_block_width,
55 surf->u.gfx9.color.dcc_block_height));
56
57 nir_def *src_offset =
58 ac_nir_dcc_addr_from_coord(&b, &sctx->screen->info, surf->bpe, &surf->u.gfx9.color.dcc_equation,
59 src_dcc_pitch, src_dcc_height, zero, /* DCC slice size */
60 nir_channel(&b, coord, 0), nir_channel(&b, coord, 1), /* x, y */
61 zero, zero, zero); /* z, sample, pipe_xor */
62 src_offset = nir_iadd(&b, src_offset, src_dcc_offset);
63 nir_def *value = nir_load_ssbo(&b, 1, 8, zero, src_offset, .align_mul=1);
64
65 nir_def *dst_offset =
66 ac_nir_dcc_addr_from_coord(&b, &sctx->screen->info, surf->bpe, &surf->u.gfx9.color.display_dcc_equation,
67 dst_dcc_pitch, dst_dcc_height, zero, /* DCC slice size */
68 nir_channel(&b, coord, 0), nir_channel(&b, coord, 1), /* x, y */
69 zero, zero, zero); /* z, sample, pipe_xor */
70 nir_store_ssbo(&b, value, zero, dst_offset, .write_mask=0x1, .align_mul=1);
71
72 return si_create_shader_state(sctx, b.shader);
73 }
74
gfx9_create_clear_dcc_msaa_cs(struct si_context * sctx,struct si_texture * tex)75 void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex)
76 {
77 nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
78 "clear_dcc_msaa");
79 b.shader->info.workgroup_size[0] = 8;
80 b.shader->info.workgroup_size[1] = 8;
81 b.shader->info.workgroup_size[2] = 1;
82 b.shader->info.cs.user_data_components_amd = 2;
83 b.shader->info.num_ssbos = 1;
84
85 /* Get user data SGPRs. */
86 nir_def *user_sgprs = nir_load_user_data_amd(&b);
87 nir_def *dcc_pitch, *dcc_height, *clear_value, *pipe_xor;
88 unpack_2x16(&b, nir_channel(&b, user_sgprs, 0), &dcc_pitch, &dcc_height);
89 unpack_2x16(&b, nir_channel(&b, user_sgprs, 1), &clear_value, &pipe_xor);
90 clear_value = nir_u2u16(&b, clear_value);
91
92 /* Get the 2D coordinates. */
93 nir_def *coord = ac_get_global_ids(&b, 3, 32);
94 nir_def *zero = nir_imm_int(&b, 0);
95
96 /* Multiply the coordinates by the DCC block size (they are DCC block coordinates). */
97 coord = nir_imul(&b, coord,
98 nir_imm_ivec3(&b, tex->surface.u.gfx9.color.dcc_block_width,
99 tex->surface.u.gfx9.color.dcc_block_height,
100 tex->surface.u.gfx9.color.dcc_block_depth));
101
102 nir_def *offset =
103 ac_nir_dcc_addr_from_coord(&b, &sctx->screen->info, tex->surface.bpe,
104 &tex->surface.u.gfx9.color.dcc_equation,
105 dcc_pitch, dcc_height, zero, /* DCC slice size */
106 nir_channel(&b, coord, 0), nir_channel(&b, coord, 1), /* x, y */
107 tex->buffer.b.b.array_size > 1 ? nir_channel(&b, coord, 2) : zero, /* z */
108 zero, pipe_xor); /* sample, pipe_xor */
109
110 /* The trick here is that DCC elements for an even and the next odd sample are next to each other
111 * in memory, so we only need to compute the address for sample 0 and the next DCC byte is always
112 * sample 1. That's why the clear value has 2 bytes - we're clearing 2 samples at the same time.
113 */
114 nir_store_ssbo(&b, clear_value, zero, offset, .write_mask=0x1, .align_mul=2);
115
116 return si_create_shader_state(sctx, b.shader);
117 }
118
119 /* Create a compute shader implementing clear_buffer or copy_buffer. */
si_create_clear_buffer_rmw_cs(struct si_context * sctx)120 void *si_create_clear_buffer_rmw_cs(struct si_context *sctx)
121 {
122 nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
123 "clear_buffer_rmw_cs");
124 b.shader->info.workgroup_size[0] = 64;
125 b.shader->info.workgroup_size[1] = 1;
126 b.shader->info.workgroup_size[2] = 1;
127 b.shader->info.cs.user_data_components_amd = 2;
128 b.shader->info.num_ssbos = 1;
129
130 /* address = blockID * 64 + threadID; */
131 nir_def *address = ac_get_global_ids(&b, 1, 32);
132
133 /* address = address * 16; (byte offset, loading one vec4 per thread) */
134 address = nir_ishl_imm(&b, address, 4);
135
136 nir_def *zero = nir_imm_int(&b, 0);
137 nir_def *data = nir_load_ssbo(&b, 4, 32, zero, address, .align_mul = 4);
138
139 /* Get user data SGPRs. */
140 nir_def *user_sgprs = nir_load_user_data_amd(&b);
141
142 /* data &= inverted_writemask; */
143 data = nir_iand(&b, data, nir_channel(&b, user_sgprs, 1));
144 /* data |= clear_value_masked; */
145 data = nir_ior(&b, data, nir_channel(&b, user_sgprs, 0));
146
147 nir_store_ssbo(&b, data, zero, address, .align_mul = 4);
148
149 return si_create_shader_state(sctx, b.shader);
150 }
151
152 /* This is used when TCS is NULL in the VS->TCS->TES chain. In this case,
153 * VS passes its outputs to TES directly, so the fixed-function shader only
154 * has to write TESSOUTER and TESSINNER.
155 */
si_create_passthrough_tcs(struct si_context * sctx)156 void *si_create_passthrough_tcs(struct si_context *sctx)
157 {
158 unsigned locations[PIPE_MAX_SHADER_OUTPUTS];
159
160 struct si_shader_info *info = &sctx->shader.vs.cso->info;
161 for (unsigned i = 0; i < info->num_outputs; i++) {
162 locations[i] = info->output_semantic[i];
163 }
164
165 nir_shader *tcs = nir_create_passthrough_tcs_impl(sctx->screen->nir_options, locations,
166 info->num_outputs, sctx->patch_vertices);
167
168 return si_create_shader_state(sctx, tcs);
169 }
170
171 /* Store the clear color at the beginning of every 256B block. This is required when we clear DCC
172 * to GFX11_DCC_CLEAR_SINGLE.
173 */
si_clear_image_dcc_single_shader(struct si_context * sctx,bool is_msaa,unsigned wg_dim)174 void *si_clear_image_dcc_single_shader(struct si_context *sctx, bool is_msaa, unsigned wg_dim)
175 {
176 nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
177 "write_clear_color_dcc_single");
178 b.shader->info.num_images = 1;
179 if (is_msaa)
180 BITSET_SET(b.shader->info.msaa_images, 0);
181 b.shader->info.workgroup_size[0] = 8;
182 b.shader->info.workgroup_size[1] = 8;
183 b.shader->info.cs.user_data_components_amd = 5;
184
185 const struct glsl_type *img_type =
186 glsl_image_type(is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D, true, GLSL_TYPE_FLOAT);
187 nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "out_img");
188 output_img->data.binding = 0;
189
190 nir_def *global_id = nir_pad_vector_imm_int(&b, ac_get_global_ids(&b, wg_dim, 32), 0, 3);
191 nir_def *clear_color = nir_trim_vector(&b, nir_load_user_data_amd(&b), 4);
192
193 nir_def *dcc_block_width, *dcc_block_height;
194 unpack_2x16(&b, nir_channel(&b, nir_load_user_data_amd(&b), 4), &dcc_block_width,
195 &dcc_block_height);
196
197 /* Compute the coordinates. */
198 nir_def *coord = nir_trim_vector(&b, global_id, 2);
199 coord = nir_imul(&b, coord, nir_vec2(&b, dcc_block_width, dcc_block_height));
200 coord = nir_vec4(&b, nir_channel(&b, coord, 0), nir_channel(&b, coord, 1),
201 nir_channel(&b, global_id, 2), nir_undef(&b, 1, 32));
202
203 /* Store the clear color. */
204 nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->def, coord, nir_imm_int(&b, 0),
205 clear_color, nir_imm_int(&b, 0),
206 .image_dim = img_type->sampler_dimensionality,
207 .image_array = img_type->sampler_array);
208
209 return si_create_shader_state(sctx, b.shader);
210 }
211
si_create_ubyte_to_ushort_compute_shader(struct si_context * sctx)212 void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
213 {
214 nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
215 "ubyte_to_ushort");
216 b.shader->info.workgroup_size[0] = 64;
217 b.shader->info.workgroup_size[1] = 1;
218 b.shader->info.workgroup_size[2] = 1;
219 b.shader->info.num_ssbos = 2;
220
221 nir_def *load_address = ac_get_global_ids(&b, 1, 32);
222 nir_def *store_address = nir_imul_imm(&b, load_address, 2);
223
224 nir_def *ubyte_value = nir_load_ssbo(&b, 1, 8, nir_imm_int(&b, 1),
225 load_address, .access = ACCESS_RESTRICT);
226 nir_store_ssbo(&b, nir_u2u16(&b, ubyte_value), nir_imm_int(&b, 0),
227 store_address, .access = ACCESS_RESTRICT);
228
229 return si_create_shader_state(sctx, b.shader);
230 }
231
232 /* Load samples from the image, and copy them to the same image. This looks like
233 * a no-op, but it's not. Loads use FMASK, while stores don't, so samples are
234 * reordered to match expanded FMASK.
235 *
236 * After the shader finishes, FMASK should be cleared to identity.
237 */
si_create_fmask_expand_cs(struct si_context * sctx,unsigned num_samples,bool is_array)238 void *si_create_fmask_expand_cs(struct si_context *sctx, unsigned num_samples, bool is_array)
239 {
240 nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
241 "create_fmask_expand_cs");
242 b.shader->info.workgroup_size[0] = 8;
243 b.shader->info.workgroup_size[1] = 8;
244 b.shader->info.workgroup_size[2] = 1;
245
246 /* Return an empty compute shader */
247 if (num_samples == 0)
248 return si_create_shader_state(sctx, b.shader);
249
250 b.shader->info.num_images = 1;
251
252 const struct glsl_type *img_type = glsl_image_type(GLSL_SAMPLER_DIM_MS, is_array, GLSL_TYPE_FLOAT);
253 nir_variable *img = nir_variable_create(b.shader, nir_var_image, img_type, "image");
254 img->data.access = ACCESS_RESTRICT;
255
256 nir_def *z = nir_undef(&b, 1, 32);
257 if (is_array) {
258 z = nir_channel(&b, nir_load_workgroup_id(&b), 2);
259 }
260
261 nir_def *zero_lod = nir_imm_int(&b, 0);
262 nir_def *address = ac_get_global_ids(&b, 2, 32);
263
264 nir_def *coord[8], *values[8];
265 assert(num_samples <= ARRAY_SIZE(coord));
266
267 nir_def *img_deref = &nir_build_deref_var(&b, img)->def;
268
269 /* Load samples, resolving FMASK. */
270 for (unsigned i = 0; i < num_samples; i++) {
271 nir_def *sample = nir_imm_int(&b, i);
272 coord[i] = nir_vec4(&b, nir_channel(&b, address, 0), nir_channel(&b, address, 1), z,
273 nir_undef(&b, 1, 32));
274 values[i] = nir_image_deref_load(&b, 4, 32, img_deref, coord[i], sample, zero_lod,
275 .access = ACCESS_RESTRICT,
276 .image_dim = GLSL_SAMPLER_DIM_2D,
277 .image_array = is_array);
278 }
279
280 /* Store samples, ignoring FMASK. */
281 for (unsigned i = 0; i < num_samples; i++) {
282 nir_def *sample = nir_imm_int(&b, i);
283 nir_image_deref_store(&b, img_deref, coord[i], sample, values[i], zero_lod,
284 .access = ACCESS_RESTRICT,
285 .image_dim = GLSL_SAMPLER_DIM_2D,
286 .image_array = is_array);
287 }
288
289 return si_create_shader_state(sctx, b.shader);
290 }
291
292 /* This is just a pass-through shader with 1-3 MOV instructions. */
si_get_blitter_vs(struct si_context * sctx,enum blitter_attrib_type type,unsigned num_layers)293 void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, unsigned num_layers)
294 {
295 unsigned vs_blit_property;
296 void **vs;
297
298 switch (type) {
299 case UTIL_BLITTER_ATTRIB_NONE:
300 vs = num_layers > 1 ? &sctx->vs_blit_pos_layered : &sctx->vs_blit_pos;
301 vs_blit_property = SI_VS_BLIT_SGPRS_POS;
302 break;
303 case UTIL_BLITTER_ATTRIB_COLOR:
304 vs = num_layers > 1 ? &sctx->vs_blit_color_layered : &sctx->vs_blit_color;
305 vs_blit_property = SI_VS_BLIT_SGPRS_POS_COLOR;
306 break;
307 case UTIL_BLITTER_ATTRIB_TEXCOORD_XY:
308 case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW:
309 assert(num_layers == 1);
310 vs = &sctx->vs_blit_texcoord;
311 vs_blit_property = SI_VS_BLIT_SGPRS_POS_TEXCOORD;
312 break;
313 default:
314 assert(0);
315 return NULL;
316 }
317
318 if (*vs)
319 return *vs;
320
321 /* Add 1 for the attribute ring address. */
322 if (sctx->gfx_level >= GFX11 && type != UTIL_BLITTER_ATTRIB_NONE)
323 vs_blit_property++;
324
325 nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_VERTEX, sctx->screen->nir_options,
326 "get_blitter_vs");
327
328 /* Tell the shader to load VS inputs from SGPRs: */
329 b.shader->info.vs.blit_sgprs_amd = vs_blit_property;
330 b.shader->info.vs.window_space_position = true;
331 b.shader->info.io_lowered = true;
332
333 nir_def *pos = nir_load_input(&b, 4, 32, nir_imm_int(&b, 0),
334 .dest_type = nir_type_float32,
335 .io_semantics.num_slots = 1,
336 .io_semantics.location = VERT_ATTRIB_GENERIC0);
337 nir_store_output(&b, pos, nir_imm_int(&b, 0),
338 .src_type = nir_type_float32,
339 .io_semantics.num_slots = 1,
340 .io_semantics.location = VARYING_SLOT_POS);
341
342 if (type != UTIL_BLITTER_ATTRIB_NONE) {
343 nir_def *attr = nir_load_input(&b, 4, 32, nir_imm_int(&b, 0),
344 .dest_type = nir_type_float32,
345 .io_semantics.num_slots = 1,
346 .io_semantics.location = VERT_ATTRIB_GENERIC1);
347 nir_store_output(&b, attr, nir_imm_int(&b, 0),
348 .src_type = nir_type_float32,
349 .io_semantics.num_slots = 1,
350 .io_semantics.location = VARYING_SLOT_VAR0);
351 }
352
353 if (num_layers > 1) {
354 nir_store_output(&b, nir_load_instance_id(&b), nir_imm_int(&b, 0),
355 .src_type = nir_type_float32,
356 .io_semantics.num_slots = 1,
357 .io_semantics.location = VARYING_SLOT_LAYER);
358 }
359
360 NIR_PASS(_, b.shader, nir_recompute_io_bases, nir_var_shader_in | nir_var_shader_out);
361
362 *vs = si_create_shader_state(sctx, b.shader);
363 return *vs;
364 }
365
366 /* Create the compute shader that is used to collect the results.
367 *
368 * One compute grid with a single thread is launched for every query result
369 * buffer. The thread (optionally) reads a previous summary buffer, then
370 * accumulates data from the query result buffer, and writes the result either
371 * to a summary buffer to be consumed by the next grid invocation or to the
372 * user-supplied buffer.
373 *
374 * Data layout:
375 *
376 * CONST
377 * 0.x = end_offset
378 * 0.y = result_stride
379 * 0.z = result_count
380 * 0.w = bit field:
381 * 1: read previously accumulated values
382 * 2: write accumulated values for chaining
383 * 4: write result available
384 * 8: convert result to boolean (0/1)
385 * 16: only read one dword and use that as result
386 * 32: apply timestamp conversion
387 * 64: store full 64 bits result
388 * 128: store signed 32 bits result
389 * 256: SO_OVERFLOW mode: take the difference of two successive half-pairs
390 * 1.x = fence_offset
391 * 1.y = pair_stride
392 * 1.z = pair_count
393 *
394 */
si_create_query_result_cs(struct si_context * sctx)395 void *si_create_query_result_cs(struct si_context *sctx)
396 {
397 nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
398 "create_query_result_cs");
399 b.shader->info.workgroup_size[0] = 1;
400 b.shader->info.workgroup_size[1] = 1;
401 b.shader->info.workgroup_size[2] = 1;
402 b.shader->info.num_ubos = 1;
403 b.shader->info.num_ssbos = 3;
404 b.shader->num_uniforms = 2;
405
406 nir_def *var_undef = nir_undef(&b, 1, 32);
407 nir_def *zero = nir_imm_int(&b, 0);
408 nir_def *one = nir_imm_int(&b, 1);
409 nir_def *two = nir_imm_int(&b, 2);
410 nir_def *four = nir_imm_int(&b, 4);
411 nir_def *eight = nir_imm_int(&b, 8);
412 nir_def *sixteen = nir_imm_int(&b, 16);
413 nir_def *thirty_one = nir_imm_int(&b, 31);
414 nir_def *sixty_four = nir_imm_int(&b, 64);
415
416 /* uint32_t x, y, z = 0; */
417 nir_function_impl *e = nir_shader_get_entrypoint(b.shader);
418 nir_variable *x = nir_local_variable_create(e, glsl_uint_type(), "x");
419 nir_store_var(&b, x, var_undef, 0x1);
420 nir_variable *y = nir_local_variable_create(e, glsl_uint_type(), "y");
421 nir_store_var(&b, y, var_undef, 0x1);
422 nir_variable *z = nir_local_variable_create(e, glsl_uint_type(), "z");
423 nir_store_var(&b, z, zero, 0x1);
424
425 /* uint32_t buff_0[4] = load_ubo(0, 0); */
426 nir_def *buff_0 = nir_load_ubo(&b, 4, 32, zero, zero, .range_base = 0, .range = 16);
427 /* uint32_t buff_1[4] = load_ubo(1, 16); */
428 nir_def *buff_1 = nir_load_ubo(&b, 4, 32, zero, sixteen, .range_base = 16, .range = 16);
429
430 /* uint32_t b0_bitfield = buff_0.w; */
431 nir_def *b0_bitfield = nir_channel(&b, buff_0, 3);
432
433 /* Check result availability.
434 * if (b0_bitfield & (1u << 4)) {
435 * ...
436 */
437 nir_def *is_one_dword_result = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixteen));
438 nir_if *if_one_dword_result = nir_push_if(&b, is_one_dword_result); {
439
440 /* int32_t value = load_ssbo(0, fence_offset);
441 * z = ~(value >> 31);
442 */
443 nir_def *value = nir_load_ssbo(&b, 1, 32, zero, nir_channel(&b, buff_1, 0));
444 nir_def *bitmask = nir_inot(&b, nir_ishr(&b, value, thirty_one));
445 nir_store_var(&b, z, bitmask, 0x1);
446
447 /* Load result if available.
448 * if (value < 0) {
449 * uint32_t result[2] = load_ssbo(0, 0);
450 * x = result[0];
451 * y = result[1];
452 * }
453 */
454 nir_if *if_negative = nir_push_if(&b, nir_ilt(&b, value, zero)); {
455 nir_def *result = nir_load_ssbo(&b, 2, 32, zero, zero);
456 nir_store_var(&b, x, nir_channel(&b, result, 0), 0x1);
457 nir_store_var(&b, y, nir_channel(&b, result, 1), 0x1);
458 }
459 nir_pop_if(&b, if_negative);
460 } nir_push_else(&b, if_one_dword_result); {
461
462 /* } else {
463 * x = 0; y = 0;
464 */
465 nir_store_var(&b, x, zero, 0x1);
466 nir_store_var(&b, y, zero, 0x1);
467
468 /* Load previously accumulated result if requested.
469 * if (b0_bitfield & (1u << 0)) {
470 * uint32_t result[3] = load_ssbo(1, 0);
471 * x = result[0];
472 * y = result[1];
473 * z = result[2];
474 * }
475 */
476 nir_def *is_prev_acc_result = nir_i2b(&b, nir_iand(&b, b0_bitfield, one));
477 nir_if *if_prev_acc_result = nir_push_if(&b, is_prev_acc_result); {
478 nir_def *result = nir_load_ssbo(&b, 3, 32, one, zero);
479 nir_store_var(&b, x, nir_channel(&b, result, 0), 0x1);
480 nir_store_var(&b, y, nir_channel(&b, result, 1), 0x1);
481 nir_store_var(&b, z, nir_channel(&b, result, 2), 0x1);
482 }
483 nir_pop_if(&b, if_prev_acc_result);
484
485 /* if (!z) {
486 * uint32_t result_index = 0;
487 * uint32_t pitch = 0;
488 * ...
489 */
490 nir_def *z_value = nir_load_var(&b, z);
491 nir_if *if_not_z = nir_push_if(&b, nir_ieq(&b, z_value, zero)); {
492 nir_variable *outer_loop_iter =
493 nir_local_variable_create(e, glsl_uint_type(), "outer_loop_iter");
494 nir_store_var(&b, outer_loop_iter, zero, 0x1);
495 nir_variable *pitch = nir_local_variable_create(e, glsl_uint_type(), "pitch");
496 nir_store_var(&b, pitch, zero, 0x1);
497
498 /* Outer loop.
499 * while (result_index <= result_count) {
500 * ...
501 */
502 nir_loop *loop_outer = nir_push_loop(&b); {
503 nir_def *result_index = nir_load_var(&b, outer_loop_iter);
504 nir_def *is_result_index_out_of_bound =
505 nir_uge(&b, result_index, nir_channel(&b, buff_0, 2));
506 nir_if *if_out_of_bound = nir_push_if(&b, is_result_index_out_of_bound); {
507 nir_jump(&b, nir_jump_break);
508 }
509 nir_pop_if(&b, if_out_of_bound);
510
511 /* Load fence and check result availability.
512 * pitch = i * result_stride;
513 * uint32_t address = fence_offset + pitch;
514 * int32_t value = load_ssbo(0, address);
515 * z = ~(value >> 31);
516 */
517 nir_def *pitch_outer_loop = nir_imul(&b, result_index, nir_channel(&b, buff_0, 1));
518 nir_store_var(&b, pitch, pitch_outer_loop, 0x1);
519 nir_def *address = nir_iadd(&b, pitch_outer_loop, nir_channel(&b, buff_1, 0));
520 nir_def *value = nir_load_ssbo(&b, 1, 32, zero, address);
521 nir_def *bitmask = nir_inot(&b, nir_ishr(&b, value, thirty_one));
522 nir_store_var(&b, z, bitmask, 0x1);
523
524 /* if (z) {
525 * break;
526 * }
527 */
528 nir_if *if_result_available = nir_push_if(&b, nir_i2b(&b, bitmask)); {
529 nir_jump(&b, nir_jump_break);
530 }
531 nir_pop_if(&b, if_result_available);
532
533 /* Inner loop iterator.
534 * uint32_t i = 0;
535 */
536 nir_variable *inner_loop_iter =
537 nir_local_variable_create(e, glsl_uint_type(), "inner_loop_iter");
538 nir_store_var(&b, inner_loop_iter, zero, 0x1);
539
540 /* Inner loop.
541 * do {
542 * ...
543 */
544 nir_loop *loop_inner = nir_push_loop(&b); {
545 nir_def *pitch_inner_loop = nir_load_var(&b, pitch);
546 nir_def *i = nir_load_var(&b, inner_loop_iter);
547
548 /* Load start and end.
549 * uint64_t first = load_ssbo(0, pitch);
550 * uint64_t second = load_ssbo(0, pitch + end_offset);
551 * uint64_t start_half_pair = second - first;
552 */
553 nir_def *first = nir_load_ssbo(&b, 1, 64, zero, pitch_inner_loop);
554 nir_def *new_pitch = nir_iadd(&b, pitch_inner_loop, nir_channel(&b, buff_0, 0));
555 nir_def *second = nir_load_ssbo(&b, 1, 64, zero, new_pitch);
556 nir_def *start_half_pair = nir_isub(&b, second, first);
557
558 /* Load second start/end half-pair and take the difference.
559 * if (b0_bitfield & (1u << 8)) {
560 * uint64_t first = load_ssbo(0, pitch + 8);
561 * uint64_t second = load_ssbo(0, pitch + end_offset + 8);
562 * uint64_t end_half_pair = second - first;
563 * uint64_t difference = start_half_pair - end_half_pair;
564 * }
565 */
566 nir_def *difference;
567 nir_def *is_so_overflow_mode = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 256));
568 nir_if *if_so_overflow_mode = nir_push_if(&b, is_so_overflow_mode); {
569 first = nir_load_ssbo(&b, 1, 64, zero, nir_iadd(&b, pitch_inner_loop, eight));
570 second = nir_load_ssbo(&b, 1, 64, zero, nir_iadd(&b, new_pitch, eight));
571 nir_def *end_half_pair = nir_isub(&b, second, first);
572 difference = nir_isub(&b, start_half_pair, end_half_pair);
573 }
574 nir_pop_if(&b, if_so_overflow_mode);
575
576 /* uint64_t sum = (x | (uint64_t) y << 32) + difference; */
577 nir_def *sum = nir_iadd(&b,
578 nir_pack_64_2x32_split(&b,
579 nir_load_var(&b, x),
580 nir_load_var(&b, y)),
581 nir_if_phi(&b, difference, start_half_pair));
582 sum = nir_unpack_64_2x32(&b, sum);
583
584 /* Increment inner loop iterator.
585 * i++;
586 */
587 i = nir_iadd(&b, i, one);
588 nir_store_var(&b, inner_loop_iter, i, 0x1);
589
590 /* Update pitch value.
591 * pitch = i * pair_stride + pitch;
592 */
593 nir_def *incremented_pitch = nir_iadd(&b,
594 nir_imul(&b, i, nir_channel(&b, buff_1, 1)),
595 pitch_outer_loop);
596 nir_store_var(&b, pitch, incremented_pitch, 0x1);
597
598 /* Update x and y.
599 * x = sum.x;
600 * y = sum.x >> 32;
601 */
602 nir_store_var(&b, x, nir_channel(&b, sum, 0), 0x1);
603 nir_store_var(&b, y, nir_channel(&b, sum, 1), 0x1);
604
605 /* } while (i < pair_count);
606 */
607 nir_def *is_pair_count_exceeded = nir_uge(&b, i, nir_channel(&b, buff_1, 2));
608 nir_if *if_pair_count_exceeded = nir_push_if(&b, is_pair_count_exceeded); {
609 nir_jump(&b, nir_jump_break);
610 }
611 nir_pop_if(&b, if_pair_count_exceeded);
612 }
613 nir_pop_loop(&b, loop_inner);
614
615 /* Increment pair iterator.
616 * result_index++;
617 */
618 nir_store_var(&b, outer_loop_iter, nir_iadd(&b, result_index, one), 0x1);
619 }
620 nir_pop_loop(&b, loop_outer);
621 }
622 nir_pop_if(&b, if_not_z);
623 }
624 nir_pop_if(&b, if_one_dword_result);
625
626 nir_def *x_value = nir_load_var(&b, x);
627 nir_def *y_value = nir_load_var(&b, y);
628 nir_def *z_value = nir_load_var(&b, z);
629
630 /* Store accumulated data for chaining.
631 * if (b0_bitfield & (1u << 1)) {
632 * store_ssbo(<x, y, z>, 2, 0);
633 */
634 nir_def *is_acc_chaining = nir_i2b(&b, nir_iand(&b, b0_bitfield, two));
635 nir_if *if_acc_chaining = nir_push_if(&b, is_acc_chaining); {
636 nir_store_ssbo(&b, nir_vec3(&b, x_value, y_value, z_value), two, zero);
637 } nir_push_else(&b, if_acc_chaining); {
638
639 /* Store result availability.
640 * } else {
641 * if (b0_bitfield & (1u << 2)) {
642 * store_ssbo((~z & 1), 2, 0);
643 * ...
644 */
645 nir_def *is_result_available = nir_i2b(&b, nir_iand(&b, b0_bitfield, four));
646 nir_if *if_result_available = nir_push_if(&b, is_result_available); {
647 nir_store_ssbo(&b, nir_iand(&b, nir_inot(&b, z_value), one), two, zero);
648
649 /* Store full 64 bits result.
650 * if (b0_bitfield & (1u << 6)) {
651 * store_ssbo(<0, 0>, 2, 0);
652 * }
653 */
654 nir_def *is_result_64_bits = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixty_four));
655 nir_if *if_result_64_bits = nir_push_if(&b, is_result_64_bits); {
656 nir_store_ssbo(&b, nir_imm_ivec2(&b, 0, 0), two, zero,
657 .write_mask = (1u << 1));
658 }
659 nir_pop_if(&b, if_result_64_bits);
660 } nir_push_else(&b, if_result_available); {
661
662 /* } else {
663 * if (~z) {
664 * ...
665 */
666 nir_def *is_bitwise_not_z = nir_i2b(&b, nir_inot(&b, z_value));
667 nir_if *if_bitwise_not_z = nir_push_if(&b, is_bitwise_not_z); {
668 nir_def *ts_x, *ts_y;
669
670 /* Apply timestamp conversion.
671 * if (b0_bitfield & (1u << 5)) {
672 * uint64_t xy_million = (x | (uint64_t) y << 32) * (uint64_t) 1000000;
673 * uint64_t ts_converted = xy_million / (uint64_t) clock_crystal_frequency;
674 * x = ts_converted.x;
675 * y = ts_converted.x >> 32;
676 * }
677 */
678 nir_def *is_apply_timestamp = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 32));
679 nir_if *if_apply_timestamp = nir_push_if(&b, is_apply_timestamp); {
680 /* Add the frequency into the shader for timestamp conversion
681 * so that the backend can use the full range of optimizations
682 * for divide-by-constant.
683 */
684 nir_def *clock_crystal_frequency =
685 nir_imm_int64(&b, sctx->screen->info.clock_crystal_freq);
686
687 nir_def *xy_million = nir_imul(&b,
688 nir_pack_64_2x32_split(&b, x_value, y_value),
689 nir_imm_int64(&b, 1000000));
690 nir_def *ts_converted = nir_udiv(&b, xy_million, clock_crystal_frequency);
691 ts_converted = nir_unpack_64_2x32(&b, ts_converted);
692 ts_x = nir_channel(&b, ts_converted, 0);
693 ts_y = nir_channel(&b, ts_converted, 1);
694 }
695 nir_pop_if(&b, if_apply_timestamp);
696
697 nir_def *nx = nir_if_phi(&b, ts_x, x_value);
698 nir_def *ny = nir_if_phi(&b, ts_y, y_value);
699
700 /* x = b0_bitfield & (1u << 3) ? ((x | (uint64_t) y << 32) != 0) : x;
701 * y = b0_bitfield & (1u << 3) ? 0 : y;
702 */
703 nir_def *is_convert_to_bool = nir_i2b(&b, nir_iand(&b, b0_bitfield, eight));
704 nir_def *xy = nir_pack_64_2x32_split(&b, nx, ny);
705 nir_def *is_xy = nir_b2i32(&b, nir_ine(&b, xy, nir_imm_int64(&b, 0)));
706 nx = nir_bcsel(&b, is_convert_to_bool, is_xy, nx);
707 ny = nir_bcsel(&b, is_convert_to_bool, zero, ny);
708
709 /* if (b0_bitfield & (1u << 6)) {
710 * store_ssbo(<x, y>, 2, 0);
711 * }
712 */
713 nir_def *is_result_64_bits = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixty_four));
714 nir_if *if_result_64_bits = nir_push_if(&b, is_result_64_bits); {
715 nir_store_ssbo(&b, nir_vec2(&b, nx, ny), two, zero);
716 } nir_push_else(&b, if_result_64_bits); {
717
718 /* Clamping.
719 * } else {
720 * x = y ? UINT32_MAX : x;
721 * x = b0_bitfield & (1u << 7) ? min(x, INT_MAX) : x;
722 * store_ssbo(x, 2, 0);
723 * }
724 */
725 nir_def *is_y = nir_ine(&b, ny, zero);
726 nx = nir_bcsel(&b, is_y, nir_imm_int(&b, UINT32_MAX), nx);
727 nir_def *is_signed_32bit_result = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 128));
728 nir_def *min = nir_umin(&b, nx, nir_imm_int(&b, INT_MAX));
729 nx = nir_bcsel(&b, is_signed_32bit_result, min, nx);
730 nir_store_ssbo(&b, nx, two, zero);
731 }
732 nir_pop_if(&b, if_result_64_bits);
733 }
734 nir_pop_if(&b, if_bitwise_not_z);
735 }
736 nir_pop_if(&b, if_result_available);
737 }
738 nir_pop_if(&b, if_acc_chaining);
739
740 return si_create_shader_state(sctx, b.shader);
741 }
742
743 /* Create the compute shader that is used to collect the results of gfx10+
744 * shader queries.
745 *
746 * One compute grid with a single thread is launched for every query result
747 * buffer. The thread (optionally) reads a previous summary buffer, then
748 * accumulates data from the query result buffer, and writes the result either
749 * to a summary buffer to be consumed by the next grid invocation or to the
750 * user-supplied buffer.
751 *
752 * Data layout:
753 *
754 * CONST
755 * 0.x = config;
756 * [0:2] the low 3 bits indicate the mode:
757 * 0: sum up counts
758 * 1: determine result availability and write it as a boolean
759 * 2: SO_OVERFLOW
760 * 3: SO_ANY_OVERFLOW
761 * the remaining bits form a bitfield:
762 * 8: write result as a 64-bit value
763 * 0.y = offset in bytes to counts or stream for SO_OVERFLOW mode
764 * 0.z = chain bit field:
765 * 1: have previous summary buffer
766 * 2: write next summary buffer
767 * 0.w = result_count
768 */
gfx11_create_sh_query_result_cs(struct si_context * sctx)769 void *gfx11_create_sh_query_result_cs(struct si_context *sctx)
770 {
771 nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
772 "gfx11_create_sh_query_result_cs");
773 b.shader->info.workgroup_size[0] = 1;
774 b.shader->info.workgroup_size[1] = 1;
775 b.shader->info.workgroup_size[2] = 1;
776 b.shader->info.num_ubos = 1;
777 b.shader->info.num_ssbos = 3;
778 b.shader->num_uniforms = 1;
779
780 nir_def *zero = nir_imm_int(&b, 0);
781 nir_def *one = nir_imm_int(&b, 1);
782 nir_def *two = nir_imm_int(&b, 2);
783 nir_def *four = nir_imm_int(&b, 4);
784 nir_def *minus_one = nir_imm_int(&b, 0xffffffff);
785
786 /* uint32_t acc_result = 0, acc_missing = 0; */
787 nir_function_impl *e = nir_shader_get_entrypoint(b.shader);
788 nir_variable *acc_result = nir_local_variable_create(e, glsl_uint_type(), "acc_result");
789 nir_store_var(&b, acc_result, zero, 0x1);
790 nir_variable *acc_missing = nir_local_variable_create(e, glsl_uint_type(), "acc_missing");
791 nir_store_var(&b, acc_missing, zero, 0x1);
792
793 /* uint32_t buff_0[4] = load_ubo(0, 0); */
794 nir_def *buff_0 = nir_load_ubo(&b, 4, 32, zero, zero, .range_base = 0, .range = 16);
795
796 /* if((chain & 1) {
797 * uint32_t result[2] = load_ssbo(1, 0);
798 * acc_result = result[0];
799 * acc_missing = result[1];
800 * }
801 */
802 nir_def *is_prev_summary_buffer = nir_i2b(&b, nir_iand(&b, nir_channel(&b, buff_0, 2), one));
803 nir_if *if_prev_summary_buffer = nir_push_if(&b, is_prev_summary_buffer); {
804 nir_def *result = nir_load_ssbo(&b, 2, 32, one, zero);
805 nir_store_var(&b, acc_result, nir_channel(&b, result, 0), 0x1);
806 nir_store_var(&b, acc_missing, nir_channel(&b, result, 1), 0x1);
807 }
808 nir_pop_if(&b, if_prev_summary_buffer);
809
810 /* uint32_t mode = config & 0b111;
811 * bool is_overflow = mode >= 2;
812 */
813 nir_def *mode = nir_iand_imm(&b, nir_channel(&b, buff_0, 0), 0b111);
814 nir_def *is_overflow = nir_uge(&b, mode, two);
815
816 /* uint32_t result_remaining = (is_overflow && acc_result) ? 0 : result_count; */
817 nir_variable *result_remaining = nir_local_variable_create(e, glsl_uint_type(), "result_remaining");
818 nir_variable *base_offset = nir_local_variable_create(e, glsl_uint_type(), "base_offset");
819 nir_def *state = nir_iand(&b,
820 nir_isub(&b, zero, nir_b2i32(&b, is_overflow)),
821 nir_load_var(&b, acc_result));
822 nir_def *value = nir_bcsel(&b, nir_i2b(&b, state), zero, nir_channel(&b, buff_0, 3));
823 nir_store_var(&b, result_remaining, value, 0x1);
824
825 /* uint32_t base_offset = 0; */
826 nir_store_var(&b, base_offset, zero, 0x1);
827
828 /* Outer loop begin.
829 * while (!result_remaining) {
830 * ...
831 */
832 nir_loop *loop_outer = nir_push_loop(&b); {
833 nir_def *condition = nir_load_var(&b, result_remaining);
834 nir_if *if_not_condition = nir_push_if(&b, nir_ieq(&b, condition, zero)); {
835 nir_jump(&b, nir_jump_break);
836 }
837 nir_pop_if(&b, if_not_condition);
838
839 /* result_remaining--; */
840 condition = nir_iadd(&b, condition, minus_one);
841 nir_store_var(&b, result_remaining, condition, 0x1);
842
843 /* uint32_t fence = load_ssbo(0, base_offset + sizeof(gfx11_sh_query_buffer_mem.stream)); */
844 nir_def *b_offset = nir_load_var(&b, base_offset);
845 uint64_t buffer_mem_stream_size = sizeof(((struct gfx11_sh_query_buffer_mem*)0)->stream);
846 nir_def *fence = nir_load_ssbo(&b, 1, 32, zero,
847 nir_iadd_imm(&b, b_offset, buffer_mem_stream_size));
848
849 /* if (!fence) {
850 * acc_missing = ~0u;
851 * break;
852 * }
853 */
854 nir_def *is_zero = nir_ieq(&b, fence, zero);
855 nir_def *y_value = nir_isub(&b, zero, nir_b2i32(&b, is_zero));
856 nir_store_var(&b, acc_missing, y_value, 0x1);
857 nir_if *if_ssbo_zero = nir_push_if(&b, is_zero); {
858 nir_jump(&b, nir_jump_break);
859 }
860 nir_pop_if(&b, if_ssbo_zero);
861
862 /* stream_offset = base_offset + offset; */
863 nir_def *s_offset = nir_iadd(&b, b_offset, nir_channel(&b, buff_0, 1));
864
865 /* if (!(config & 7)) {
866 * acc_result += buffer[0]@stream_offset;
867 * }
868 */
869 nir_if *if_sum_up_counts = nir_push_if(&b, nir_ieq(&b, mode, zero)); {
870 nir_def *x_value = nir_load_ssbo(&b, 1, 32, zero, s_offset);
871 x_value = nir_iadd(&b, nir_load_var(&b, acc_result), x_value);
872 nir_store_var(&b, acc_result, x_value, 0x1);
873 }
874 nir_pop_if(&b, if_sum_up_counts);
875
876 /* if (is_overflow) {
877 * uint32_t count = (config & 1) ? 4 : 1;
878 * ...
879 */
880 nir_if *if_overflow = nir_push_if(&b, is_overflow); {
881 nir_def *is_result_available = nir_i2b(&b, nir_iand(&b, mode, one));
882 nir_def *initial_count = nir_bcsel(&b, is_result_available, four, one);
883
884 nir_variable *count =
885 nir_local_variable_create(e, glsl_uint_type(), "count");
886 nir_store_var(&b, count, initial_count, 0x1);
887
888 nir_variable *stream_offset =
889 nir_local_variable_create(e, glsl_uint_type(), "stream_offset");
890 nir_store_var(&b, stream_offset, s_offset, 0x1);
891
892 /* Inner loop begin.
893 * do {
894 * ...
895 */
896 nir_loop *loop_inner = nir_push_loop(&b); {
897 /* uint32_t buffer[4] = load_ssbo(0, stream_offset + 2 * sizeof(uint64_t)); */
898 nir_def *stream_offset_value = nir_load_var(&b, stream_offset);
899 nir_def *buffer =
900 nir_load_ssbo(&b, 4, 32, zero,
901 nir_iadd_imm(&b, stream_offset_value, 2 * sizeof(uint64_t)));
902
903 /* if (generated != emitted) {
904 * acc_result = 1;
905 * base_offset = 0;
906 * break;
907 * }
908 */
909 nir_def *generated = nir_channel(&b, buffer, 0);
910 nir_def *emitted = nir_channel(&b, buffer, 2);
911 nir_if *if_not_equal = nir_push_if(&b, nir_ine(&b, generated, emitted)); {
912 nir_store_var(&b, acc_result, one, 0x1);
913 nir_store_var(&b, base_offset, zero, 0x1);
914 nir_jump(&b, nir_jump_break);
915 }
916 nir_pop_if(&b, if_not_equal);
917
918 /* stream_offset += sizeof(gfx11_sh_query_buffer_mem.stream[0]); */
919 uint64_t buffer_mem_stream0_size =
920 sizeof(((struct gfx11_sh_query_buffer_mem*)0)->stream[0]);
921 stream_offset_value = nir_iadd_imm(&b, stream_offset_value, buffer_mem_stream0_size);
922 nir_store_var(&b, stream_offset, stream_offset_value, 0x1);
923
924 /* } while(count--); */
925 nir_def *loop_count = nir_load_var(&b, count);
926 loop_count = nir_iadd(&b, loop_count, minus_one);
927 nir_store_var(&b, count, loop_count, 0x1);
928
929 nir_if *if_zero = nir_push_if(&b, nir_ieq(&b, loop_count, zero)); {
930 nir_jump(&b, nir_jump_break);
931 }
932 nir_pop_if(&b, if_zero);
933 }
934 nir_pop_loop(&b, loop_inner); /* Inner loop end */
935 }
936 nir_pop_if(&b, if_overflow);
937
938 /* base_offset += sizeof(gfx11_sh_query_buffer_mem); */
939 nir_def *buffer_mem_size = nir_imm_int(&b, sizeof(struct gfx11_sh_query_buffer_mem));
940 nir_store_var(&b, base_offset, nir_iadd(&b, nir_load_var(&b, base_offset), buffer_mem_size), 0x1);
941 }
942 nir_pop_loop(&b, loop_outer); /* Outer loop end */
943
944 nir_def *acc_result_value = nir_load_var(&b, acc_result);
945 nir_def *y_value = nir_load_var(&b, acc_missing);
946
947 /* if ((chain & 2)) {
948 * store_ssbo(<acc_result, acc_missing>, 2, 0);
949 * ...
950 */
951 nir_def *is_write_summary_buffer = nir_i2b(&b, nir_iand(&b, nir_channel(&b, buff_0, 2), two));
952 nir_if *if_write_summary_buffer = nir_push_if(&b, is_write_summary_buffer); {
953 nir_store_ssbo(&b, nir_vec2(&b, acc_result_value, y_value), two, zero);
954 } nir_push_else(&b, if_write_summary_buffer); {
955
956 /* } else {
957 * if ((config & 7) == 1) {
958 * acc_result = acc_missing ? 0 : 1;
959 * acc_missing = 0;
960 * }
961 * ...
962 */
963 nir_def *is_result_available = nir_ieq(&b, mode, one);
964 nir_def *is_zero = nir_ieq(&b, y_value, zero);
965 acc_result_value = nir_bcsel(&b, is_result_available, nir_b2i32(&b, is_zero), acc_result_value);
966 nir_def *ny = nir_bcsel(&b, is_result_available, zero, y_value);
967
968 /* if (!acc_missing) {
969 * store_ssbo(acc_result, 2, 0);
970 * if (config & 8)) {
971 * store_ssbo(0, 2, 4)
972 * }
973 * }
974 */
975 nir_if *if_zero = nir_push_if(&b, nir_ieq(&b, ny, zero)); {
976 nir_store_ssbo(&b, acc_result_value, two, zero);
977
978 nir_def *is_so_any_overflow = nir_i2b(&b, nir_iand_imm(&b, nir_channel(&b, buff_0, 0), 8));
979 nir_if *if_so_any_overflow = nir_push_if(&b, is_so_any_overflow); {
980 nir_store_ssbo(&b, zero, two, four);
981 }
982 nir_pop_if(&b, if_so_any_overflow);
983 }
984 nir_pop_if(&b, if_zero);
985 }
986 nir_pop_if(&b, if_write_summary_buffer);
987
988 return si_create_shader_state(sctx, b.shader);
989 }
990