1 /*
2  * Copyright 2024 Valve Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "ac_nir.h"
8 #include "ac_nir_helpers.h"
9 #include "sid.h"
10 
11 #include "nir_builder.h"
12 #include "nir_xfb_info.h"
13 
14 void
15 ac_nir_store_var_components(nir_builder *b, nir_variable *var, nir_def *value,
16                             unsigned component, unsigned writemask)
17 {
18    /* component store */
19    if (value->num_components != 4) {
20       nir_def *undef = nir_undef(b, 1, value->bit_size);
21 
22       /* Add undef components before and after the value to form a vec4. */
23       nir_def *comp[4];
24       for (int i = 0; i < 4; i++) {
25          comp[i] = (i >= component && i < component + value->num_components) ?
26             nir_channel(b, value, i - component) : undef;
27       }
28 
29       value = nir_vec(b, comp, 4);
30       writemask <<= component;
31    } else {
32       /* If num_components == 4, there should be no component offset. */
33       assert(component == 0);
34    }
35 
36    nir_store_var(b, var, value, writemask);
37 }
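/* Added note: for example, storing a 2-component value at component 2 with
 * writemask 0x3 builds vec4(undef, undef, x, y) and shifts the writemask to
 * 0xc, so nir_store_var always sees a full vec4 with only the written
 * channels enabled.
 */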
38 
39 unsigned
40 ac_nir_map_io_location(unsigned location,
41                        uint64_t mask,
42                        ac_nir_map_io_driver_location map_io)
43 {
44    /* Unlinked shaders:
45     * We are unaware of the inputs of the next stage while lowering outputs.
46     * The driver needs to pass a callback to map varyings to a fixed location.
47     */
48    if (map_io)
49       return map_io(location);
50 
51    /* Linked shaders:
52     * Take advantage of knowledge of the inputs of the next stage when lowering outputs.
53     * Map varyings to a prefix sum of the IO mask to save space in LDS or VRAM.
54     */
55    assert(mask & BITFIELD64_BIT(location));
56    return util_bitcount64(mask & BITFIELD64_MASK(location));
57 }
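/* Added note: e.g. with a mask covering slots {POS, VAR0, VAR2} and
 * location = VAR2, util_bitcount64(mask & BITFIELD64_MASK(VAR2)) counts the
 * two lower set bits, so VAR2 maps to slot 2 and the three outputs pack
 * densely into slots 0..2.
 */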
58 
59 /**
60  * This function takes an I/O intrinsic like load/store_input,
61  * and emits a sequence that calculates the full offset of that instruction,
62  * including a stride to the base and component offsets.
63  */
64 nir_def *
65 ac_nir_calc_io_off(nir_builder *b,
66                              nir_intrinsic_instr *intrin,
67                              nir_def *base_stride,
68                              unsigned component_stride,
69                              unsigned mapped_driver_location)
70 {
71    /* base is the driver_location, which is in slots (1 slot = 4x4 bytes) */
72    nir_def *base_op = nir_imul_imm(b, base_stride, mapped_driver_location);
73 
74    /* offset should be interpreted in relation to the base,
75     * so the instruction effectively reads/writes another input/output
76     * when it has an offset
77     */
78    nir_def *offset_op = nir_imul(b, base_stride,
79                                  nir_get_io_offset_src(intrin)->ssa);
80 
81    /* component is in bytes */
82    unsigned const_op = nir_intrinsic_component(intrin) * component_stride;
83 
84    return nir_iadd_imm_nuw(b, nir_iadd_nuw(b, base_op, offset_op), const_op);
85 }
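/* Added note: the returned offset is effectively
 *    base_stride * mapped_driver_location    (constant slot)
 *  + base_stride * indirect_io_offset_src    (per-instruction slot offset)
 *  + component * component_stride            (constant byte offset).
 * Assuming, for illustration, base_stride = 16 and component_stride = 4,
 * component 2 of mapped slot 3 with no indirect offset gives 3*16 + 2*4 = 56.
 */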
86 
87 /* Process the given store_output intrinsic and gather its information.
88  * Meant to be used for VS/TES/GS when they are the last pre-rasterization stage.
89  *
90  * Assumptions:
91  * - We called nir_lower_io_to_temporaries on the shader
92  * - 64-bit outputs are lowered
93  * - no indirect indexing is present
94  */
95 void ac_nir_gather_prerast_store_output_info(nir_builder *b, nir_intrinsic_instr *intrin, ac_nir_prerast_out *out)
96 {
97    assert(intrin->intrinsic == nir_intrinsic_store_output);
98    assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));
99 
100    const nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
101    const unsigned slot = io_sem.location;
102 
103    nir_def *store_val = intrin->src[0].ssa;
104    assert(store_val->bit_size == 16 || store_val->bit_size == 32);
105 
106    nir_def **output;
107    nir_alu_type *type;
108    ac_nir_prerast_per_output_info *info;
109 
110    if (slot >= VARYING_SLOT_VAR0_16BIT) {
111       const unsigned index = slot - VARYING_SLOT_VAR0_16BIT;
112 
113       if (io_sem.high_16bits) {
114          output = out->outputs_16bit_hi[index];
115          type = out->types_16bit_hi[index];
116          info = &out->infos_16bit_hi[index];
117       } else {
118          output = out->outputs_16bit_lo[index];
119          type = out->types_16bit_lo[index];
120          info = &out->infos_16bit_lo[index];
121       }
122    } else {
123       output = out->outputs[slot];
124       type = out->types[slot];
125       info = &out->infos[slot];
126    }
127 
128    unsigned component_offset = nir_intrinsic_component(intrin);
129    unsigned write_mask = nir_intrinsic_write_mask(intrin);
130    nir_alu_type src_type = nir_intrinsic_src_type(intrin);
131    assert(nir_alu_type_get_type_size(src_type) == store_val->bit_size);
132 
133    b->cursor = nir_before_instr(&intrin->instr);
134 
135    /* 16-bit output stored in a normal varying slot that isn't a dedicated 16-bit slot. */
136    const bool non_dedicated_16bit = slot < VARYING_SLOT_VAR0_16BIT && store_val->bit_size == 16;
137 
138    u_foreach_bit (i, write_mask) {
139       const unsigned stream = (io_sem.gs_streams >> (i * 2)) & 0x3;
140 
141       if (b->shader->info.stage == MESA_SHADER_GEOMETRY) {
142          if (!(b->shader->info.gs.active_stream_mask & (1 << stream)))
143             continue;
144       }
145 
146       const unsigned c = component_offset + i;
147 
148       /* The same output component should always belong to the same stream. */
149       assert(!(info->components_mask & (1 << c)) ||
150              ((info->stream >> (c * 2)) & 3) == stream);
151 
152       /* Components of the same output slot may belong to different streams. */
153       info->stream |= stream << (c * 2);
154       info->components_mask |= BITFIELD_BIT(c);
155 
156       if (!io_sem.no_varying)
157          info->as_varying_mask |= BITFIELD_BIT(c);
158       if (!io_sem.no_sysval_output)
159          info->as_sysval_mask |= BITFIELD_BIT(c);
160 
161       nir_def *store_component = nir_channel(b, intrin->src[0].ssa, i);
162 
163       if (non_dedicated_16bit) {
164          if (io_sem.high_16bits) {
165             nir_def *lo = output[c] ? nir_unpack_32_2x16_split_x(b, output[c]) : nir_imm_intN_t(b, 0, 16);
166             output[c] = nir_pack_32_2x16_split(b, lo, store_component);
167          } else {
168             nir_def *hi = output[c] ? nir_unpack_32_2x16_split_y(b, output[c]) : nir_imm_intN_t(b, 0, 16);
169             output[c] = nir_pack_32_2x16_split(b, store_component, hi);
170          }
171          type[c] = nir_type_uint32;
172       } else {
173          output[c] = store_component;
174          type[c] = src_type;
175       }
176    }
177 }
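/* Added note: when a 16-bit value is written to a regular 32-bit varying slot
 * (the non_dedicated_16bit path above), the gathered component stays packed in
 * one 32-bit word (low half = low store, high half = high store) and its
 * recorded type is forced to nir_type_uint32.
 */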
178 
179 static nir_intrinsic_instr *
180 export(nir_builder *b, nir_def *val, nir_def *row, unsigned base, unsigned flags,
181        unsigned write_mask)
182 {
183    if (row) {
184       return nir_export_row_amd(b, val, row, .base = base, .flags = flags,
185                                 .write_mask = write_mask);
186    } else {
187       return nir_export_amd(b, val, .base = base, .flags = flags,
188                             .write_mask = write_mask);
189    }
190 }
191 
192 void
193 ac_nir_export_primitive(nir_builder *b, nir_def *prim, nir_def *row)
194 {
195    unsigned write_mask = BITFIELD_MASK(prim->num_components);
196 
197    export(b, nir_pad_vec4(b, prim), row, V_008DFC_SQ_EXP_PRIM, AC_EXP_FLAG_DONE,
198           write_mask);
199 }
200 
201 static nir_def *
202 get_export_output(nir_builder *b, nir_def **output)
203 {
204    nir_def *vec[4];
205    for (int i = 0; i < 4; i++) {
206       if (output[i])
207          vec[i] = nir_u2uN(b, output[i], 32);
208       else
209          vec[i] = nir_undef(b, 1, 32);
210    }
211 
212    return nir_vec(b, vec, 4);
213 }
214 
215 static nir_def *
216 get_pos0_output(nir_builder *b, nir_def **output)
217 {
218    /* Some applications don't write position but expect (0, 0, 0, 1)
219     * so use that value instead of undef when it isn't written.
220     */
221    nir_def *vec[4] = {0};
222 
223    for (int i = 0; i < 4; i++) {
224       if (output[i])
225          vec[i] = nir_u2u32(b, output[i]);
226       else
227          vec[i] = nir_imm_float(b, i == 3 ? 1.0 : 0.0);
228    }
229 
230    return nir_vec(b, vec, 4);
231 }
232 
233 void
234 ac_nir_export_position(nir_builder *b,
235                        enum amd_gfx_level gfx_level,
236                        uint32_t clip_cull_mask,
237                        bool no_param_export,
238                        bool force_vrs,
239                        bool done,
240                        uint64_t outputs_written,
241                        ac_nir_prerast_out *out,
242                        nir_def *row)
243 {
244    nir_intrinsic_instr *exp[4];
245    unsigned exp_num = 0;
246    unsigned exp_pos_offset = 0;
247 
248    if (outputs_written & VARYING_BIT_POS) {
249       /* GFX10 (Navi1x) skips POS0 exports if EXEC=0 and DONE=0, causing a hang.
250        * Setting valid_mask=1 prevents it and has no other effect.
251        */
252       const unsigned pos_flags = gfx_level == GFX10 ? AC_EXP_FLAG_VALID_MASK : 0;
253       nir_def *pos = get_pos0_output(b, out->outputs[VARYING_SLOT_POS]);
254 
255       exp[exp_num] = export(b, pos, row, V_008DFC_SQ_EXP_POS + exp_num, pos_flags, 0xf);
256       exp_num++;
257    } else {
258       exp_pos_offset++;
259    }
260 
261    uint64_t mask =
262       VARYING_BIT_PSIZ |
263       VARYING_BIT_EDGE |
264       VARYING_BIT_LAYER |
265       VARYING_BIT_VIEWPORT |
266       VARYING_BIT_PRIMITIVE_SHADING_RATE;
267 
268    /* Clear the bits for outputs that weren't written or aren't used as sysvals. */
269    if (!out->outputs[VARYING_SLOT_PSIZ][0] || !out->infos[VARYING_SLOT_PSIZ].as_sysval_mask)
270       outputs_written &= ~VARYING_BIT_PSIZ;
271    if (!out->outputs[VARYING_SLOT_EDGE][0] || !out->infos[VARYING_SLOT_EDGE].as_sysval_mask)
272       outputs_written &= ~VARYING_BIT_EDGE;
273    if (!out->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE][0] || !out->infos[VARYING_SLOT_PRIMITIVE_SHADING_RATE].as_sysval_mask)
274       outputs_written &= ~VARYING_BIT_PRIMITIVE_SHADING_RATE;
275    if (!out->outputs[VARYING_SLOT_LAYER][0] || !out->infos[VARYING_SLOT_LAYER].as_sysval_mask)
276       outputs_written &= ~VARYING_BIT_LAYER;
277    if (!out->outputs[VARYING_SLOT_VIEWPORT][0] || !out->infos[VARYING_SLOT_VIEWPORT].as_sysval_mask)
278       outputs_written &= ~VARYING_BIT_VIEWPORT;
279 
280    if ((outputs_written & mask) || force_vrs) {
281       nir_def *zero = nir_imm_float(b, 0);
282       nir_def *vec[4] = { zero, zero, zero, zero };
283       unsigned write_mask = 0;
284 
285       if (outputs_written & VARYING_BIT_PSIZ) {
286          vec[0] = out->outputs[VARYING_SLOT_PSIZ][0];
287          write_mask |= BITFIELD_BIT(0);
288       }
289 
290       if (outputs_written & VARYING_BIT_EDGE) {
291          vec[1] = nir_umin(b, out->outputs[VARYING_SLOT_EDGE][0], nir_imm_int(b, 1));
292          write_mask |= BITFIELD_BIT(1);
293       }
294 
295       nir_def *rates = NULL;
296       if (outputs_written & VARYING_BIT_PRIMITIVE_SHADING_RATE) {
297          rates = out->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE][0];
298       } else if (force_vrs) {
299          /* If Pos.W != 1 (typical for non-GUI elements), use coarse shading. */
300          nir_def *pos_w = out->outputs[VARYING_SLOT_POS][3];
301          pos_w = pos_w ? nir_u2u32(b, pos_w) : nir_imm_float(b, 1.0);
302          nir_def *cond = nir_fneu_imm(b, pos_w, 1);
303          rates = nir_bcsel(b, cond, nir_load_force_vrs_rates_amd(b), nir_imm_int(b, 0));
304       }
305 
306       if (rates) {
307          vec[1] = nir_ior(b, vec[1], rates);
308          write_mask |= BITFIELD_BIT(1);
309       }
310 
311       if (outputs_written & VARYING_BIT_LAYER) {
312          vec[2] = out->outputs[VARYING_SLOT_LAYER][0];
313          write_mask |= BITFIELD_BIT(2);
314       }
315 
316       if (outputs_written & VARYING_BIT_VIEWPORT) {
317          if (gfx_level >= GFX9) {
318             /* GFX9 has the layer in [10:0] and the viewport index in [19:16]. */
319             nir_def *v = nir_ishl_imm(b, out->outputs[VARYING_SLOT_VIEWPORT][0], 16);
320             vec[2] = nir_ior(b, vec[2], v);
321             write_mask |= BITFIELD_BIT(2);
322          } else {
323             vec[3] = out->outputs[VARYING_SLOT_VIEWPORT][0];
324             write_mask |= BITFIELD_BIT(3);
325          }
326       }
327 
328       exp[exp_num] = export(b, nir_vec(b, vec, 4), row,
329                             V_008DFC_SQ_EXP_POS + exp_num + exp_pos_offset,
330                             0, write_mask);
331       exp_num++;
332    }
333 
334    for (int i = 0; i < 2; i++) {
335       if ((outputs_written & (VARYING_BIT_CLIP_DIST0 << i)) &&
336           (clip_cull_mask & BITFIELD_RANGE(i * 4, 4))) {
337          exp[exp_num] = export(
338             b, get_export_output(b, out->outputs[VARYING_SLOT_CLIP_DIST0 + i]), row,
339             V_008DFC_SQ_EXP_POS + exp_num + exp_pos_offset, 0,
340             (clip_cull_mask >> (i * 4)) & 0xf);
341          exp_num++;
342       }
343    }
344 
345    if (outputs_written & VARYING_BIT_CLIP_VERTEX) {
346       nir_def *vtx = get_export_output(b, out->outputs[VARYING_SLOT_CLIP_VERTEX]);
347 
348       /* Clip distance for clip vertex to each user clip plane. */
349       nir_def *clip_dist[8] = {0};
350       u_foreach_bit (i, clip_cull_mask) {
351          nir_def *ucp = nir_load_user_clip_plane(b, .ucp_id = i);
352          clip_dist[i] = nir_fdot4(b, vtx, ucp);
353       }
354 
355       for (int i = 0; i < 2; i++) {
356          if (clip_cull_mask & BITFIELD_RANGE(i * 4, 4)) {
357             exp[exp_num] = export(
358                b, get_export_output(b, clip_dist + i * 4), row,
359                V_008DFC_SQ_EXP_POS + exp_num + exp_pos_offset, 0,
360                (clip_cull_mask >> (i * 4)) & 0xf);
361             exp_num++;
362          }
363       }
364    }
365 
366    if (!exp_num)
367       return;
368 
369    nir_intrinsic_instr *final_exp = exp[exp_num - 1];
370 
371    if (done) {
372       /* Specify that this is the last export */
373       const unsigned final_exp_flags = nir_intrinsic_flags(final_exp);
374       nir_intrinsic_set_flags(final_exp, final_exp_flags | AC_EXP_FLAG_DONE);
375    }
376 
377    /* If a shader has no param exports, rasterization can start before
378     * the shader finishes and thus memory stores might not finish before
379     * the pixel shader starts.
380     */
381    if (gfx_level >= GFX10 && no_param_export && b->shader->info.writes_memory) {
382       nir_cursor cursor = b->cursor;
383       b->cursor = nir_before_instr(&final_exp->instr);
384       nir_scoped_memory_barrier(b, SCOPE_DEVICE, NIR_MEMORY_RELEASE,
385                                 nir_var_mem_ssbo | nir_var_mem_global | nir_var_image);
386       b->cursor = cursor;
387    }
388 }
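/* Added note: the exports above are emitted in order: POS0 (position), an
 * optional "misc" vec4 (point size, edge flag / VRS rates, layer, viewport),
 * then up to two vec4s of clip/cull distances (written directly or derived
 * from CLIP_VERTEX and the user clip planes). When 'done' is set, only the
 * last emitted export gets AC_EXP_FLAG_DONE.
 */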
389 
390 void
391 ac_nir_export_parameters(nir_builder *b,
392                          const uint8_t *param_offsets,
393                          uint64_t outputs_written,
394                          uint16_t outputs_written_16bit,
395                          ac_nir_prerast_out *out)
396 {
397    uint32_t exported_params = 0;
398 
399    u_foreach_bit64 (slot, outputs_written) {
400       unsigned offset = param_offsets[slot];
401       if (offset > AC_EXP_PARAM_OFFSET_31)
402          continue;
403 
404       uint32_t write_mask = 0;
405       for (int i = 0; i < 4; i++) {
406          if (out->outputs[slot][i])
407             write_mask |= (out->infos[slot].as_varying_mask & BITFIELD_BIT(i));
408       }
409 
410       /* Nothing wrote this output slot, so we can skip the param export. */
411       if (!write_mask)
412          continue;
413 
414       /* Since param_offsets[] can map multiple varying slots to the same
415        * param export index (that's radeonsi-specific behavior), we need to
416        * do this so as not to emit duplicated exports.
417        */
418       if (exported_params & BITFIELD_BIT(offset))
419          continue;
420 
421       nir_export_amd(
422          b, get_export_output(b, out->outputs[slot]),
423          .base = V_008DFC_SQ_EXP_PARAM + offset,
424          .write_mask = write_mask);
425       exported_params |= BITFIELD_BIT(offset);
426    }
427 
428    u_foreach_bit (slot, outputs_written_16bit) {
429       unsigned offset = param_offsets[VARYING_SLOT_VAR0_16BIT + slot];
430       if (offset > AC_EXP_PARAM_OFFSET_31)
431          continue;
432 
433       uint32_t write_mask = 0;
434       for (int i = 0; i < 4; i++) {
435          if (out->outputs_16bit_lo[slot][i] || out->outputs_16bit_hi[slot][i])
436             write_mask |= BITFIELD_BIT(i);
437       }
438 
439       /* no one set this output slot, we can skip the param export */
440       if (!write_mask)
441          continue;
442 
443       /* Since param_offsets[] can map multiple varying slots to the same
444        * param export index (that's radeonsi-specific behavior), we need to
445        * do this so as not to emit duplicated exports.
446        */
447       if (exported_params & BITFIELD_BIT(offset))
448          continue;
449 
450       nir_def *vec[4];
451       nir_def *undef = nir_undef(b, 1, 16);
452       for (int i = 0; i < 4; i++) {
453          nir_def *lo = out->outputs_16bit_lo[slot][i] ? out->outputs_16bit_lo[slot][i] : undef;
454          nir_def *hi = out->outputs_16bit_hi[slot][i] ? out->outputs_16bit_hi[slot][i] : undef;
455          vec[i] = nir_pack_32_2x16_split(b, lo, hi);
456       }
457 
458       nir_export_amd(
459          b, nir_vec(b, vec, 4),
460          .base = V_008DFC_SQ_EXP_PARAM + offset,
461          .write_mask = write_mask);
462       exported_params |= BITFIELD_BIT(offset);
463    }
464 }
465 
466 void
467 ac_nir_store_parameters_to_attr_ring(nir_builder *b,
468                                      const uint8_t *param_offsets,
469                                      const uint64_t outputs_written,
470                                      const uint16_t outputs_written_16bit,
471                                      ac_nir_prerast_out *out,
472                                      nir_def *num_export_threads_in_wave)
473 {
474    nir_def *attr_rsrc = nir_load_ring_attr_amd(b);
475 
476    /* We should always store full vec4s in groups of 8 lanes for the best performance even if
477     * some of them are garbage or have unused components, so align the number of export threads
478     * to 8.
479     */
480    nir_def *num_attr_ring_store_threads = nir_iand_imm(b, nir_iadd_imm(b, num_export_threads_in_wave, 7), ~7);
481 
482    nir_if *if_attr_ring_store = nir_push_if(b, nir_is_subgroup_invocation_lt_amd(b, num_attr_ring_store_threads));
483 
484    nir_def *attr_offset = nir_load_ring_attr_offset_amd(b);
485    nir_def *vindex = nir_load_local_invocation_index(b);
486    nir_def *voffset = nir_imm_int(b, 0);
487    nir_def *undef = nir_undef(b, 1, 32);
488 
489    uint32_t exported_params = 0;
490 
491    u_foreach_bit64 (slot, outputs_written) {
492       const unsigned offset = param_offsets[slot];
493 
494       if (offset > AC_EXP_PARAM_OFFSET_31)
495          continue;
496 
497       if (!out->infos[slot].as_varying_mask)
498          continue;
499 
500       if (exported_params & BITFIELD_BIT(offset))
501          continue;
502 
503       nir_def *comp[4];
504       for (unsigned j = 0; j < 4; j++) {
505          comp[j] = out->outputs[slot][j] ? out->outputs[slot][j] : undef;
506       }
507 
508       nir_store_buffer_amd(b, nir_vec(b, comp, 4), attr_rsrc, voffset, attr_offset, vindex,
509                            .base = offset * 16,
510                            .memory_modes = nir_var_shader_out,
511                            .access = ACCESS_COHERENT | ACCESS_IS_SWIZZLED_AMD);
512 
513       exported_params |= BITFIELD_BIT(offset);
514    }
515 
516    u_foreach_bit (i, outputs_written_16bit) {
517       const unsigned offset = param_offsets[VARYING_SLOT_VAR0_16BIT + i];
518 
519       if (offset > AC_EXP_PARAM_OFFSET_31)
520          continue;
521 
522       if (!out->infos_16bit_lo[i].as_varying_mask &&
523           !out->infos_16bit_hi[i].as_varying_mask)
524          continue;
525 
526       if (exported_params & BITFIELD_BIT(offset))
527          continue;
528 
529       nir_def *comp[4];
530       for (unsigned j = 0; j < 4; j++) {
531          nir_def *lo = out->outputs_16bit_lo[i][j] ? out->outputs_16bit_lo[i][j] : undef;
532          nir_def *hi = out->outputs_16bit_hi[i][j] ? out->outputs_16bit_hi[i][j] : undef;
533          comp[j] = nir_pack_32_2x16_split(b, lo, hi);
534       }
535 
536       nir_store_buffer_amd(b, nir_vec(b, comp, 4), attr_rsrc, voffset, attr_offset, vindex,
537                            .base = offset * 16,
538                            .memory_modes = nir_var_shader_out,
539                            .access = ACCESS_COHERENT | ACCESS_IS_SWIZZLED_AMD);
540 
541       exported_params |= BITFIELD_BIT(offset);
542    }
543 
544    nir_pop_if(b, if_attr_ring_store);
545 }
546 
547 static int
548 sort_xfb(const void *_a, const void *_b)
549 {
550    const nir_xfb_output_info *a = (const nir_xfb_output_info *)_a;
551    const nir_xfb_output_info *b = (const nir_xfb_output_info *)_b;
552 
553    if (a->buffer != b->buffer)
554       return a->buffer > b->buffer ? 1 : -1;
555 
556    assert(a->offset != b->offset);
557    return a->offset > b->offset ? 1 : -1;
558 }
559 
560 /* Return XFB info sorted by buffer and offset, so that we can generate vec4
561  * stores by iterating over outputs only once.
562  */
563 nir_xfb_info *
564 ac_nir_get_sorted_xfb_info(const nir_shader *nir)
565 {
566    if (!nir->xfb_info)
567       return NULL;
568 
569    unsigned xfb_info_size = nir_xfb_info_size(nir->xfb_info->output_count);
570    nir_xfb_info *info = rzalloc_size(nir, xfb_info_size);
571 
572    memcpy(info, nir->xfb_info, xfb_info_size);
573    qsort(info->outputs, info->output_count, sizeof(info->outputs[0]), sort_xfb);
574    return info;
575 }
576 
577 static nir_def **
578 get_output_and_type(ac_nir_prerast_out *out, unsigned slot, bool high_16bits,
579                     nir_alu_type **types)
580 {
581    nir_def **data;
582    nir_alu_type *type;
583 
584    /* Only VARYING_SLOT_VARn_16BIT slots need the output type to convert 16-bit
585     * outputs to 32 bits. Vulkan doesn't allow streamout of outputs smaller than 32 bits.
586     */
587    if (slot < VARYING_SLOT_VAR0_16BIT) {
588       data = out->outputs[slot];
589       type = NULL;
590    } else {
591       unsigned index = slot - VARYING_SLOT_VAR0_16BIT;
592 
593       if (high_16bits) {
594          data = out->outputs_16bit_hi[index];
595          type = out->types_16bit_hi[index];
596       } else {
597          data = out->outputs_16bit_lo[index];
598          type = out->types_16bit_lo[index];
599       }
600    }
601 
602    *types = type;
603    return data;
604 }
605 
606 void
607 ac_nir_emit_legacy_streamout(nir_builder *b, unsigned stream, nir_xfb_info *info, ac_nir_prerast_out *out)
608 {
609    nir_def *so_vtx_count = nir_ubfe_imm(b, nir_load_streamout_config_amd(b), 16, 7);
610    nir_def *tid = nir_load_subgroup_invocation(b);
611 
612    nir_push_if(b, nir_ilt(b, tid, so_vtx_count));
613    nir_def *so_write_index = nir_load_streamout_write_index_amd(b);
614 
615    nir_def *so_buffers[NIR_MAX_XFB_BUFFERS];
616    nir_def *so_write_offset[NIR_MAX_XFB_BUFFERS];
617    u_foreach_bit(i, info->buffers_written) {
618       so_buffers[i] = nir_load_streamout_buffer_amd(b, i);
619 
620       unsigned stride = info->buffers[i].stride;
621       nir_def *offset = nir_load_streamout_offset_amd(b, i);
622       offset = nir_iadd(b, nir_imul_imm(b, nir_iadd(b, so_write_index, tid), stride),
623                         nir_imul_imm(b, offset, 4));
624       so_write_offset[i] = offset;
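      /* Added note: the per-thread byte offset is
       * (so_write_index + tid) * stride + streamout_offset * 4, so consecutive
       * vertices land 'stride' bytes apart, starting at the buffer's current
       * offset (a dword count converted to bytes).
       */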
625    }
626 
627    nir_def *zero = nir_imm_int(b, 0);
628    unsigned num_values = 0, store_offset = 0, store_buffer_index = 0;
629    nir_def *values[4];
630 
631    for (unsigned i = 0; i < info->output_count; i++) {
632       const nir_xfb_output_info *output = info->outputs + i;
633       if (stream != info->buffer_to_stream[output->buffer])
634          continue;
635 
636       nir_alu_type *output_type;
637       nir_def **output_data =
638          get_output_and_type(out, output->location, output->high_16bits, &output_type);
639 
640       u_foreach_bit(out_comp, output->component_mask) {
641          if (!output_data[out_comp])
642             continue;
643 
644          nir_def *data = output_data[out_comp];
645 
646          if (data->bit_size < 32) {
647             /* Convert the 16-bit output to 32 bits. */
648             assert(output_type);
649 
650             nir_alu_type base_type = nir_alu_type_get_base_type(output_type[out_comp]);
651             data = nir_convert_to_bit_size(b, data, base_type, 32);
652          }
653 
654          assert(out_comp >= output->component_offset);
655          const unsigned store_comp = out_comp - output->component_offset;
656          const unsigned store_comp_offset = output->offset + store_comp * 4;
657          const bool has_hole = store_offset + num_values * 4 != store_comp_offset;
658 
659          /* Flush the gathered components to memory as a vec4 store or less if there is a hole. */
660          if (num_values && (num_values == 4 || store_buffer_index != output->buffer || has_hole)) {
661             nir_store_buffer_amd(b, nir_vec(b, values, num_values), so_buffers[store_buffer_index],
662                                  so_write_offset[store_buffer_index], zero, zero,
663                                  .base = store_offset,
664                                  .access = ACCESS_NON_TEMPORAL);
665             num_values = 0;
666          }
667 
668          /* Initialize the buffer index and offset if we are beginning a new vec4 store. */
669          if (num_values == 0) {
670             store_buffer_index = output->buffer;
671             store_offset = store_comp_offset;
672          }
673 
674          values[num_values++] = data;
675       }
676    }
677 
678    if (num_values) {
679       /* Flush the remaining components to memory (as an up to vec4 store) */
680       nir_store_buffer_amd(b, nir_vec(b, values, num_values), so_buffers[store_buffer_index],
681                            so_write_offset[store_buffer_index], zero, zero,
682                            .base = store_offset,
683                            .access = ACCESS_NON_TEMPORAL);
684    }
685 
686    nir_pop_if(b, NULL);
687 }
688 
689 static nir_def *
690 ac_nir_accum_ior(nir_builder *b, nir_def *accum_result, nir_def *new_term)
691 {
692    return accum_result ? nir_ior(b, accum_result, new_term) : new_term;
693 }
694 
695 bool
696 ac_nir_gs_shader_query(nir_builder *b,
697                        bool has_gen_prim_query,
698                        bool has_gs_invocations_query,
699                        bool has_gs_primitives_query,
700                        unsigned num_vertices_per_primitive,
701                        unsigned wave_size,
702                        nir_def *vertex_count[4],
703                        nir_def *primitive_count[4])
704 {
705    nir_def *pipeline_query_enabled = NULL;
706    nir_def *prim_gen_query_enabled = NULL;
707    nir_def *any_query_enabled = NULL;
708 
709    if (has_gen_prim_query) {
710       prim_gen_query_enabled = nir_load_prim_gen_query_enabled_amd(b);
711       any_query_enabled = ac_nir_accum_ior(b, any_query_enabled, prim_gen_query_enabled);
712    }
713 
714    if (has_gs_invocations_query || has_gs_primitives_query) {
715       pipeline_query_enabled = nir_load_pipeline_stat_query_enabled_amd(b);
716       any_query_enabled = ac_nir_accum_ior(b, any_query_enabled, pipeline_query_enabled);
717    }
718 
719    if (!any_query_enabled) {
720       /* has no query */
721       return false;
722    }
723 
724    nir_if *if_shader_query = nir_push_if(b, any_query_enabled);
725 
726    nir_def *active_threads_mask = nir_ballot(b, 1, wave_size, nir_imm_true(b));
727    nir_def *num_active_threads = nir_bit_count(b, active_threads_mask);
728 
729    /* Calculate the "real" number of emitted primitives from the emitted GS vertices and primitives.
730     * GS emits points, line strips or triangle strips.
731     * Real primitives are points, lines or triangles.
732     */
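   /* Added note: e.g. one triangle strip with 5 emitted vertices and 1 emitted
    * strip yields 5 - 1 * (3 - 1) = 3 real triangles, which matches the
    * constant-folded path below.
    */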
733    nir_def *num_prims_in_wave[4] = {0};
734    u_foreach_bit (i, b->shader->info.gs.active_stream_mask) {
735       assert(vertex_count[i] && primitive_count[i]);
736 
737       nir_scalar vtx_cnt = nir_get_scalar(vertex_count[i], 0);
738       nir_scalar prm_cnt = nir_get_scalar(primitive_count[i], 0);
739 
740       if (nir_scalar_is_const(vtx_cnt) && nir_scalar_is_const(prm_cnt)) {
741          unsigned gs_vtx_cnt = nir_scalar_as_uint(vtx_cnt);
742          unsigned gs_prm_cnt = nir_scalar_as_uint(prm_cnt);
743          unsigned total_prm_cnt = gs_vtx_cnt - gs_prm_cnt * (num_vertices_per_primitive - 1u);
744          if (total_prm_cnt == 0)
745             continue;
746 
747          num_prims_in_wave[i] = nir_imul_imm(b, num_active_threads, total_prm_cnt);
748       } else {
749          nir_def *gs_vtx_cnt = vtx_cnt.def;
750          nir_def *gs_prm_cnt = prm_cnt.def;
751          if (num_vertices_per_primitive > 1)
752             gs_prm_cnt = nir_iadd(b, nir_imul_imm(b, gs_prm_cnt, -1u * (num_vertices_per_primitive - 1)), gs_vtx_cnt);
753          num_prims_in_wave[i] = nir_reduce(b, gs_prm_cnt, .reduction_op = nir_op_iadd);
754       }
755    }
756 
757    /* Accumulate the results into the query counters using atomic adds. */
758    nir_if *if_first_lane = nir_push_if(b, nir_elect(b, 1));
759    {
760       if (has_gs_invocations_query || has_gs_primitives_query) {
761          nir_if *if_pipeline_query = nir_push_if(b, pipeline_query_enabled);
762          {
763             nir_def *count = NULL;
764 
765             /* Add all streams' number to the same counter. */
766             for (int i = 0; i < 4; i++) {
767                if (num_prims_in_wave[i]) {
768                   if (count)
769                      count = nir_iadd(b, count, num_prims_in_wave[i]);
770                   else
771                      count = num_prims_in_wave[i];
772                }
773             }
774 
775             if (has_gs_primitives_query && count)
776                nir_atomic_add_gs_emit_prim_count_amd(b, count);
777 
778             if (has_gs_invocations_query)
779                nir_atomic_add_shader_invocation_count_amd(b, num_active_threads);
780          }
781          nir_pop_if(b, if_pipeline_query);
782       }
783 
784       if (has_gen_prim_query) {
785          nir_if *if_prim_gen_query = nir_push_if(b, prim_gen_query_enabled);
786          {
787             /* Add to the counter for this stream. */
788             for (int i = 0; i < 4; i++) {
789                if (num_prims_in_wave[i])
790                   nir_atomic_add_gen_prim_count_amd(b, num_prims_in_wave[i], .stream_id = i);
791             }
792          }
793          nir_pop_if(b, if_prim_gen_query);
794       }
795    }
796    nir_pop_if(b, if_first_lane);
797 
798    nir_pop_if(b, if_shader_query);
799    return true;
800 }
801 
802 nir_def *
803 ac_nir_pack_ngg_prim_exp_arg(nir_builder *b, unsigned num_vertices_per_primitives,
804                              nir_def *vertex_indices[3], nir_def *is_null_prim,
805                              enum amd_gfx_level gfx_level)
806 {
807    nir_def *arg = nir_load_initial_edgeflags_amd(b);
808 
809    for (unsigned i = 0; i < num_vertices_per_primitives; ++i) {
810       assert(vertex_indices[i]);
811       arg = nir_ior(b, arg, nir_ishl_imm(b, vertex_indices[i],
812                                          (gfx_level >= GFX12 ? 9u : 10u) * i));
813    }
814 
815    if (is_null_prim) {
816       if (is_null_prim->bit_size == 1)
817          is_null_prim = nir_b2i32(b, is_null_prim);
818       assert(is_null_prim->bit_size == 32);
819       arg = nir_ior(b, arg, nir_ishl_imm(b, is_null_prim, 31u));
820    }
821 
822    return arg;
823 }
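/* Added note: the packed argument places vertex index i at bit 10*i (9*i on
 * GFX12+, per the shift above), ORs in the initial edge flags, and uses bit 31
 * as the "null primitive" flag.
 */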
824 
825 void
826 ac_nir_clamp_vertex_color_outputs(nir_builder *b, ac_nir_prerast_out *out)
827 {
828    /* Clamp color outputs. */
829    if (!(b->shader->info.outputs_written & (VARYING_BIT_COL0 | VARYING_BIT_COL1 |
830                                             VARYING_BIT_BFC0 | VARYING_BIT_BFC1)))
831       return;
832 
833    nir_def *color_channels[16] = {0};
834 
835    nir_if *if_clamp = nir_push_if(b, nir_load_clamp_vertex_color_amd(b));
836    {
837       for (unsigned i = 0; i < 16; i++) {
838          const unsigned slot = (i / 8 ? VARYING_SLOT_BFC0 : VARYING_SLOT_COL0) + (i % 8) / 4;
839          if (out->outputs[slot][i % 4])
840             color_channels[i] = nir_fsat(b, out->outputs[slot][i % 4]);
841       }
842    }
843    nir_pop_if(b, if_clamp);
844    for (unsigned i = 0; i < 16; i++) {
845       if (color_channels[i]) {
846          const unsigned slot = (i / 8 ? VARYING_SLOT_BFC0 : VARYING_SLOT_COL0) + (i % 8) / 4;
847          out->outputs[slot][i % 4] = nir_if_phi(b, color_channels[i], out->outputs[slot][i % 4]);
848       }
849    }
850 }
851 
852 static void
853 ac_nir_ngg_alloc_vertices_fully_culled_workaround(nir_builder *b,
854                                                   nir_def *num_vtx,
855                                                   nir_def *num_prim)
856 {
857    /* HW workaround for a GPU hang with 100% culling on GFX10.
858     * We always have to export at least 1 primitive.
859     * Export a degenerate triangle using vertex 0 for all 3 vertices.
860     *
861     * NOTE: We rely on the caller to set the vertex count also to 0 when the primitive count is 0.
862     */
863    nir_def *is_prim_cnt_0 = nir_ieq_imm(b, num_prim, 0);
864    nir_if *if_prim_cnt_0 = nir_push_if(b, is_prim_cnt_0);
865    {
866       nir_def *one = nir_imm_int(b, 1);
867       nir_sendmsg_amd(b, nir_ior(b, nir_ishl_imm(b, one, 12), one), .base = AC_SENDMSG_GS_ALLOC_REQ);
868 
869       nir_def *tid = nir_load_subgroup_invocation(b);
870       nir_def *is_thread_0 = nir_ieq_imm(b, tid, 0);
871       nir_if *if_thread_0 = nir_push_if(b, is_thread_0);
872       {
873          /* The vertex indices are 0, 0, 0. */
874          nir_export_amd(b, nir_imm_zero(b, 4, 32),
875                         .base = V_008DFC_SQ_EXP_PRIM,
876                         .flags = AC_EXP_FLAG_DONE,
877                         .write_mask = 1);
878 
879          /* The HW culls primitives with NaN. -1 is also NaN and saves a dword
880           * in the binary because it can be inlined as a constant.
881           */
882          nir_export_amd(b, nir_imm_ivec4(b, -1, -1, -1, -1),
883                         .base = V_008DFC_SQ_EXP_POS,
884                         .flags = AC_EXP_FLAG_DONE,
885                         .write_mask = 0xf);
886       }
887       nir_pop_if(b, if_thread_0);
888    }
889    nir_push_else(b, if_prim_cnt_0);
890    {
891       nir_sendmsg_amd(b, nir_ior(b, nir_ishl_imm(b, num_prim, 12), num_vtx), .base = AC_SENDMSG_GS_ALLOC_REQ);
892    }
893    nir_pop_if(b, if_prim_cnt_0);
894 }
895 
896 /**
897  * Emits code for allocating space for vertices and primitives for NGG shaders.
898  * The caller should only call this conditionally on wave 0.
899  * When either the vertex or primitive count is 0, both should be set to 0.
900  */
901 void
902 ac_nir_ngg_alloc_vertices_and_primitives(nir_builder *b,
903                                          nir_def *num_vtx,
904                                          nir_def *num_prim,
905                                          bool fully_culled_workaround)
906 {
907    if (fully_culled_workaround) {
908       ac_nir_ngg_alloc_vertices_fully_culled_workaround(b, num_vtx, num_prim);
909       return;
910    }
911 
912    /* Send GS Alloc Request message from the first wave of the group to SPI.
913     * Message payload (in the m0 register) is:
914     * - bits 0..10: number of vertices in group
915     * - bits 12..22: number of primitives in group
916     */
917    nir_sendmsg_amd(b, nir_ior(b, nir_ishl_imm(b, num_prim, 12), num_vtx), .base = AC_SENDMSG_GS_ALLOC_REQ);
918 }
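/* Added note: e.g. requesting 3 vertices and 1 primitive sends
 *    m0 = (1 << 12) | 3 = 0x1003
 * with AC_SENDMSG_GS_ALLOC_REQ, matching the payload layout documented above.
 */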
919 
920 void
921 ac_nir_create_output_phis(nir_builder *b,
922                           const uint64_t outputs_written,
923                           const uint64_t outputs_written_16bit,
924                           ac_nir_prerast_out *out)
925 {
926    nir_def *undef = nir_undef(b, 1, 32); /* inserted at the start of the shader */
927 
928    u_foreach_bit64(slot, outputs_written) {
929       for (unsigned j = 0; j < 4; j++) {
930          if (out->outputs[slot][j])
931             out->outputs[slot][j] = nir_if_phi(b, out->outputs[slot][j], undef);
932       }
933    }
934 
935    u_foreach_bit64(i, outputs_written_16bit) {
936       for (unsigned j = 0; j < 4; j++) {
937          if (out->outputs_16bit_hi[i][j])
938             out->outputs_16bit_hi[i][j] = nir_if_phi(b, out->outputs_16bit_hi[i][j], undef);
939 
940          if (out->outputs_16bit_lo[i][j])
941             out->outputs_16bit_lo[i][j] = nir_if_phi(b, out->outputs_16bit_lo[i][j], undef);
942       }
943    }
944 }
945 
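/* Added note: scatter values[i] into lane i of the returned SSA value for each
 * bit set in lane_mask; used below to move per-buffer values from scalar
 * registers into the first few lanes before issuing per-lane atomics.
 */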
946 static nir_def *
947 write_values_to_lanes(nir_builder *b, nir_def **values, unsigned lane_mask)
948 {
949    nir_def *lanes = nir_imm_int(b, 0);
950 
951    u_foreach_bit(i, lane_mask) {
952       lanes = nir_write_invocation_amd(b, lanes, values[i], nir_imm_int(b, i));
953    }
954    return lanes;
955 }
956 
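/* Added note: the inverse gather of the helper above; reads one 32-bit value
 * from each of lanes 0..3 selected by lane_mask and returns them as a vec4
 * visible to all lanes.
 */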
957 static nir_def *
958 read_values_from_4_lanes(nir_builder *b, nir_def *values, unsigned lane_mask)
959 {
960    nir_def *undef = nir_undef(b, 1, 32);
961    nir_def *per_lane[4] = {undef, undef, undef, undef};
962 
963    u_foreach_bit(i, lane_mask) {
964       per_lane[i] = nir_read_invocation(b, values, nir_imm_int(b, i));
965    }
966    return nir_vec(b, per_lane, 4);
967 }
968 
969 void
970 ac_nir_ngg_build_streamout_buffer_info(nir_builder *b,
971                                        nir_xfb_info *info,
972                                        enum amd_gfx_level gfx_level,
973                                        bool has_xfb_prim_query,
974                                        bool use_gfx12_xfb_intrinsic,
975                                        nir_def *scratch_base,
976                                        nir_def *tid_in_tg,
977                                        nir_def *gen_prim[4],
978                                        nir_def *so_buffer_ret[4],
979                                        nir_def *buffer_offsets_ret[4],
980                                        nir_def *emit_prim_ret[4])
981 {
982    nir_def *prim_stride[4] = {0};
983    nir_def *undef = nir_undef(b, 1, 32);
984 
985    /* radeonsi passes this value as a shader argument for VS. Streamout needs the accurate
986     * number of vertices per primitive to write the right amount of data to the buffer.
987     */
988    nir_def *num_vert_per_prim = nir_load_num_vertices_per_primitive_amd(b);
989    for (unsigned buffer = 0; buffer < 4; buffer++) {
990       if (!(info->buffers_written & BITFIELD_BIT(buffer)))
991          continue;
992 
993       assert(info->buffers[buffer].stride);
994 
995       prim_stride[buffer] =
996          nir_imul_imm(b, num_vert_per_prim, info->buffers[buffer].stride);
997       so_buffer_ret[buffer] = nir_load_streamout_buffer_amd(b, .base = buffer);
998    }
999 
1000    nir_if *if_invocation_0 = nir_push_if(b, nir_ieq_imm(b, tid_in_tg, 0));
1001    {
1002       nir_def *any_buffer_valid = nir_imm_false(b);
1003       nir_def *workgroup_buffer_sizes[4];
1004 
1005       for (unsigned buffer = 0; buffer < 4; buffer++) {
1006          if (info->buffers_written & BITFIELD_BIT(buffer)) {
1007             nir_def *buffer_size = nir_channel(b, so_buffer_ret[buffer], 2);
1008             /* In radeonsi, we may not know at compile time whether a transform
1009              * feedback buffer is bound, so check the buffer size at runtime and
1010              * skip the GDS update for unbound buffers. Otherwise a previous draw
1011              * compiled with streamout but without a bound feedback buffer would
1012              * miss the GDS update and corrupt the current draw's streamout.
1013              */
1014             nir_def *buffer_valid = nir_ine_imm(b, buffer_size, 0);
1015             nir_def *inc_buffer_size =
1016                nir_imul(b, gen_prim[info->buffer_to_stream[buffer]], prim_stride[buffer]);
1017             workgroup_buffer_sizes[buffer] =
1018                nir_bcsel(b, buffer_valid, inc_buffer_size, nir_imm_int(b, 0));
1019             any_buffer_valid = nir_ior(b, any_buffer_valid, buffer_valid);
1020          } else
1021             workgroup_buffer_sizes[buffer] = undef;
1022       }
1023 
1024       nir_def *buffer_offsets = NULL, *xfb_state_address = NULL, *xfb_voffset = NULL;
1025 
1026       /* Get the current global offset of each buffer and increase it by the
1027        * workgroup's buffer size. This is an ordered operation sorted by
1028        * ordered_id; each buffer's info is in one channel of a vec4.
1029        */
1030       if (gfx_level >= GFX12) {
1031          nir_pop_if(b, if_invocation_0);
1032 
1033          for (unsigned buffer = 0; buffer < 4; buffer++)
1034             workgroup_buffer_sizes[buffer] = nir_if_phi(b, workgroup_buffer_sizes[buffer], undef);
1035          any_buffer_valid = nir_if_phi(b, any_buffer_valid, nir_undef(b, 1, 1));
1036 
1037          /* These must be set after nir_pop_if and phis. */
1038          xfb_state_address = nir_load_xfb_state_address_gfx12_amd(b);
1039          xfb_voffset = nir_imul_imm(b, tid_in_tg, 8);
1040 
1041          nir_if *if_4lanes = nir_push_if(b, nir_iand(b, any_buffer_valid, nir_ult_imm(b, tid_in_tg, 4)));
1042          {
1043             /* Move workgroup buffer sizes from SGPRs to the first 4 lanes. */
1044             nir_def *workgroup_buffer_size_per_lane =
1045                write_values_to_lanes(b, workgroup_buffer_sizes, info->buffers_written);
1046             nir_def *ordered_id = nir_load_ordered_id_amd(b);
1047 
1048             /* The atomic value for the 4 lanes is:
1049              *    lane 0: uvec2(ordered_id, workgroup_buffer_size0)
1050              *    lane 1: uvec2(ordered_id, workgroup_buffer_size1)
1051              *    lane 2: uvec2(ordered_id, workgroup_buffer_size2)
1052              *    lane 3: uvec2(ordered_id, workgroup_buffer_size3)
1053              */
1054             nir_def *atomic_src = nir_pack_64_2x32_split(b, ordered_id,
1055                                                          workgroup_buffer_size_per_lane);
1056 
1057             /* The memory layout of the xfb state is:
1058              *    struct {
1059              *       unsigned ordered_id;
1060              *       unsigned dwords_written0;
1061              *       unsigned ordered_id;
1062              *       unsigned dwords_written1;
1063              *       unsigned ordered_id;
1064              *       unsigned dwords_written2;
1065              *       unsigned ordered_id;
1066              *       unsigned dwords_written3;
1067              *    };
1068              *
1069              * Notes:
1070              * - global_atomic_ordered_add_b64 is semantically a 64-bit atomic, requiring 8-byte
1071              *   address alignment, even though it operates on a pair of 32-bit values.
1072              * - The whole structure is updated at once by issuing the atomic from 4 lanes
1073              *   with 8-byte address increments.
1074              * - The whole structure should be entirely within one 64B block of memory
1075              *   for performance. (the address bits above 64B should not differ between lanes)
1076              */
1077             nir_def *buffer_offset_per_lane;
1078 
1079             /* The gfx12 intrinsic inserts hand-written assembly producing better code than current
1080              * LLVM.
1081              */
1082             if (use_gfx12_xfb_intrinsic) {
1083                buffer_offset_per_lane =
1084                   nir_ordered_add_loop_gfx12_amd(b, xfb_state_address, xfb_voffset, ordered_id,
1085                                                  atomic_src);
1086 
1087                /* Move the buffer offsets from the 4 lanes to lane 0. */
1088                buffer_offsets = read_values_from_4_lanes(b, buffer_offset_per_lane, info->buffers_written);
1089             } else {
1090                /* The NIR version of the above using nir_atomic_op_ordered_add_gfx12_amd. */
1091                enum { NUM_ATOMICS_IN_FLIGHT = 6 };
1092 
1093                nir_variable *result_ring[NUM_ATOMICS_IN_FLIGHT] = {0};
1094                for (unsigned i = 0; i < NUM_ATOMICS_IN_FLIGHT; i++)
1095                   result_ring[i] = nir_local_variable_create(b->impl, glsl_uint64_t_type(), "result");
1096 
1097                /* Issue the first N-1 atomics. The shader must not wait because we want them to be
1098                 * pipelined. It will only wait for the oldest atomic in the NIR loop.
1099                 */
1100                for (unsigned i = 0; i < NUM_ATOMICS_IN_FLIGHT - 1; i++) {
1101                   nir_store_var(b, result_ring[i],
1102                                 nir_global_atomic_amd(b, 64, xfb_state_address, atomic_src, xfb_voffset,
1103                                                       .atomic_op = nir_atomic_op_ordered_add_gfx12_amd), 0x1);
1104                   ac_nir_sleep(b, 24);
1105                }
1106 
1107                nir_variable *buffer_offsets_var =
1108                   nir_local_variable_create(b->impl, glsl_vec4_type(), "buffer_offset_per_lane");
1109 
1110                nir_loop *loop = nir_push_loop(b);
1111                {
1112                   for (unsigned i = 0; i < NUM_ATOMICS_IN_FLIGHT; i++) {
1113                      int issue_index = (NUM_ATOMICS_IN_FLIGHT - 1 + i) % NUM_ATOMICS_IN_FLIGHT;
1114                      int read_index = i;
1115 
1116                      /* Issue (or repeat) the atomic. */
1117                      nir_store_var(b, result_ring[issue_index],
1118                                    nir_global_atomic_amd(b, 64, xfb_state_address, atomic_src, xfb_voffset,
1119                                                          .atomic_op = nir_atomic_op_ordered_add_gfx12_amd), 0x1);
1120 
1121                      /* Break if the oldest atomic succeeded in incrementing the offsets. */
1122                      nir_def *oldest_result = nir_load_var(b, result_ring[read_index]);
1123                      nir_def *loaded_ordered_id = nir_unpack_64_2x32_split_x(b, oldest_result);
1124 
1125                      /* Debug: Write the vec4 into a shader log ring buffer. */
1126 #if 0
1127                      nir_def *loaded_dwords_written = nir_unpack_64_2x32_split_y(b, oldest_result);
1128                      ac_nir_store_debug_log_amd(b, nir_vec4(b, nir_u2u32(b, xfb_state_address),
1129                                                             ordered_id, loaded_ordered_id,
1130                                                             loaded_dwords_written));
1131 #endif
1132 
1133                      nir_def *continue_if = nir_ieq(b, loaded_ordered_id, ordered_id);
1134                      continue_if = nir_inot(b, nir_vote_any(b, 1, continue_if));
1135                      nir_push_if(b, continue_if);
1136                   }
1137                   nir_jump(b, nir_jump_continue);
1138 
1139                   for (unsigned i = 0; i < NUM_ATOMICS_IN_FLIGHT; i++) {
1140                      int read_index = NUM_ATOMICS_IN_FLIGHT - 1 - i;
1141                      nir_push_else(b, NULL);
1142                      {
1143                         nir_def *result = nir_load_var(b, result_ring[read_index]);
1144                         buffer_offset_per_lane = nir_unpack_64_2x32_split_y(b, result);
1145                         buffer_offsets = read_values_from_4_lanes(b, buffer_offset_per_lane, info->buffers_written);
1146                         nir_store_var(b, buffer_offsets_var, buffer_offsets, info->buffers_written);
1147                      }
1148                      nir_pop_if(b, NULL);
1149                   }
1150                   nir_jump(b, nir_jump_break);
1151                }
1152                nir_pop_loop(b, loop);
1153                buffer_offsets = nir_load_var(b, buffer_offsets_var);
1154             }
1155          }
1156          nir_pop_if(b, if_4lanes);
1157          buffer_offsets = nir_if_phi(b, buffer_offsets, nir_undef(b, 4, 32));
1158 
1159          if_invocation_0 = nir_push_if(b, nir_ieq_imm(b, tid_in_tg, 0));
1160       } else {
1161          nir_def *ordered_id = nir_load_ordered_id_amd(b);
1162          buffer_offsets =
1163             nir_ordered_xfb_counter_add_gfx11_amd(b, ordered_id,
1164                                                   nir_vec(b, workgroup_buffer_sizes, 4),
1165                                                   /* mask of buffers to update */
1166                                                   .write_mask = info->buffers_written);
1167       }
1168 
1169       nir_def *emit_prim[4];
1170       memcpy(emit_prim, gen_prim, 4 * sizeof(nir_def *));
1171 
1172       nir_def *any_overflow = nir_imm_false(b);
1173       nir_def *overflow_amount[4] = {undef, undef, undef, undef};
1174 
1175       for (unsigned buffer = 0; buffer < 4; buffer++) {
1176          if (!(info->buffers_written & BITFIELD_BIT(buffer)))
1177             continue;
1178 
1179          nir_def *buffer_size = nir_channel(b, so_buffer_ret[buffer], 2);
1180 
1181          /* Only consider overflow for valid feedback buffers because
1182           * otherwise the ordered operation above (GDS atomic return) might
1183           * return non-zero offsets for invalid buffers.
1184           */
1185          nir_def *buffer_valid = nir_ine_imm(b, buffer_size, 0);
1186          nir_def *buffer_offset = nir_channel(b, buffer_offsets, buffer);
1187          buffer_offset = nir_bcsel(b, buffer_valid, buffer_offset, nir_imm_int(b, 0));
1188 
1189          nir_def *remain_size = nir_isub(b, buffer_size, buffer_offset);
1190          nir_def *remain_prim = nir_idiv(b, remain_size, prim_stride[buffer]);
1191          nir_def *overflow = nir_ilt(b, buffer_size, buffer_offset);
1192 
1193          any_overflow = nir_ior(b, any_overflow, overflow);
1194          overflow_amount[buffer] = nir_imax(b, nir_imm_int(b, 0),
1195                                             nir_isub(b, buffer_offset, buffer_size));
1196 
1197          unsigned stream = info->buffer_to_stream[buffer];
1198          /* If a previous workgroup overflowed, we can't emit any primitives. */
1199          emit_prim[stream] = nir_bcsel(
1200             b, overflow, nir_imm_int(b, 0),
1201             /* Otherwise emit only as many primitives as the smallest remaining buffer can hold. */
1202             nir_imin(b, emit_prim[stream], remain_prim));
1203 
1204          /* Save to LDS for being accessed by other waves in this workgroup. */
1205          nir_store_shared(b, buffer_offset, scratch_base, .base = buffer * 4);
1206       }
1207 
1208       /* We have to fix up the streamout offsets if we overflowed because they determine
1209        * the vertex count for DrawTransformFeedback.
1210        */
1211       if (gfx_level >= GFX12) {
1212          nir_pop_if(b, if_invocation_0);
1213 
1214          any_overflow = nir_if_phi(b, any_overflow, nir_undef(b, 1, 1));
1215          for (unsigned buffer = 0; buffer < 4; buffer++)
1216             overflow_amount[buffer] = nir_if_phi(b, overflow_amount[buffer], undef);
1217          for (unsigned stream = 0; stream < 4; stream++) {
1218             if (emit_prim[stream])
1219                emit_prim[stream] = nir_if_phi(b, emit_prim[stream], undef);
1220          }
1221 
1222          nir_if *if_any_overflow_4_lanes =
1223             nir_push_if(b, nir_iand(b, any_overflow, nir_ult_imm(b, tid_in_tg, 4)));
1224          {
1225             /* Move overflow amounts from SGPRs to the first 4 lanes. */
1226             nir_def *overflow_amount_per_lane =
1227                write_values_to_lanes(b, overflow_amount, info->buffers_written);
1228 
1229             nir_global_atomic_amd(b, 32, xfb_state_address, nir_ineg(b, overflow_amount_per_lane),
1230                                   xfb_voffset, .base = 4, .atomic_op = nir_atomic_op_iadd);
1231          }
1232          nir_pop_if(b, if_any_overflow_4_lanes);
1233 
1234          if_invocation_0 = nir_push_if(b, nir_ieq_imm(b, tid_in_tg, 0));
1235       } else {
1236          nir_if *if_any_overflow = nir_push_if(b, any_overflow);
1237          nir_xfb_counter_sub_gfx11_amd(b, nir_vec(b, overflow_amount, 4),
1238                                        /* mask of buffers to update */
1239                                        .write_mask = info->buffers_written);
1240          nir_pop_if(b, if_any_overflow);
1241       }
1242 
1243       /* Save to LDS for being accessed by other waves in this workgroup. */
1244       for (unsigned stream = 0; stream < 4; stream++) {
1245          if (!(info->streams_written & BITFIELD_BIT(stream)))
1246             continue;
1247 
1248          nir_store_shared(b, emit_prim[stream], scratch_base, .base = 16 + stream * 4);
1249       }
1250 
1251       /* Update shader query. */
1252       if (has_xfb_prim_query) {
1253          nir_if *if_shader_query = nir_push_if(b, nir_load_prim_xfb_query_enabled_amd(b));
1254          {
1255             for (unsigned stream = 0; stream < 4; stream++) {
1256                if (info->streams_written & BITFIELD_BIT(stream))
1257                   nir_atomic_add_xfb_prim_count_amd(b, emit_prim[stream], .stream_id = stream);
1258             }
1259          }
1260          nir_pop_if(b, if_shader_query);
1261       }
1262    }
1263    nir_pop_if(b, if_invocation_0);
1264 
1265    nir_barrier(b, .execution_scope = SCOPE_WORKGROUP,
1266                       .memory_scope = SCOPE_WORKGROUP,
1267                       .memory_semantics = NIR_MEMORY_ACQ_REL,
1268                       .memory_modes = nir_var_mem_shared);
1269 
1270    /* Fetch the per-buffer offsets in all waves. */
1271    for (unsigned buffer = 0; buffer < 4; buffer++) {
1272       if (!(info->buffers_written & BITFIELD_BIT(buffer)))
1273          continue;
1274 
1275       buffer_offsets_ret[buffer] =
1276          nir_load_shared(b, 1, 32, scratch_base, .base = buffer * 4);
1277    }
1278 
1279    /* Fetch the per-stream emit prim in all waves. */
1280    for (unsigned stream = 0; stream < 4; stream++) {
1281       if (!(info->streams_written & BITFIELD_BIT(stream)))
1282          continue;
1283 
1284       emit_prim_ret[stream] =
1285          nir_load_shared(b, 1, 32, scratch_base, .base = 16 + stream * 4);
1286    }
1287 }
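/* Added note: the scratch LDS layout used above is 4 dwords of per-buffer
 * offsets at scratch_base + buffer * 4, followed by 4 dwords of per-stream
 * emitted-primitive counts at scratch_base + 16 + stream * 4.
 */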
1288 
1289 void
1290 ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info,
1291                                   unsigned stream, nir_def *so_buffer[4],
1292                                   nir_def *buffer_offsets[4],
1293                                   unsigned vertex_index, nir_def *vtx_lds_addr,
1294                                   ac_nir_prerast_out *pr_out,
1295                                   bool skip_primitive_id)
1296 {
1297    unsigned vertex_offset[NIR_MAX_XFB_BUFFERS] = {0};
1298 
1299    u_foreach_bit(buffer, info->buffers_written) {
1300       /* We use imm_offset for the vertex offset within a primitive, and GFX11 only supports
1301        * 12-bit unsigned imm_offset. (GFX12 supports 24-bit signed imm_offset)
1302        */
1303       assert(info->buffers[buffer].stride * 3 < 4096);
1304       vertex_offset[buffer] = vertex_index * info->buffers[buffer].stride;
1305    }
1306 
1307    nir_def *zero = nir_imm_int(b, 0);
1308    unsigned num_values = 0, store_offset = 0, store_buffer_index = 0;
1309    nir_def *values[4];
1310 
1311    for (unsigned i = 0; i < info->output_count; i++) {
1312       nir_xfb_output_info *out = info->outputs + i;
1313       if (!out->component_mask || info->buffer_to_stream[out->buffer] != stream)
1314          continue;
1315 
1316       unsigned base;
1317       if (out->location >= VARYING_SLOT_VAR0_16BIT) {
1318          base =
1319             util_bitcount64(b->shader->info.outputs_written) +
1320             util_bitcount(b->shader->info.outputs_written_16bit &
1321                           BITFIELD_MASK(out->location - VARYING_SLOT_VAR0_16BIT));
1322       } else {
1323          uint64_t outputs_written = b->shader->info.outputs_written;
1324          if (skip_primitive_id)
1325             outputs_written &= ~VARYING_BIT_PRIMITIVE_ID;
1326 
1327          base =
1328             util_bitcount64(outputs_written &
1329                             BITFIELD64_MASK(out->location));
1330       }
1331 
1332       unsigned offset = (base * 4 + out->component_offset) * 4;
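      /* Added note: vertices are laid out in LDS as consecutive vec4s in
       * outputs_written order (16-bit slots packed after the 32-bit ones), so
       * a component's byte address is (packed_slot * 4 + component) * 4.
       */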
1333       unsigned count = util_bitcount(out->component_mask);
1334 
1335       assert(u_bit_consecutive(out->component_offset, count) == out->component_mask);
1336 
1337       nir_def *out_data =
1338          nir_load_shared(b, count, 32, vtx_lds_addr, .base = offset);
1339 
1340       for (unsigned comp = 0; comp < count; comp++) {
1341          nir_def *data = nir_channel(b, out_data, comp);
1342 
1343          /* Convert 16-bit outputs to 32-bit.
1344           *
1345           * OpenGL ES puts 16-bit medium-precision varyings in VARYING_SLOT_VAR0_16BIT
1346           * slots, and we need to convert them to 32-bit for streamout.
1347           *
1348           * Vulkan doesn't allow 8/16-bit varyings for streamout.
1349           */
1350          if (out->location >= VARYING_SLOT_VAR0_16BIT) {
1351             unsigned index = out->location - VARYING_SLOT_VAR0_16BIT;
1352             unsigned c = out->component_offset + comp;
1353             nir_def *v;
1354             nir_alu_type t;
1355 
1356             if (out->high_16bits) {
1357                v = nir_unpack_32_2x16_split_y(b, data);
1358                t = pr_out->types_16bit_hi[index][c];
1359             } else {
1360                v = nir_unpack_32_2x16_split_x(b, data);
1361                t = pr_out->types_16bit_lo[index][c];
1362             }
1363 
1364             t = nir_alu_type_get_base_type(t);
1365             data = nir_convert_to_bit_size(b, v, t, 32);
1366          }
1367 
1368          const unsigned store_comp_offset = out->offset + comp * 4;
1369          const bool has_hole = store_offset + num_values * 4 != store_comp_offset;
1370 
1371          /* Flush the gathered components to memory as a vec4 store or less if there is a hole. */
1372          if (num_values && (num_values == 4 || store_buffer_index != out->buffer || has_hole)) {
1373             nir_store_buffer_amd(b, nir_vec(b, values, num_values), so_buffer[store_buffer_index],
1374                                  buffer_offsets[store_buffer_index], zero, zero,
1375                                  .base = vertex_offset[store_buffer_index] + store_offset,
1376                                  .access = ACCESS_NON_TEMPORAL);
1377             num_values = 0;
1378          }
1379 
1380          /* Initialize the buffer index and offset if we are beginning a new vec4 store. */
1381          if (num_values == 0) {
1382             store_buffer_index = out->buffer;
1383             store_offset = store_comp_offset;
1384          }
1385 
1386          values[num_values++] = data;
1387       }
1388    }
1389 
1390    if (num_values) {
1391       /* Flush the remaining components to memory (as an up to vec4 store) */
1392       nir_store_buffer_amd(b, nir_vec(b, values, num_values), so_buffer[store_buffer_index],
1393                            buffer_offsets[store_buffer_index], zero, zero,
1394                            .base = vertex_offset[store_buffer_index] + store_offset,
1395                            .access = ACCESS_NON_TEMPORAL);
1396    }
1397 }
1398