1 /*
2  * Copyright © 2021 Valve Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "ac_nir.h"
8 #include "ac_nir_helpers.h"
9 #include "ac_gpu_info.h"
10 #include "amdgfxregs.h"
11 #include "nir_builder.h"
12 #include "nir_xfb_info.h"
13 #include "util/u_math.h"
14 #include "util/u_vector.h"
15 
16 enum {
17    nggc_passflag_used_by_pos = 1,
18    nggc_passflag_used_by_other = 2,
19    nggc_passflag_used_by_both = nggc_passflag_used_by_pos | nggc_passflag_used_by_other,
20 };
21 
22 typedef struct
23 {
24    nir_def *ssa;
25    nir_variable *var;
26 } reusable_nondeferred_variable;
27 
28 typedef struct
29 {
30    const ac_nir_lower_ngg_options *options;
31 
32    nir_variable *position_value_var;
33    nir_variable *prim_exp_arg_var;
34 
35    /**
36     * Whether the current invocation's vertex (if any) is accepted by the culling algorithm.
37     * Only used when culling is enabled.
38     */
39    nir_variable *es_accepted_var;
40 
41    /**
42     * Whether the current invocation's primitive (if any) is accepted by the culling algorithm.
43     * Only used when culling is enabled.
44     */
45    nir_variable *gs_accepted_var;
46 
47    /**
48     * Whether the current invocation's primitive (if any) should be exported.
49     * Initially set to whether the invocation has a vertex, then set to false by the culling
50     * algorithm if the primitive is rejected.
51     */
52    nir_variable *gs_exported_var;
53 
54    nir_variable *gs_vtx_indices_vars[3];
55 
56    nir_def *vtx_addr[3];
57 
58    struct u_vector reusable_nondeferred_variables;
59 
60    bool early_prim_export;
61    bool streamout_enabled;
62    bool has_user_edgeflags;
63    bool skip_primitive_id;
64    unsigned max_num_waves;
65 
66    /* LDS params */
67    unsigned pervertex_lds_bytes;
68 
69    uint64_t inputs_needed_by_pos;
70    uint64_t inputs_needed_by_others;
71 
72    nir_instr *compact_arg_stores[4];
73    nir_intrinsic_instr *overwrite_args;
74    nir_variable *repacked_rel_patch_id;
75 
76    /* clip distance */
77    nir_variable *clip_vertex_var;
78    nir_variable *clipdist_neg_mask_var;
79    bool has_clipdist;
80 
81    /* outputs */
82    ac_nir_prerast_out out;
83 } lower_ngg_nogs_state;
84 
85 /* Per-vertex LDS layout of culling shaders */
86 enum {
87    /* Position of the ES vertex (at the beginning for alignment reasons) */
88    lds_es_pos_x = 0,
89    lds_es_pos_y = 4,
90    lds_es_pos_z = 8,
91    lds_es_pos_w = 12,
92 
93    /* 1 when the vertex is accepted, 0 if it should be culled */
94    lds_es_vertex_accepted = 16,
95    /* ID of the thread which will export the current thread's vertex */
96    lds_es_exporter_tid = 17,
97    /* bit i is set when the i'th clip distance of a vertex is negative */
98    lds_es_clipdist_neg_mask = 18,
99    /* TES only, relative patch ID, less than max workgroup size */
100    lds_es_tes_rel_patch_id = 19,
101 
102    /* Repacked arguments - also listed separately for VS and TES */
103    lds_es_arg_0 = 20,
104 };
105 
106 static nir_def *
107 pervertex_lds_addr(nir_builder *b, nir_def *vertex_idx, unsigned per_vtx_bytes)
108 {
109    return nir_imul_imm(b, vertex_idx, per_vtx_bytes);
110 }
111 
112 static void
113 ngg_nogs_init_vertex_indices_vars(nir_builder *b, nir_function_impl *impl, lower_ngg_nogs_state *s)
114 {
115    for (unsigned v = 0; v < s->options->num_vertices_per_primitive; ++v) {
116       s->gs_vtx_indices_vars[v] = nir_local_variable_create(impl, glsl_uint_type(), "gs_vtx_addr");
117 
118       nir_def *vtx;
119 
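      /* The input vertex index encoding differs per generation:
       * - GFX12 packs three 8-bit indices at 9-bit strides into the passthrough
       *   primitive argument (the remaining bit of each field is the edge flag).
       * - Older chips in passthrough mode use 9-bit indices at 10-bit strides.
       * - Otherwise, two 16-bit indices are packed into each GS vertex offset
       *   argument.
       */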
120       if (s->options->hw_info->gfx_level >= GFX12) {
121          vtx = nir_ubfe_imm(b, nir_load_packed_passthrough_primitive_amd(b), 9 * v, 8);
122       } else if (s->options->passthrough) {
123          vtx = nir_ubfe_imm(b, nir_load_packed_passthrough_primitive_amd(b), 10 * v, 9);
124       } else {
125          vtx = nir_ubfe_imm(b, nir_load_gs_vertex_offset_amd(b, .base = v / 2u),
126                             (v & 1u) * 16u, 16u);
127       }
128 
129       nir_store_var(b, s->gs_vtx_indices_vars[v], vtx, 0x1);
130    }
131 }
132 
133 static nir_def *
134 emit_ngg_nogs_prim_exp_arg(nir_builder *b, lower_ngg_nogs_state *s)
135 {
136    if (s->options->hw_info->gfx_level >= GFX12 || s->options->passthrough) {
137       return nir_load_packed_passthrough_primitive_amd(b);
138    } else {
139       nir_def *vtx_idx[3] = {0};
140 
141       for (unsigned v = 0; v < s->options->num_vertices_per_primitive; ++v)
142          vtx_idx[v] = nir_load_var(b, s->gs_vtx_indices_vars[v]);
143 
144       return ac_nir_pack_ngg_prim_exp_arg(b, s->options->num_vertices_per_primitive, vtx_idx, NULL,
145                                         s->options->hw_info->gfx_level);
146    }
147 }
148 
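/* The merged wave info argument packs per-wave input counts: bits [7:0] hold the
 * number of ES vertices and bits [15:8] the number of GS primitives (see
 * nogs_prim_gen_query, which reads bits [15:8] as the input primitive count).
 * The helpers below check whether the current invocation has such an input.
 */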
149 static nir_def *
150 has_input_vertex(nir_builder *b)
151 {
152    return nir_is_subgroup_invocation_lt_amd(b, nir_load_merged_wave_info_amd(b));
153 }
154 
155 static nir_def *
156 has_input_primitive(nir_builder *b)
157 {
158    return nir_is_subgroup_invocation_lt_amd(b, nir_load_merged_wave_info_amd(b), .base = 8);
159 }
160 
161 static void
162 nogs_prim_gen_query(nir_builder *b, lower_ngg_nogs_state *s)
163 {
164    if (!s->options->has_gen_prim_query)
165       return;
166 
167    nir_if *if_shader_query = nir_push_if(b, nir_load_prim_gen_query_enabled_amd(b));
168    {
169       /* Activate only 1 lane and add the number of primitives to query result. */
170       nir_if *if_elected = nir_push_if(b, nir_elect(b, 1));
171       {
172          /* Number of input primitives in the current wave. */
173          nir_def *num_input_prims = nir_ubfe_imm(b, nir_load_merged_wave_info_amd(b),
174                                                      8, 8);
175 
176          /* Add to stream 0 primitive generated counter. */
177          nir_atomic_add_gen_prim_count_amd(b, num_input_prims, .stream_id = 0);
178       }
179       nir_pop_if(b, if_elected);
180    }
181    nir_pop_if(b, if_shader_query);
182 }
183 
184 static nir_if *
185 emit_ngg_nogs_prim_export(nir_builder *b, lower_ngg_nogs_state *s, nir_def *arg)
186 {
187    nir_if *if_gs_thread = nir_push_if(b, nir_load_var(b, s->gs_exported_var));
188    {
189       if (!arg)
190          arg = emit_ngg_nogs_prim_exp_arg(b, s);
191 
192       /* pack user edge flag info into arg */
193       if (s->has_user_edgeflags) {
194          /* Workgroup barrier: wait for ES threads to store user edge flags to LDS. */
195          nir_barrier(b, .execution_scope = SCOPE_WORKGROUP,
196                             .memory_scope = SCOPE_WORKGROUP,
197                             .memory_semantics = NIR_MEMORY_ACQ_REL,
198                             .memory_modes = nir_var_mem_shared);
199 
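         /* Build a mask that keeps every non-edge-flag bit of the export argument
          * and only those edge-flag bits whose user edge flag (read back from LDS
          * below) is set.
          */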
200          unsigned edge_flag_bits = ac_get_all_edge_flag_bits(s->options->hw_info->gfx_level);
201          nir_def *mask = nir_imm_intN_t(b, ~edge_flag_bits, 32);
202 
203          unsigned edge_flag_offset = 0;
204          if (s->streamout_enabled) {
205             unsigned packed_location =
206                util_bitcount64(b->shader->info.outputs_written &
207                                BITFIELD64_MASK(VARYING_SLOT_EDGE));
208             edge_flag_offset = packed_location * 16;
209          }
210 
211          for (int i = 0; i < s->options->num_vertices_per_primitive; i++) {
212             nir_def *vtx_idx = nir_load_var(b, s->gs_vtx_indices_vars[i]);
213             nir_def *addr = pervertex_lds_addr(b, vtx_idx, s->pervertex_lds_bytes);
214             nir_def *edge = nir_load_shared(b, 1, 32, addr, .base = edge_flag_offset);
215 
216             if (s->options->hw_info->gfx_level >= GFX12)
217                mask = nir_ior(b, mask, nir_ishl_imm(b, edge, 8 + i * 9));
218             else
219                mask = nir_ior(b, mask, nir_ishl_imm(b, edge, 9 + i * 10));
220          }
221          arg = nir_iand(b, arg, mask);
222       }
223 
224       ac_nir_export_primitive(b, arg, NULL);
225 
226       /* Store implicit primitive ID when configured as a per-primitive output on
227        * GPUs without an attribute ring.
228        * Because this uses the export space, do it together with the primitive export.
229        */
230       if (!s->options->hw_info->has_attr_ring && s->options->export_primitive_id_per_prim) {
231          const uint8_t offset = s->options->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID];
232          nir_def *prim_id = nir_load_primitive_id(b);
233          nir_def *undef = nir_undef(b, 1, 32);
234          ac_nir_prerast_out out = {
235             .infos = {{.components_mask = 1, .as_varying_mask = 1}},
236             .outputs = {{prim_id, undef, undef, undef}}
237          };
238 
239          ac_nir_export_parameters(b, &offset, 1, 0, &out);
240       }
241    }
242    nir_pop_if(b, if_gs_thread);
243    return if_gs_thread;
244 }
245 
246 static void
247 emit_ngg_nogs_prim_id_store_shared(nir_builder *b, lower_ngg_nogs_state *s)
248 {
249    nir_def *gs_thread =
250       s->gs_accepted_var ? nir_load_var(b, s->gs_accepted_var) : has_input_primitive(b);
251 
252    nir_if *if_gs_thread = nir_push_if(b, gs_thread);
253    {
254       /* Copy Primitive IDs from GS threads to the LDS address
255        * corresponding to the ES thread of the provoking vertex.
256        * It will be exported as a per-vertex attribute.
257        */
258       nir_def *gs_vtx_indices[3];
259       for (unsigned i = 0; i < s->options->num_vertices_per_primitive; i++)
260          gs_vtx_indices[i] = nir_load_var(b, s->gs_vtx_indices_vars[i]);
261 
262       nir_def *provoking_vertex = nir_load_provoking_vtx_in_prim_amd(b);
263       nir_def *provoking_vtx_idx = nir_select_from_ssa_def_array(
264          b, gs_vtx_indices, s->options->num_vertices_per_primitive, provoking_vertex);
265 
266       nir_def *prim_id = nir_load_primitive_id(b);
267       nir_def *addr = pervertex_lds_addr(b, provoking_vtx_idx, s->pervertex_lds_bytes);
268 
269       /* The primitive ID is always stored in the last dword of the vertex's LDS space. */
270       nir_store_shared(b, prim_id, addr, .base = s->pervertex_lds_bytes - 4);
271    }
272    nir_pop_if(b, if_gs_thread);
273 }
274 
275 /* Store implicit primitive ID when configured as a per-primitive output
276  * on GPUs with an attribute ring.
277  * This is done separately from the primitive export in order to
278  * optimize attribute ring access.
279  */
280 static void
281 emit_ngg_nogs_prim_id_store_per_prim_to_attr_ring(nir_builder *b, lower_ngg_nogs_state *s)
282 {
283    assert(s->options->hw_info->has_attr_ring);
284 
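   /* Writes to the attribute ring only need to cover lanes up to the last one
    * that exports a primitive, so use (highest set ballot bit + 1) as the
    * thread count.
    */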
285    nir_def *is_gs_thread = nir_load_var(b, s->gs_exported_var);
286    nir_def *highest_gs_thread = nir_ufind_msb(b, nir_ballot(b, 1, s->options->wave_size, is_gs_thread));
287    nir_def *max_num_gs_threads = nir_iadd_imm_nuw(b, highest_gs_thread, 1);
288 
289    const uint8_t offset = s->options->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID];
290    ac_nir_prerast_out out = {
291       .infos = {{.components_mask = 1, .as_varying_mask = 1}},
292       .outputs = {{nir_load_primitive_id(b), NULL, NULL, NULL}}
293    };
294 
295    ac_nir_store_parameters_to_attr_ring(b, &offset, 1, 0, &out, max_num_gs_threads);
296 }
297 
298 static void
299 emit_store_ngg_nogs_es_primitive_id(nir_builder *b, lower_ngg_nogs_state *s)
300 {
301    nir_def *prim_id = NULL;
302 
303    if (b->shader->info.stage == MESA_SHADER_VERTEX) {
304       /* LDS address where the primitive ID is stored */
305       nir_def *thread_id_in_threadgroup = nir_load_local_invocation_index(b);
306       nir_def *addr =
307          pervertex_lds_addr(b, thread_id_in_threadgroup, s->pervertex_lds_bytes);
308 
309       /* Load primitive ID from LDS */
310       prim_id = nir_load_shared(b, 1, 32, addr, .base = s->pervertex_lds_bytes - 4);
311    } else if (b->shader->info.stage == MESA_SHADER_TESS_EVAL) {
312       /* Just use tess eval primitive ID, which is the same as the patch ID. */
313       prim_id = nir_load_primitive_id(b);
314    }
315 
316    s->out.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = prim_id;
317    s->out.infos[VARYING_SLOT_PRIMITIVE_ID].as_varying_mask |= 1;
318 
319    /* Update outputs_written to reflect that the pass added a new output. */
320    b->shader->info.outputs_written |= VARYING_BIT_PRIMITIVE_ID;
321 }
322 
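/* Set bit 'index' in the given mask variable when the clip/cull distance
 * component 'dist' is negative.
 */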
323 static void
324 add_clipdist_bit(nir_builder *b, nir_def *dist, unsigned index, nir_variable *mask)
325 {
326    nir_def *is_neg = nir_flt_imm(b, dist, 0);
327    nir_def *neg_mask = nir_ishl_imm(b, nir_b2i32(b, is_neg), index);
328    neg_mask = nir_ior(b, neg_mask, nir_load_var(b, mask));
329    nir_store_var(b, mask, neg_mask, 1);
330 }
331 
332 static bool
333 remove_culling_shader_output(nir_builder *b, nir_intrinsic_instr *intrin, void *state)
334 {
335    lower_ngg_nogs_state *s = (lower_ngg_nogs_state *) state;
336 
337    /* These are not allowed in VS / TES */
338    assert(intrin->intrinsic != nir_intrinsic_store_per_vertex_output &&
339           intrin->intrinsic != nir_intrinsic_load_per_vertex_input);
340 
341    /* We are only interested in output stores now */
342    if (intrin->intrinsic != nir_intrinsic_store_output)
343       return false;
344 
345    b->cursor = nir_before_instr(&intrin->instr);
346 
347    /* no indirect output */
348    assert(nir_src_is_const(intrin->src[1]) && nir_src_as_uint(intrin->src[1]) == 0);
349 
350    unsigned writemask = nir_intrinsic_write_mask(intrin);
351    unsigned component = nir_intrinsic_component(intrin);
352    nir_def *store_val = intrin->src[0].ssa;
353 
354    /* Position output - store the value to a variable, remove output store */
355    nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
356    switch (io_sem.location) {
357    case VARYING_SLOT_POS:
358       ac_nir_store_var_components(b, s->position_value_var, store_val, component, writemask);
359       break;
360    case VARYING_SLOT_CLIP_DIST0:
361    case VARYING_SLOT_CLIP_DIST1: {
362       unsigned base = io_sem.location == VARYING_SLOT_CLIP_DIST1 ? 4 : 0;
363       base += component;
364 
365       /* valid clipdist component mask */
366       unsigned mask = (s->options->clip_cull_dist_mask >> base) & writemask;
367       u_foreach_bit(i, mask) {
368          add_clipdist_bit(b, nir_channel(b, store_val, i), base + i,
369                           s->clipdist_neg_mask_var);
370          s->has_clipdist = true;
371       }
372       break;
373    }
374    case VARYING_SLOT_CLIP_VERTEX:
375       ac_nir_store_var_components(b, s->clip_vertex_var, store_val, component, writemask);
376       break;
377    default:
378       break;
379    }
380 
381    /* Remove all output stores */
382    nir_instr_remove(&intrin->instr);
383    return true;
384 }
385 
386 static void
387 remove_culling_shader_outputs(nir_shader *culling_shader, lower_ngg_nogs_state *s)
388 {
389    nir_shader_intrinsics_pass(culling_shader, remove_culling_shader_output,
390                               nir_metadata_control_flow, s);
391 
392    /* Remove dead code resulting from the deleted outputs. */
393    bool progress;
394    do {
395       progress = false;
396       NIR_PASS(progress, culling_shader, nir_opt_dead_write_vars);
397       NIR_PASS(progress, culling_shader, nir_opt_dce);
398       NIR_PASS(progress, culling_shader, nir_opt_dead_cf);
399    } while (progress);
400 }
401 
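/* Try to replace later uses of old_def with a reload of the given channel of
 * the replacement variable, so the value doesn't have to be recomputed.
 */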
402 static void
403 rewrite_uses_to_var(nir_builder *b, nir_def *old_def, nir_variable *replacement_var, unsigned replacement_var_channel)
404 {
405    if (old_def->parent_instr->type == nir_instr_type_load_const)
406       return;
407 
408    b->cursor = nir_after_instr(old_def->parent_instr);
409    if (b->cursor.instr->type == nir_instr_type_phi)
410       b->cursor = nir_after_phis(old_def->parent_instr->block);
411 
412    nir_def *pos_val_rep = nir_load_var(b, replacement_var);
413    nir_def *replacement = nir_channel(b, pos_val_rep, replacement_var_channel);
414 
415    if (old_def->num_components > 1) {
416       /* old_def uses a swizzled vector component.
417        * There is no way to replace the uses of just a single vector component,
418        * so instead create a new vector and replace all uses of the old vector.
419        */
420       nir_def *old_def_elements[NIR_MAX_VEC_COMPONENTS] = {0};
421       for (unsigned j = 0; j < old_def->num_components; ++j)
422          old_def_elements[j] = nir_channel(b, old_def, j);
423       replacement = nir_vec(b, old_def_elements, old_def->num_components);
424    }
425 
426    nir_def_rewrite_uses_after(old_def, replacement, replacement->parent_instr);
427 }
428 
429 static bool
430 remove_extra_pos_output(nir_builder *b, nir_intrinsic_instr *intrin, void *state)
431 {
432    lower_ngg_nogs_state *s = (lower_ngg_nogs_state *) state;
433 
434    /* These are not allowed in VS / TES */
435    assert(intrin->intrinsic != nir_intrinsic_store_per_vertex_output &&
436           intrin->intrinsic != nir_intrinsic_load_per_vertex_input);
437 
438    /* We are only interested in output stores now */
439    if (intrin->intrinsic != nir_intrinsic_store_output)
440       return false;
441 
442    nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
443    if (io_sem.location != VARYING_SLOT_POS)
444       return false;
445 
446    b->cursor = nir_before_instr(&intrin->instr);
447 
448    /* In case other outputs use what we calculated for pos,
449     * try to avoid calculating it again by rewriting the usages
450     * of the store components here.
451     */
452    nir_def *store_val = intrin->src[0].ssa;
453    unsigned store_pos_component = nir_intrinsic_component(intrin);
454 
455    nir_instr_remove(&intrin->instr);
456 
457    if (store_val->parent_instr->type == nir_instr_type_alu) {
458       nir_alu_instr *alu = nir_instr_as_alu(store_val->parent_instr);
459       if (nir_op_is_vec_or_mov(alu->op)) {
460          /* Output store uses a vector, we can easily rewrite uses of each vector element. */
461 
462          unsigned num_vec_src = 0;
463          if (alu->op == nir_op_mov)
464             num_vec_src = 1;
465          else if (alu->op == nir_op_vec2)
466             num_vec_src = 2;
467          else if (alu->op == nir_op_vec3)
468             num_vec_src = 3;
469          else if (alu->op == nir_op_vec4)
470             num_vec_src = 4;
471          assert(num_vec_src);
472 
473          /* Remember the current components whose uses we wish to replace.
474           * This is needed because rewriting one source can affect the others too.
475           */
476          nir_def *vec_comps[NIR_MAX_VEC_COMPONENTS] = {0};
477          for (unsigned i = 0; i < num_vec_src; i++)
478             vec_comps[i] = alu->src[i].src.ssa;
479 
480          for (unsigned i = 0; i < num_vec_src; i++)
481             rewrite_uses_to_var(b, vec_comps[i], s->position_value_var, store_pos_component + i);
482       } else {
483          rewrite_uses_to_var(b, store_val, s->position_value_var, store_pos_component);
484       }
485    } else {
486       rewrite_uses_to_var(b, store_val, s->position_value_var, store_pos_component);
487    }
488 
489    return true;
490 }
491 
492 static void
493 remove_extra_pos_outputs(nir_shader *shader, lower_ngg_nogs_state *s)
494 {
495    nir_shader_intrinsics_pass(shader, remove_extra_pos_output,
496                               nir_metadata_control_flow, s);
497 }
498 
499 static bool
500 remove_compacted_arg(lower_ngg_nogs_state *s, nir_builder *b, unsigned idx)
501 {
502    nir_instr *store_instr = s->compact_arg_stores[idx];
503    if (!store_instr)
504       return false;
505 
506    /* Simply remove the store. */
507    nir_instr_remove(store_instr);
508 
509    /* Find the intrinsic that overwrites the shader arguments,
510     * and change its corresponding source.
511     * This will cause NIR's DCE to recognize the load and its phis as dead.
512     */
513    b->cursor = nir_before_instr(&s->overwrite_args->instr);
514    nir_def *undef_arg = nir_undef(b, 1, 32);
515    nir_def_rewrite_uses(s->overwrite_args->src[idx].ssa, undef_arg);
516 
517    s->compact_arg_stores[idx] = NULL;
518    return true;
519 }
520 
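/* Determine which repacked shader arguments are still needed after DCE and
 * remove the compaction stores of the unused ones, so that the corresponding
 * loads and phis can be eliminated as well.
 */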
521 static bool
522 cleanup_culling_shader_after_dce(nir_shader *shader,
523                                  nir_function_impl *function_impl,
524                                  lower_ngg_nogs_state *s)
525 {
526    bool uses_vs_vertex_id = false;
527    bool uses_vs_instance_id = false;
528    bool uses_tes_u = false;
529    bool uses_tes_v = false;
530    bool uses_tes_rel_patch_id = false;
531    bool uses_tes_patch_id = false;
532 
533    bool progress = false;
534    nir_builder b = nir_builder_create(function_impl);
535 
536    nir_foreach_block_reverse_safe(block, function_impl) {
537       nir_foreach_instr_reverse_safe(instr, block) {
538          if (instr->type != nir_instr_type_intrinsic)
539             continue;
540 
541          nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
542 
543          switch (intrin->intrinsic) {
544          case nir_intrinsic_sendmsg_amd:
545             goto cleanup_culling_shader_after_dce_done;
546          case nir_intrinsic_load_vertex_id:
547          case nir_intrinsic_load_vertex_id_zero_base:
548             uses_vs_vertex_id = true;
549             break;
550          case nir_intrinsic_load_instance_id:
551             uses_vs_instance_id = true;
552             break;
553          case nir_intrinsic_load_input: {
554             const nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
555             if (s->options->instance_rate_inputs & BITFIELD_BIT(io_sem.location))
556                uses_vs_instance_id = true;
557             else
558                uses_vs_vertex_id = true;
559             break;
560          }
561          case nir_intrinsic_load_tess_coord:
562             uses_tes_u = uses_tes_v = true;
563             break;
564          case nir_intrinsic_load_tess_rel_patch_id_amd:
565             uses_tes_rel_patch_id = true;
566             break;
567          case nir_intrinsic_load_primitive_id:
568             if (shader->info.stage == MESA_SHADER_TESS_EVAL)
569                uses_tes_patch_id = true;
570             break;
571          default:
572             break;
573          }
574       }
575    }
576 
577    cleanup_culling_shader_after_dce_done:
578 
579    if (shader->info.stage == MESA_SHADER_VERTEX) {
580       if (!uses_vs_vertex_id)
581          progress |= remove_compacted_arg(s, &b, 0);
582       if (!uses_vs_instance_id)
583          progress |= remove_compacted_arg(s, &b, 1);
584    } else if (shader->info.stage == MESA_SHADER_TESS_EVAL) {
585       if (!uses_tes_u)
586          progress |= remove_compacted_arg(s, &b, 0);
587       if (!uses_tes_v)
588          progress |= remove_compacted_arg(s, &b, 1);
589       if (!uses_tes_rel_patch_id)
590          progress |= remove_compacted_arg(s, &b, 3);
591       if (!uses_tes_patch_id)
592          progress |= remove_compacted_arg(s, &b, 2);
593    }
594 
595    return progress;
596 }
597 
598 /**
599  * Perform vertex compaction after culling.
600  *
601  * 1. Repack surviving ES invocations (this determines which lane will export which vertex)
602  * 2. Surviving ES vertex invocations store their data to LDS
603  * 3. Emit GS_ALLOC_REQ
604  * 4. Repacked invocations load the vertex data from LDS
605  * 5. GS threads update their vertex indices
606  * 6. Optionally, do the same for primitives.
607  */
608 static void
609 compact_vertices_after_culling(nir_builder *b,
610                                lower_ngg_nogs_state *s,
611                                nir_variable **repacked_variables,
612                                nir_variable **gs_vtxaddr_vars,
613                                nir_def *invocation_index,
614                                nir_def *es_vertex_lds_addr,
615                                nir_def *es_exporter_tid,
616                                nir_def *num_live_vertices_in_workgroup,
617                                nir_def *gs_exporter_tid,
618                                nir_def *num_live_primitives_in_workgroup,
619                                unsigned pervertex_lds_bytes,
620                                unsigned num_repacked_variables)
621 {
622    nir_if *if_es_accepted = nir_push_if(b, nir_load_var(b, s->es_accepted_var));
623    {
624       nir_def *exporter_addr = pervertex_lds_addr(b, es_exporter_tid, pervertex_lds_bytes);
625 
626       /* Store the exporter thread's index to the LDS space of the current thread so GS threads can load it */
627       nir_store_shared(b, nir_u2u8(b, es_exporter_tid), es_vertex_lds_addr, .base = lds_es_exporter_tid);
628 
629       /* Store the current thread's position output to the exporter thread's LDS space */
630       nir_def *pos = nir_load_var(b, s->position_value_var);
631       nir_store_shared(b, pos, exporter_addr, .base = lds_es_pos_x);
632 
633       /* Store the current thread's repackable arguments to the exporter thread's LDS space */
634       for (unsigned i = 0; i < num_repacked_variables; ++i) {
635          nir_def *arg_val = nir_load_var(b, repacked_variables[i]);
636          nir_intrinsic_instr *store = nir_store_shared(b, arg_val, exporter_addr, .base = lds_es_arg_0 + 4u * i);
637 
638          s->compact_arg_stores[i] = &store->instr;
639       }
640 
641       /* The TES relative patch ID doesn't cost an extra dword because it fits into a byte. */
642       if (b->shader->info.stage == MESA_SHADER_TESS_EVAL) {
643          nir_def *arg_val = nir_load_var(b, s->repacked_rel_patch_id);
644          nir_intrinsic_instr *store =
645             nir_store_shared(b, nir_u2u8(b, arg_val), exporter_addr,
646                              .base = lds_es_tes_rel_patch_id);
647 
648          s->compact_arg_stores[3] = &store->instr;
649       }
650    }
651    nir_pop_if(b, if_es_accepted);
652 
653    /* TODO: Consider adding a shortcut exit.
654     * Waves that have no vertices and primitives left can s_endpgm right here.
655     */
656 
657    nir_barrier(b, .execution_scope=SCOPE_WORKGROUP, .memory_scope=SCOPE_WORKGROUP,
658                          .memory_semantics=NIR_MEMORY_ACQ_REL, .memory_modes=nir_var_mem_shared);
659 
660    nir_def *es_survived = nir_ilt(b, invocation_index, num_live_vertices_in_workgroup);
661    nir_if *if_packed_es_thread = nir_push_if(b, es_survived);
662    {
663       /* Read position from the current ES thread's LDS space (written by the exported vertex's ES thread) */
664       nir_def *exported_pos = nir_load_shared(b, 4, 32, es_vertex_lds_addr, .base = lds_es_pos_x);
665       nir_store_var(b, s->position_value_var, exported_pos, 0xfu);
666 
667       /* Read the repacked arguments */
668       for (unsigned i = 0; i < num_repacked_variables; ++i) {
669          nir_def *arg_val = nir_load_shared(b, 1, 32, es_vertex_lds_addr, .base = lds_es_arg_0 + 4u * i);
670          nir_store_var(b, repacked_variables[i], arg_val, 0x1u);
671       }
672 
673       if (b->shader->info.stage == MESA_SHADER_TESS_EVAL) {
674          nir_def *arg_val = nir_load_shared(b, 1, 8, es_vertex_lds_addr,
675                                                 .base = lds_es_tes_rel_patch_id);
676          nir_store_var(b, s->repacked_rel_patch_id, nir_u2u32(b, arg_val), 0x1u);
677       }
678    }
679    nir_push_else(b, if_packed_es_thread);
680    {
681       nir_store_var(b, s->position_value_var, nir_undef(b, 4, 32), 0xfu);
682       for (unsigned i = 0; i < num_repacked_variables; ++i)
683          nir_store_var(b, repacked_variables[i], nir_undef(b, 1, 32), 0x1u);
684    }
685    nir_pop_if(b, if_packed_es_thread);
686 
687    nir_def *gs_accepted = nir_load_var(b, s->gs_accepted_var);
688    nir_if *if_gs_accepted = nir_push_if(b, gs_accepted);
689    {
690       nir_def *exporter_vtx_indices[3] = {0};
691 
692       /* Load the index of the ES threads that will export the current GS thread's vertices */
693       for (unsigned v = 0; v < s->options->num_vertices_per_primitive; ++v) {
694          nir_def *vtx_addr = nir_load_var(b, gs_vtxaddr_vars[v]);
695          nir_def *exporter_vtx_idx = nir_load_shared(b, 1, 8, vtx_addr, .base = lds_es_exporter_tid);
696          exporter_vtx_indices[v] = nir_u2u32(b, exporter_vtx_idx);
697          nir_store_var(b, s->gs_vtx_indices_vars[v], exporter_vtx_indices[v], 0x1);
698       }
699 
700       nir_def *prim_exp_arg =
701          ac_nir_pack_ngg_prim_exp_arg(b, s->options->num_vertices_per_primitive,
702                                     exporter_vtx_indices, NULL, s->options->hw_info->gfx_level);
703       nir_store_var(b, s->prim_exp_arg_var, prim_exp_arg, 0x1u);
704    }
705    nir_pop_if(b, if_gs_accepted);
706 
707    nir_store_var(b, s->es_accepted_var, es_survived, 0x1u);
708 
709    if (s->options->compact_primitives) {
710       /* For primitive compaction, re-use the same LDS space that we used for
711        * vertex compaction, so we need to wait until vertex threads are finished reading it.
712        * Considering we only need 1 DWORD per primitive, let's assume we always have enough space,
713        * since vertex compaction requires at least 5 DWORDs per vertex.
714        */
715       nir_barrier(b, .execution_scope=SCOPE_WORKGROUP, .memory_scope=SCOPE_WORKGROUP,
716                      .memory_semantics=NIR_MEMORY_ACQ_REL, .memory_modes=nir_var_mem_shared);
717 
718       if_gs_accepted = nir_push_if(b, gs_accepted);
719       {
720          nir_def *exporter_addr = pervertex_lds_addr(b, gs_exporter_tid, pervertex_lds_bytes);
721          nir_def *prim_exp_arg = nir_load_var(b, s->prim_exp_arg_var);
722 
723          /* Store the primitive export argument into the address of the exporter thread. */
724          nir_store_shared(b, prim_exp_arg, exporter_addr, .base = lds_es_pos_x);
725       }
726       nir_pop_if(b, if_gs_accepted);
727 
728       nir_barrier(b, .execution_scope=SCOPE_WORKGROUP, .memory_scope=SCOPE_WORKGROUP,
729                      .memory_semantics=NIR_MEMORY_ACQ_REL, .memory_modes=nir_var_mem_shared);
730 
731       nir_def *gs_survived = nir_ilt(b, invocation_index, num_live_primitives_in_workgroup);
732       nir_if *if_packed_gs_thread = nir_push_if(b, gs_survived);
733       {
734          /* Load the primitive export argument that the current thread will export. */
735          nir_def *prim_exp_arg = nir_load_shared(b, 1, 32, es_vertex_lds_addr, .base = lds_es_pos_x);
736 
737          nir_store_var(b, s->prim_exp_arg_var, prim_exp_arg, 0x1u);
738       }
739       nir_push_else(b, if_packed_gs_thread);
740       {
741          nir_store_var(b, s->prim_exp_arg_var, nir_undef(b, 1, 32), 0x1u);
742       }
743       nir_pop_if(b, if_packed_gs_thread);
744 
745       nir_store_var(b, s->gs_accepted_var, gs_survived, 0x1u);
746       nir_store_var(b, s->gs_exported_var, gs_survived, 0x1u);
747    }
748 }
749 
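/* Walk the use-def graph upwards from an output store and tag each visited
 * instruction with whether it feeds the position output, other outputs, or
 * both. VS input loads reached by the walk are recorded in
 * inputs_needed_by_pos / inputs_needed_by_others.
 */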
750 static void
751 analyze_shader_before_culling_walk(nir_def *ssa,
752                                    uint8_t flag,
753                                    lower_ngg_nogs_state *s)
754 {
755    nir_instr *instr = ssa->parent_instr;
756    uint8_t old_pass_flags = instr->pass_flags;
757    instr->pass_flags |= flag;
758 
759    if (instr->pass_flags == old_pass_flags)
760       return; /* Already visited. */
761 
762    switch (instr->type) {
763    case nir_instr_type_intrinsic: {
764       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
765 
766       /* VS input loads and SSBO loads are actually VRAM reads on AMD HW. */
767       switch (intrin->intrinsic) {
768       case nir_intrinsic_load_input: {
769          nir_io_semantics in_io_sem = nir_intrinsic_io_semantics(intrin);
770          uint64_t in_mask = UINT64_C(1) << (uint64_t) in_io_sem.location;
771          if (instr->pass_flags & nggc_passflag_used_by_pos)
772             s->inputs_needed_by_pos |= in_mask;
773          else if (instr->pass_flags & nggc_passflag_used_by_other)
774             s->inputs_needed_by_others |= in_mask;
775          break;
776       }
777       default:
778          break;
779       }
780 
781       break;
782    }
783    case nir_instr_type_alu: {
784       nir_alu_instr *alu = nir_instr_as_alu(instr);
785       unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
786 
787       for (unsigned i = 0; i < num_srcs; ++i) {
788          analyze_shader_before_culling_walk(alu->src[i].src.ssa, flag, s);
789       }
790 
791       break;
792    }
793    case nir_instr_type_tex: {
794       nir_tex_instr *tex = nir_instr_as_tex(instr);
795       unsigned num_srcs = tex->num_srcs;
796 
797       for (unsigned i = 0; i < num_srcs; ++i) {
798          analyze_shader_before_culling_walk(tex->src[i].src.ssa, flag, s);
799       }
800 
801       break;
802    }
803    case nir_instr_type_phi: {
804       nir_phi_instr *phi = nir_instr_as_phi(instr);
805       nir_foreach_phi_src_safe(phi_src, phi) {
806          analyze_shader_before_culling_walk(phi_src->src.ssa, flag, s);
807       }
808 
809       break;
810    }
811    default:
812       break;
813    }
814 }
815 
816 static void
817 analyze_shader_before_culling(nir_shader *shader, lower_ngg_nogs_state *s)
818 {
819    /* We need divergence info for culling shaders. */
820    nir_divergence_analysis(shader);
821 
822    nir_foreach_function_impl(impl, shader) {
823       nir_foreach_block(block, impl) {
824          nir_foreach_instr(instr, block) {
825             instr->pass_flags = 0;
826 
827             if (instr->type != nir_instr_type_intrinsic)
828                continue;
829 
830             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
831             if (intrin->intrinsic != nir_intrinsic_store_output)
832                continue;
833 
834             nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
835             nir_def *store_val = intrin->src[0].ssa;
836             uint8_t flag = io_sem.location == VARYING_SLOT_POS ? nggc_passflag_used_by_pos : nggc_passflag_used_by_other;
837             analyze_shader_before_culling_walk(store_val, flag, s);
838          }
839       }
840    }
841 }
842 
843 static nir_def *
844 find_reusable_ssa_def(nir_instr *instr)
845 {
846    /* Find instructions whose SSA definitions are used by both
847     * the top and bottom parts of the shader (before and after culling).
848     * Only in this case, it makes sense for the bottom part
849     * to try to reuse these from the top part.
850     */
851    if ((instr->pass_flags & nggc_passflag_used_by_both) != nggc_passflag_used_by_both)
852       return NULL;
853 
854    switch (instr->type) {
855    case nir_instr_type_alu: {
856       nir_alu_instr *alu = nir_instr_as_alu(instr);
857       if (alu->def.divergent)
858          return NULL;
859       /* Ignore uniform floats because they regress VGPR usage too much */
860       if (nir_op_infos[alu->op].output_type & nir_type_float)
861          return NULL;
862       return &alu->def;
863    }
864    case nir_instr_type_intrinsic: {
865       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
866       if (!nir_intrinsic_can_reorder(intrin) ||
867             !nir_intrinsic_infos[intrin->intrinsic].has_dest ||
868             intrin->def.divergent)
869          return NULL;
870       return &intrin->def;
871    }
872    case nir_instr_type_phi: {
873       nir_phi_instr *phi = nir_instr_as_phi(instr);
874       if (phi->def.divergent)
875          return NULL;
876       return &phi->def;
877    }
878    default:
879       return NULL;
880    }
881 }
882 
883 static const struct glsl_type *
884 glsl_uint_type_for_ssa(nir_def *ssa)
885 {
886    enum glsl_base_type base_type = GLSL_TYPE_UINT;
887    switch (ssa->bit_size) {
888    case 8: base_type = GLSL_TYPE_UINT8; break;
889    case 16: base_type = GLSL_TYPE_UINT16; break;
890    case 32: base_type = GLSL_TYPE_UINT; break;
891    case 64: base_type = GLSL_TYPE_UINT64; break;
892    default: return NULL;
893    }
894 
895    return ssa->num_components == 1
896           ? glsl_scalar_type(base_type)
897           : glsl_vector_type(base_type, ssa->num_components);
898 }
899 
900 /**
901  * Save the reusable SSA definitions to variables so that the
902  * bottom shader part can reuse them from the top part.
903  *
904  * 1. We create a new function temporary variable for reusables,
905  *    and insert a store+load.
906  * 2. The shader is cloned (the top part is created), then the
907  *    control flow is reinserted (for the bottom part.)
908  * 3. For reusables, we delete the variable stores from the
909  *    bottom part. This will make them use the variables from
910  *    the top part and DCE the redundant instructions.
911  */
912 static void
913 save_reusable_variables(nir_builder *b, lower_ngg_nogs_state *s)
914 {
915    ASSERTED int vec_ok = u_vector_init(&s->reusable_nondeferred_variables, 4, sizeof(reusable_nondeferred_variable));
916    assert(vec_ok);
917 
918    /* Upper limit on reusable uniforms in order to reduce SGPR spilling. */
919    unsigned remaining_reusable_uniforms = 48;
920 
921    nir_block *block = nir_start_block(b->impl);
922    while (block) {
923       /* Process the instructions in the current block. */
924       nir_foreach_instr_safe(instr, block) {
925          /* Determine if we can reuse the current SSA value.
926           * When vertex compaction is used, it is possible that the same shader invocation
927           * processes a different vertex in the top and bottom part of the shader.
928           * Therefore, we only reuse uniform values.
929           */
930          nir_def *ssa = find_reusable_ssa_def(instr);
931          if (!ssa)
932             continue;
933 
934          /* Determine a suitable type for the SSA value. */
935          const struct glsl_type *t = glsl_uint_type_for_ssa(ssa);
936          if (!t)
937             continue;
938 
939          if (!ssa->divergent) {
940             if (remaining_reusable_uniforms < ssa->num_components)
941                continue;
942 
943             remaining_reusable_uniforms -= ssa->num_components;
944          }
945 
946          reusable_nondeferred_variable *saved = (reusable_nondeferred_variable *) u_vector_add(&s->reusable_nondeferred_variables);
947          assert(saved);
948 
949          /* Create a new NIR variable where we store the reusable value.
950           * Then, we reload the variable and replace the uses of the value
951           * with the reloaded variable.
952           */
953          saved->var = nir_local_variable_create(b->impl, t, NULL);
954          saved->ssa = ssa;
955 
956          b->cursor = instr->type == nir_instr_type_phi
957                      ? nir_after_instr_and_phis(instr)
958                      : nir_after_instr(instr);
959          nir_store_var(b, saved->var, saved->ssa, BITFIELD_MASK(ssa->num_components));
960          nir_def *reloaded = nir_load_var(b, saved->var);
961          nir_def_rewrite_uses_after(ssa, reloaded, reloaded->parent_instr);
962       }
963 
964       /* Look at the next CF node. */
965       nir_cf_node *next_cf_node = nir_cf_node_next(&block->cf_node);
966       if (next_cf_node) {
967          /* It makes no sense to try to reuse things from within loops. */
968          bool next_is_loop = next_cf_node->type == nir_cf_node_loop;
969 
970          /* Don't reuse if we're in divergent control flow.
971           *
972           * Thanks to vertex repacking, the same shader invocation may process a different vertex
973           * in the top and bottom part, and it's even possible that this different vertex was initially
974           * processed in a different wave. So the two parts may take a different divergent code path.
975           * Therefore, these variables in divergent control flow may stay undefined.
976           *
977           * Note that this problem doesn't exist if vertices are not repacked or if the
978           * workgroup only has a single wave.
979           */
980          bool next_is_divergent_if =
981             next_cf_node->type == nir_cf_node_if &&
982             nir_src_is_divergent(&nir_cf_node_as_if(next_cf_node)->condition);
983 
984          if (next_is_loop || next_is_divergent_if) {
985             block = nir_cf_node_cf_tree_next(next_cf_node);
986             continue;
987          }
988       }
989 
990       /* Go to the next block. */
991       block = nir_block_cf_tree_next(block);
992    }
993 }
994 
995 /**
996  * Reuses suitable variables from the top part of the shader,
997  * by deleting their stores from the bottom part.
998  */
999 static void
1000 apply_reusable_variables(nir_builder *b, lower_ngg_nogs_state *s)
1001 {
1002    if (!u_vector_length(&s->reusable_nondeferred_variables)) {
1003       u_vector_finish(&s->reusable_nondeferred_variables);
1004       return;
1005    }
1006 
1007    nir_foreach_block_reverse_safe(block, b->impl) {
1008       nir_foreach_instr_reverse_safe(instr, block) {
1009          if (instr->type != nir_instr_type_intrinsic)
1010             continue;
1011          nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1012 
1013          /* When we find this intrinsic, it means we reached
1014           * the top part and we must stop.
1015           */
1016          if (intrin->intrinsic == nir_intrinsic_sendmsg_amd)
1017             goto done;
1018 
1019          if (intrin->intrinsic != nir_intrinsic_store_deref)
1020             continue;
1021          nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
1022          if (deref->deref_type != nir_deref_type_var)
1023             continue;
1024 
1025          reusable_nondeferred_variable *saved;
1026          u_vector_foreach(saved, &s->reusable_nondeferred_variables) {
1027             if (saved->var == deref->var) {
1028                nir_instr_remove(instr);
1029             }
1030          }
1031       }
1032    }
1033 
1034    done:
1035    u_vector_finish(&s->reusable_nondeferred_variables);
1036 }
1037 
1038 static void
1039 cull_primitive_accepted(nir_builder *b, void *state)
1040 {
1041    lower_ngg_nogs_state *s = (lower_ngg_nogs_state *)state;
1042 
1043    nir_store_var(b, s->gs_accepted_var, nir_imm_true(b), 0x1u);
1044 
1045    /* Store the accepted state to LDS for ES threads */
1046    for (unsigned vtx = 0; vtx < s->options->num_vertices_per_primitive; ++vtx)
1047       nir_store_shared(b, nir_imm_intN_t(b, 1, 8), s->vtx_addr[vtx], .base = lds_es_vertex_accepted);
1048 }
1049 
1050 static void
1051 clipdist_culling_es_part(nir_builder *b, lower_ngg_nogs_state *s,
1052                          nir_def *es_vertex_lds_addr)
1053 {
1054    /* No gl_ClipDistance is written, but user-defined clip planes are enabled. */
1055    if (s->options->user_clip_plane_enable_mask && !s->has_clipdist) {
1056       /* use gl_ClipVertex if defined */
1057       nir_variable *clip_vertex_var =
1058          b->shader->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_VERTEX) ?
1059          s->clip_vertex_var : s->position_value_var;
1060       nir_def *clip_vertex = nir_load_var(b, clip_vertex_var);
1061 
1062       /* clip against user defined clip planes */
1063       for (unsigned i = 0; i < 8; i++) {
1064          if (!(s->options->user_clip_plane_enable_mask & BITFIELD_BIT(i)))
1065             continue;
1066 
1067          nir_def *plane = nir_load_user_clip_plane(b, .ucp_id = i);
1068          nir_def *dist = nir_fdot(b, clip_vertex, plane);
1069          add_clipdist_bit(b, dist, i, s->clipdist_neg_mask_var);
1070       }
1071 
1072       s->has_clipdist = true;
1073    }
1074 
1075    /* Store clipdist_neg_mask to LDS for culling later in the GS threads. */
1076    if (s->has_clipdist) {
1077       nir_def *mask = nir_load_var(b, s->clipdist_neg_mask_var);
1078       nir_store_shared(b, nir_u2u8(b, mask), es_vertex_lds_addr,
1079                        .base = lds_es_clipdist_neg_mask);
1080    }
1081 }
1082 
1083 static unsigned
1084 ngg_nogs_get_culling_pervertex_lds_size(gl_shader_stage stage,
1085                                         bool uses_instance_id,
1086                                         bool uses_primitive_id,
1087                                         unsigned *num_repacked_variables)
1088 {
1089    /* Culling shaders must repack some variables because
1090     * the same shader invocation may process different vertices
1091     * before and after the culling algorithm.
1092     */
1093 
1094    unsigned num_repacked;
1095    if (stage == MESA_SHADER_VERTEX) {
1096       /* Vertex shaders repack:
1097        * - Vertex ID
1098        * - Instance ID (only if used)
1099        */
1100       num_repacked = uses_instance_id ? 2 : 1;
1101    } else {
1102       /* Tess eval shaders repack:
1103        * - U, V coordinates
1104        * - primitive ID (aka. patch id, only if used)
1105        * - relative patch id (not included here because doesn't need a dword)
1106        */
1107       assert(stage == MESA_SHADER_TESS_EVAL);
1108       num_repacked = uses_primitive_id ? 3 : 2;
1109    }
1110 
1111    if (num_repacked_variables)
1112       *num_repacked_variables = num_repacked;
1113 
1114    /* Pad to an odd number of dwords to reduce LDS bank conflicts. */
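   /* For example, a VS that doesn't use InstanceID repacks one dword,
    * giving (lds_es_arg_0 + 1 * 4) | 4 = 28 bytes (7 dwords) per vertex.
    */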
1115    return (lds_es_arg_0 + num_repacked * 4u) | 4u;
1116 }
1117 
1118 static void
1119 add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_cf, lower_ngg_nogs_state *s)
1120 {
1121    bool uses_instance_id = BITSET_TEST(b->shader->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
1122    bool uses_tess_primitive_id = BITSET_TEST(b->shader->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
1123 
1124    unsigned num_repacked_variables;
1125    unsigned pervertex_lds_bytes =
1126       ngg_nogs_get_culling_pervertex_lds_size(b->shader->info.stage,
1127                                               uses_instance_id,
1128                                               uses_tess_primitive_id,
1129                                               &num_repacked_variables);
1130 
1131    nir_function_impl *impl = nir_shader_get_entrypoint(b->shader);
1132 
1133    /* Create some helper variables. */
1134    nir_variable *gs_vtxaddr_vars[3] = {
1135       nir_local_variable_create(impl, glsl_uint_type(), "gs_vtx0_addr"),
1136       nir_local_variable_create(impl, glsl_uint_type(), "gs_vtx1_addr"),
1137       nir_local_variable_create(impl, glsl_uint_type(), "gs_vtx2_addr"),
1138    };
1139 
1140    nir_variable *repacked_variables[3] = {
1141       nir_local_variable_create(impl, glsl_uint_type(), "repacked_var_0"),
1142       nir_local_variable_create(impl, glsl_uint_type(), "repacked_var_1"),
1143       nir_local_variable_create(impl, glsl_uint_type(), "repacked_var_2"),
1144    };
1145 
1146    /* Relative patch ID is a special case because it doesn't need an extra dword, so repack it separately. */
1147    s->repacked_rel_patch_id = nir_local_variable_create(impl, glsl_uint_type(), "repacked_rel_patch_id");
1148 
1149    if (s->options->clip_cull_dist_mask ||
1150        s->options->user_clip_plane_enable_mask) {
1151       s->clip_vertex_var =
1152          nir_local_variable_create(impl, glsl_vec4_type(), "clip_vertex");
1153       s->clipdist_neg_mask_var =
1154          nir_local_variable_create(impl, glsl_uint_type(), "clipdist_neg_mask");
1155 
1156       /* init mask to 0 */
1157       nir_store_var(b, s->clipdist_neg_mask_var, nir_imm_int(b, 0), 1);
1158    }
1159 
1160    /* Top part of the culling shader (aka. position shader part)
1161     *
1162     * We clone the full ES shader and emit it here, but we only really care
1163     * about its position output, so we delete every other output from this part.
1164     * The position output is stored into a temporary variable, and reloaded later.
1165     */
1166 
1167    nir_def *es_thread = has_input_vertex(b);
1168    nir_if *if_es_thread = nir_push_if(b, es_thread);
1169    {
1170       /* Initialize the position output variable, in case not all VS/TES invocations store the output.
1171        * The spec doesn't require a default value, but we use (0, 0, 0, 1) because some games rely on that.
1172        */
1173       nir_store_var(b, s->position_value_var, nir_imm_vec4(b, 0.0f, 0.0f, 0.0f, 1.0f), 0xfu);
1174 
1175       /* Now reinsert a clone of the shader code */
1176       struct hash_table *remap_table = _mesa_pointer_hash_table_create(NULL);
1177       nir_cf_list_clone_and_reinsert(original_extracted_cf, &if_es_thread->cf_node, b->cursor, remap_table);
1178       _mesa_hash_table_destroy(remap_table, NULL);
1179       b->cursor = nir_after_cf_list(&if_es_thread->then_list);
1180 
1181       /* Remember the current thread's shader arguments */
1182       if (b->shader->info.stage == MESA_SHADER_VERTEX) {
1183          nir_store_var(b, repacked_variables[0], nir_load_vertex_id_zero_base(b), 0x1u);
1184          if (uses_instance_id)
1185             nir_store_var(b, repacked_variables[1], nir_load_instance_id(b), 0x1u);
1186       } else if (b->shader->info.stage == MESA_SHADER_TESS_EVAL) {
1187          nir_store_var(b, s->repacked_rel_patch_id, nir_load_tess_rel_patch_id_amd(b), 0x1u);
1188          nir_def *tess_coord = nir_load_tess_coord(b);
1189          nir_store_var(b, repacked_variables[0], nir_channel(b, tess_coord, 0), 0x1u);
1190          nir_store_var(b, repacked_variables[1], nir_channel(b, tess_coord, 1), 0x1u);
1191          if (uses_tess_primitive_id)
1192             nir_store_var(b, repacked_variables[2], nir_load_primitive_id(b), 0x1u);
1193       } else {
1194          unreachable("Should be VS or TES.");
1195       }
1196    }
1197    nir_pop_if(b, if_es_thread);
1198 
1199    nir_store_var(b, s->es_accepted_var, es_thread, 0x1u);
1200    nir_def *gs_thread = has_input_primitive(b);
1201    nir_store_var(b, s->gs_accepted_var, gs_thread, 0x1u);
1202 
1203    /* Remove all non-position outputs, and put the position output into the variable. */
1204    nir_metadata_preserve(impl, nir_metadata_none);
1205    remove_culling_shader_outputs(b->shader, s);
1206    b->cursor = nir_after_impl(impl);
1207 
1208    nir_def *lds_scratch_base = nir_load_lds_ngg_scratch_base_amd(b);
1209 
1210    /* Run culling algorithms if culling is enabled.
1211     *
1212     * NGG culling can be enabled or disabled at runtime.
1213     * This is determined by an SGPR shader argument which is accessed
1214     * by the following NIR intrinsic.
1215     */
1216 
1217    nir_if *if_cull_en = nir_push_if(b, nir_load_cull_any_enabled_amd(b));
1218    {
1219       nir_def *invocation_index = nir_load_local_invocation_index(b);
1220       nir_def *es_vertex_lds_addr = pervertex_lds_addr(b, invocation_index, pervertex_lds_bytes);
1221 
1222       /* ES invocations store their vertex data to LDS for GS threads to read. */
1223       if_es_thread = nir_push_if(b, es_thread);
1224       if_es_thread->control = nir_selection_control_divergent_always_taken;
1225       {
1226          /* Store position components that are relevant to culling in LDS */
1227          nir_def *pre_cull_pos = nir_load_var(b, s->position_value_var);
1228          nir_def *pre_cull_w = nir_channel(b, pre_cull_pos, 3);
1229          nir_store_shared(b, pre_cull_w, es_vertex_lds_addr, .base = lds_es_pos_w);
1230          nir_def *pre_cull_x_div_w = nir_fdiv(b, nir_channel(b, pre_cull_pos, 0), pre_cull_w);
1231          nir_def *pre_cull_y_div_w = nir_fdiv(b, nir_channel(b, pre_cull_pos, 1), pre_cull_w);
1232          nir_store_shared(b, nir_vec2(b, pre_cull_x_div_w, pre_cull_y_div_w), es_vertex_lds_addr, .base = lds_es_pos_x);
1233 
1234          /* Clear out the ES accepted flag in LDS */
1235          nir_store_shared(b, nir_imm_zero(b, 1, 8), es_vertex_lds_addr, .align_mul = 4, .base = lds_es_vertex_accepted);
1236 
1237          /* For clipdist culling */
1238          clipdist_culling_es_part(b, s, es_vertex_lds_addr);
1239       }
1240       nir_pop_if(b, if_es_thread);
1241 
1242       nir_barrier(b, .execution_scope=SCOPE_WORKGROUP, .memory_scope=SCOPE_WORKGROUP,
1243                             .memory_semantics=NIR_MEMORY_ACQ_REL, .memory_modes=nir_var_mem_shared);
1244 
1245       nir_store_var(b, s->gs_accepted_var, nir_imm_false(b), 0x1u);
1246       nir_store_var(b, s->prim_exp_arg_var, nir_imm_int(b, 1u << 31), 0x1u);
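      /* Bit 31 of the primitive export argument is the null primitive flag,
       * so primitives that aren't accepted by culling are exported as null.
       */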
1247 
1248       /* GS invocations load the vertex data and perform the culling. */
1249       nir_if *if_gs_thread = nir_push_if(b, gs_thread);
1250       {
1251          /* Load vertex indices from input VGPRs */
1252          nir_def *vtx_idx[3] = {0};
1253          for (unsigned vertex = 0; vertex < s->options->num_vertices_per_primitive;
1254               ++vertex)
1255             vtx_idx[vertex] = nir_load_var(b, s->gs_vtx_indices_vars[vertex]);
1256 
1257          nir_def *pos[3][4] = {0};
1258 
1259          /* Load W positions of vertices first because the culling code will use these first */
1260          for (unsigned vtx = 0; vtx < s->options->num_vertices_per_primitive; ++vtx) {
1261             s->vtx_addr[vtx] = pervertex_lds_addr(b, vtx_idx[vtx], pervertex_lds_bytes);
1262             pos[vtx][3] = nir_load_shared(b, 1, 32, s->vtx_addr[vtx], .base = lds_es_pos_w);
1263             nir_store_var(b, gs_vtxaddr_vars[vtx], s->vtx_addr[vtx], 0x1u);
1264          }
1265 
1266          /* Load the X/W, Y/W positions of vertices */
1267          for (unsigned vtx = 0; vtx < s->options->num_vertices_per_primitive; ++vtx) {
1268             nir_def *xy = nir_load_shared(b, 2, 32, s->vtx_addr[vtx], .base = lds_es_pos_x);
1269             pos[vtx][0] = nir_channel(b, xy, 0);
1270             pos[vtx][1] = nir_channel(b, xy, 1);
1271          }
1272 
1273          nir_def *accepted_by_clipdist;
1274          if (s->has_clipdist) {
1275             nir_def *clipdist_neg_mask = nir_imm_intN_t(b, 0xff, 8);
1276             for (unsigned vtx = 0; vtx < s->options->num_vertices_per_primitive; ++vtx) {
1277                nir_def *mask =
1278                   nir_load_shared(b, 1, 8, s->vtx_addr[vtx],
1279                                   .base = lds_es_clipdist_neg_mask);
1280                clipdist_neg_mask = nir_iand(b, clipdist_neg_mask, mask);
1281             }
1282             /* The primitive is culled if there is a clip plane for which the clip distances of all vertices are negative. */
1283             accepted_by_clipdist = nir_ieq_imm(b, clipdist_neg_mask, 0);
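                  /* Example with 2 enabled clip planes: per-vertex masks 0b01, 0b11 and 0b01
                   * AND together to 0b01, meaning plane 0 rejects every vertex, so the
                   * primitive is culled. A zero result means no single plane rejects all
                   * vertices, so the primitive is kept.
                   */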
1284          } else {
1285             accepted_by_clipdist = nir_imm_true(b);
1286          }
1287 
1288          /* See if the current primitive is accepted */
1289          ac_nir_cull_primitive(b, accepted_by_clipdist, pos,
1290                                s->options->num_vertices_per_primitive,
1291                                cull_primitive_accepted, s);
1292       }
1293       nir_pop_if(b, if_gs_thread);
1294 
1295       nir_barrier(b, .execution_scope=SCOPE_WORKGROUP, .memory_scope=SCOPE_WORKGROUP,
1296                             .memory_semantics=NIR_MEMORY_ACQ_REL, .memory_modes=nir_var_mem_shared);
1297 
1298       nir_store_var(b, s->es_accepted_var, nir_imm_false(b), 0x1u);
1299 
1300       /* ES invocations load their accepted flag from LDS. */
1301       if_es_thread = nir_push_if(b, es_thread);
1302       if_es_thread->control = nir_selection_control_divergent_always_taken;
1303       {
1304          nir_def *accepted = nir_load_shared(b, 1, 8u, es_vertex_lds_addr, .base = lds_es_vertex_accepted, .align_mul = 4u);
1305          nir_def *accepted_bool = nir_ine_imm(b, nir_u2u32(b, accepted), 0);
1306          nir_store_var(b, s->es_accepted_var, accepted_bool, 0x1u);
1307       }
1308       nir_pop_if(b, if_es_thread);
1309 
1310       nir_def *es_accepted = nir_load_var(b, s->es_accepted_var);
1311       nir_def *gs_accepted = nir_load_var(b, s->gs_accepted_var);
1312 
1313       /* Repack the vertices (always) and primitives (optional) that survived the culling. */
1314       nir_def *accepted[] = { es_accepted, gs_accepted };
1315       ac_nir_wg_repack_result rep[2] = {0};
1316       const unsigned num_rep = s->options->compact_primitives ? 2 : 1;
1317       ac_nir_repack_invocations_in_workgroup(b, accepted, rep, num_rep, lds_scratch_base,
1318                                       s->max_num_waves, s->options->wave_size);
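           /* rep[i].repacked_invocation_index is effectively the exclusive prefix sum of the
            * accepted flags (the compacted index of each surviving invocation), and
            * rep[i].num_repacked_invocations is the number of survivors in the workgroup.
            * For example, if only lanes 0, 2 and 5 are accepted, their repacked indices
            * are 0, 1 and 2, and the repacked count is 3.
            */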
1319       nir_def *num_live_vertices_in_workgroup = rep[0].num_repacked_invocations;
1320       nir_def *es_exporter_tid = rep[0].repacked_invocation_index;
1321       nir_def *num_exported_prims = NULL;
1322       nir_def *gs_exporter_tid = NULL;
1323 
1324       if (s->options->compact_primitives) {
1325          num_exported_prims = rep[1].num_repacked_invocations;
1326          gs_exporter_tid = rep[1].repacked_invocation_index;
1327       } else {
1328          /* If all vertices are culled, set primitive count to 0 as well. */
1329          nir_def *fully_culled = nir_ieq_imm(b, num_live_vertices_in_workgroup, 0u);
1330          num_exported_prims = nir_bcsel(b, fully_culled, nir_imm_int(b, 0u), nir_load_workgroup_num_input_primitives_amd(b));
1331          nir_store_var(b, s->gs_exported_var, nir_iand(b, nir_inot(b, fully_culled), has_input_primitive(b)), 0x1u);
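              /* Without primitive compaction the primitive count stays at the input count,
               * and rejected primitives are presumably exported as null primitives later;
               * only the fully-culled case drops the count to 0 and disables the export.
               */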
1332       }
1333 
1334       nir_if *if_wave_0 = nir_push_if(b, nir_ieq_imm(b, nir_load_subgroup_id(b), 0));
1335       {
1336          ac_nir_ngg_alloc_vertices_and_primitives(b, num_live_vertices_in_workgroup, num_exported_prims, s->options->hw_info->has_ngg_fully_culled_bug);
1337       }
1338       nir_pop_if(b, if_wave_0);
1339 
1340       /* Vertex compaction. */
1341       compact_vertices_after_culling(b, s,
1342                                      repacked_variables, gs_vtxaddr_vars,
1343                                      invocation_index, es_vertex_lds_addr,
1344                                      es_exporter_tid, num_live_vertices_in_workgroup,
1345                                      gs_exporter_tid, num_exported_prims,
1346                                      pervertex_lds_bytes, num_repacked_variables);
1347    }
1348    nir_push_else(b, if_cull_en);
1349    {
1350       /* When culling is dynamically disabled, do the same thing as the non-culling path. */
1351       nir_if *if_wave_0 = nir_push_if(b, nir_ieq_imm(b, nir_load_subgroup_id(b), 0));
1352       {
1353          nir_def *vtx_cnt = nir_load_workgroup_num_input_vertices_amd(b);
1354          nir_def *prim_cnt = nir_load_workgroup_num_input_primitives_amd(b);
1355          ac_nir_ngg_alloc_vertices_and_primitives(b, vtx_cnt, prim_cnt, false);
1356       }
1357       nir_pop_if(b, if_wave_0);
1358       nir_store_var(b, s->prim_exp_arg_var, emit_ngg_nogs_prim_exp_arg(b, s), 0x1u);
1359    }
1360    nir_pop_if(b, if_cull_en);
1361 
1362    /* Update shader arguments.
1363     *
1364     * The registers which hold information about the subgroup's
1365     * vertices and primitives are updated here, so the rest of the shader
1366     * doesn't need to worry about the culling.
1367     *
1368     * These "overwrite" intrinsics must be emitted in top-level control flow,
1369     * otherwise they can confuse the backend (eg. ACO's SSA construction).
1370     *
1371     * TODO:
1372     * A cleaner solution would be to simply replace all usages of these args
1373     * with the load of the variables.
1374     * However, this wouldn't work right now because the backend uses the arguments
1375     * for purposes not expressed in NIR, eg. VS input loads, etc.
1376     * This can change if VS input loads and other stuff are lowered to eg. load_buffer_amd.
1377     */
1378 
1379    if (b->shader->info.stage == MESA_SHADER_VERTEX)
1380       s->overwrite_args =
1381          nir_overwrite_vs_arguments_amd(b,
1382             nir_load_var(b, repacked_variables[0]), nir_load_var(b, repacked_variables[1]));
1383    else if (b->shader->info.stage == MESA_SHADER_TESS_EVAL)
1384       s->overwrite_args =
1385          nir_overwrite_tes_arguments_amd(b,
1386             nir_load_var(b, repacked_variables[0]), nir_load_var(b, repacked_variables[1]),
1387             nir_load_var(b, repacked_variables[2]), nir_load_var(b, s->repacked_rel_patch_id));
1388    else
1389       unreachable("Should be VS or TES.");
1390 }
1391 
1392 static void
1393 ngg_nogs_store_edgeflag_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
1394 {
1395    if (!s->out.outputs[VARYING_SLOT_EDGE][0])
1396       return;
1397 
1398    /* clamp the user edge flag to 1 for later bit operations */
1399    nir_def *edgeflag = s->out.outputs[VARYING_SLOT_EDGE][0];
1400    edgeflag = nir_umin(b, edgeflag, nir_imm_int(b, 1));
1401 
1402    /* the user edge flag is stored at the beginning of the vertex's LDS space when streamout is not enabled */
1403    unsigned offset = 0;
1404    if (s->streamout_enabled) {
1405       unsigned packed_location =
1406          util_bitcount64(b->shader->info.outputs_written & BITFIELD64_MASK(VARYING_SLOT_EDGE));
1407       offset = packed_location * 16;
1408    }
1409 
1410    nir_def *tid = nir_load_local_invocation_index(b);
1411    nir_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes);
1412 
1413    nir_store_shared(b, edgeflag, addr, .base = offset);
1414 }
1415 
1416 static void
1417 ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
1418 {
1419    nir_xfb_info *info = ac_nir_get_sorted_xfb_info(b->shader);
1420 
1421    uint64_t xfb_outputs = 0;
1422    unsigned xfb_outputs_16bit = 0;
1423    uint8_t xfb_mask[VARYING_SLOT_MAX] = {0};
1424    uint8_t xfb_mask_16bit_lo[16] = {0};
1425    uint8_t xfb_mask_16bit_hi[16] = {0};
1426 
1427    /* Get XFB output mask for each slot. */
1428    for (int i = 0; i < info->output_count; i++) {
1429       nir_xfb_output_info *out = info->outputs + i;
1430 
1431       if (out->location < VARYING_SLOT_VAR0_16BIT) {
1432          xfb_outputs |= BITFIELD64_BIT(out->location);
1433          xfb_mask[out->location] |= out->component_mask;
1434       } else {
1435          unsigned index = out->location - VARYING_SLOT_VAR0_16BIT;
1436          xfb_outputs_16bit |= BITFIELD_BIT(index);
1437 
1438          if (out->high_16bits)
1439             xfb_mask_16bit_hi[index] |= out->component_mask;
1440          else
1441             xfb_mask_16bit_lo[index] |= out->component_mask;
1442       }
1443    }
1444 
1445    nir_def *tid = nir_load_local_invocation_index(b);
1446    nir_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes);
1447 
1448    u_foreach_bit64(slot, xfb_outputs) {
1449       uint64_t outputs_written = b->shader->info.outputs_written;
1450       if (s->skip_primitive_id)
1451          outputs_written &= ~VARYING_BIT_PRIMITIVE_ID;
1452       unsigned packed_location =
1453          util_bitcount64(outputs_written & BITFIELD64_MASK(slot));
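           /* LDS layout: each written output slot occupies 16 bytes (4 dwords), so component c
            * of this slot is stored at packed_location * 16 + c * 4, where packed_location is
            * the number of lower-numbered slots written by the shader.
            */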
1454 
1455       unsigned mask = xfb_mask[slot];
1456 
1457       /* Clear unused components. */
1458       for (unsigned i = 0; i < 4; i++) {
1459          if (!s->out.outputs[slot][i])
1460             mask &= ~BITFIELD_BIT(i);
1461       }
1462 
1463       while (mask) {
1464          int start, count;
1465          u_bit_scan_consecutive_range(&mask, &start, &count);
1466          /* Outputs here are guaranteed to be 32-bit.
1467           *
1468           * 64-bit outputs have already been lowered to two 32-bit components. As for 16-bit outputs:
1469           *   Vulkan does not allow streamout outputs smaller than 32 bits.
1470           *   OpenGL puts 16-bit outputs in VARYING_SLOT_VAR0_16BIT.
1471           */
1472          nir_def *store_val = nir_vec(b, &s->out.outputs[slot][start], (unsigned)count);
1473          nir_store_shared(b, store_val, addr, .base = packed_location * 16 + start * 4);
1474       }
1475    }
1476 
1477    unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written);
1478    u_foreach_bit64(slot, xfb_outputs_16bit) {
1479       unsigned packed_location = num_32bit_outputs +
1480          util_bitcount(b->shader->info.outputs_written_16bit & BITFIELD_MASK(slot));
1481 
1482       unsigned mask_lo = xfb_mask_16bit_lo[slot];
1483       unsigned mask_hi = xfb_mask_16bit_hi[slot];
1484 
1485       /* Clear unused components. */
1486       for (unsigned i = 0; i < 4; i++) {
1487          if (!s->out.outputs_16bit_lo[slot][i])
1488             mask_lo &= ~BITFIELD_BIT(i);
1489          if (!s->out.outputs_16bit_hi[slot][i])
1490             mask_hi &= ~BITFIELD_BIT(i);
1491       }
1492 
1493       nir_def **outputs_lo = s->out.outputs_16bit_lo[slot];
1494       nir_def **outputs_hi = s->out.outputs_16bit_hi[slot];
1495       nir_def *undef = nir_undef(b, 1, 16);
1496 
1497       unsigned mask = mask_lo | mask_hi;
1498       while (mask) {
1499          int start, count;
1500          u_bit_scan_consecutive_range(&mask, &start, &count);
1501 
1502          nir_def *values[4] = {0};
1503          for (int c = start; c < start + count; ++c) {
1504             nir_def *lo = mask_lo & BITFIELD_BIT(c) ? outputs_lo[c] : undef;
1505             nir_def *hi = mask_hi & BITFIELD_BIT(c) ? outputs_hi[c] : undef;
1506 
1507             /* pack the 16-bit low/high halves into a 32-bit value; 64-bit outputs have already been lowered */
1508             values[c - start] = nir_pack_32_2x16_split(b, lo, hi);
1509          }
1510 
1511          nir_def *store_val = nir_vec(b, values, (unsigned)count);
1512          nir_store_shared(b, store_val, addr, .base = packed_location * 16 + start * 4);
1513       }
1514    }
1515 }
1516 
1517 static void
1518 ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s)
1519 {
1520    nir_xfb_info *info = ac_nir_get_sorted_xfb_info(b->shader);
1521 
1522    nir_def *lds_scratch_base = nir_load_lds_ngg_scratch_base_amd(b);
1523 
1524    /* Get global buffer offset where this workgroup will stream out data to. */
1525    nir_def *generated_prim = nir_load_workgroup_num_input_primitives_amd(b);
1526    nir_def *gen_prim_per_stream[4] = {generated_prim, 0, 0, 0};
1527    nir_def *emit_prim_per_stream[4] = {0};
1528    nir_def *buffer_offsets[4] = {0};
1529    nir_def *so_buffer[4] = {0};
1530    nir_def *tid_in_tg = nir_load_local_invocation_index(b);
1531    ac_nir_ngg_build_streamout_buffer_info(b, info, s->options->hw_info->gfx_level, s->options->has_xfb_prim_query,
1532                                    s->options->use_gfx12_xfb_intrinsic, lds_scratch_base, tid_in_tg,
1533                                    gen_prim_per_stream,
1534                                    so_buffer, buffer_offsets,
1535                                    emit_prim_per_stream);
1536 
1537    /* Write out primitive data */
1538    nir_if *if_emit = nir_push_if(b, nir_ilt(b, tid_in_tg, emit_prim_per_stream[0]));
1539    {
1540       unsigned vtx_lds_stride = (b->shader->num_outputs * 4 + 1) * 4;
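           /* This must match the per-vertex stride computed by ngg_nogs_get_pervertex_lds_size
            * for the streamout case: one dword per output component plus one extra dword.
            */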
1541       nir_def *num_vert_per_prim = nir_load_num_vertices_per_primitive_amd(b);
1542       nir_def *first_vertex_idx = nir_imul(b, tid_in_tg, num_vert_per_prim);
1543 
1544       u_foreach_bit(buffer, info->buffers_written) {
1545          buffer_offsets[buffer] = nir_iadd(b, buffer_offsets[buffer],
1546                                            nir_imul_imm(b, first_vertex_idx,
1547                                                         info->buffers[buffer].stride));
1548       }
1549 
1550       for (unsigned i = 0; i < s->options->num_vertices_per_primitive; i++) {
1551          nir_if *if_valid_vertex =
1552             nir_push_if(b, nir_igt_imm(b, num_vert_per_prim, i));
1553          {
1554             nir_def *vtx_lds_idx = nir_load_var(b, s->gs_vtx_indices_vars[i]);
1555             nir_def *vtx_lds_addr = pervertex_lds_addr(b, vtx_lds_idx, vtx_lds_stride);
1556             ac_nir_ngg_build_streamout_vertex(b, info, 0, so_buffer, buffer_offsets, i,
1557                                        vtx_lds_addr, &s->out, s->skip_primitive_id);
1558          }
1559          nir_pop_if(b, if_valid_vertex);
1560       }
1561    }
1562    nir_pop_if(b, if_emit);
1563 
1564    /* Wait for streamout memory operations to finish before exporting the primitive,
1565     * otherwise they may not have completed by the time the shader ends.
1566     *
1567     * If a shader has no param exports, rasterization can start before
1568     * the shader finishes, and thus memory stores might not finish before
1569     * the pixel shader starts.
1570     *
1571     * TODO: this is only needed when there are no param exports.
1572     *
1573     * TODO: it's unclear whether this barrier is needed with late prim export,
1574     *       as no test failures have been observed without it.
1575     */
1576    nir_scoped_memory_barrier(b, SCOPE_DEVICE, NIR_MEMORY_RELEASE, nir_var_mem_ssbo);
1577 }
1578 
1579 static unsigned
1580 ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
1581                                 unsigned shader_num_outputs,
1582                                 bool streamout_enabled,
1583                                 bool export_prim_id,
1584                                 bool has_user_edgeflags)
1585 {
1586    unsigned pervertex_lds_bytes = 0;
1587 
1588    if (streamout_enabled) {
1589       /* The extra dword is used to avoid LDS bank conflicts and to store the primitive ID.
1590        * TODO: only allocate space for outputs that actually need streamout.
1591        */
1592       pervertex_lds_bytes = (shader_num_outputs * 4 + 1) * 4;
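           /* For example, with 8 output slots this is (8 * 4 + 1) * 4 = 132 bytes, i.e. 33 dwords
            * per vertex; the odd dword count staggers consecutive vertices across the 32 LDS banks.
            */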
1593    }
1594 
1595    bool need_prim_id_store_shared = export_prim_id && stage == MESA_SHADER_VERTEX;
1596    if (need_prim_id_store_shared || has_user_edgeflags) {
1597       unsigned size = 0;
1598       if (need_prim_id_store_shared)
1599          size += 4;
1600       if (has_user_edgeflags)
1601          size += 4;
1602 
1603       /* pad to odd dwords to avoid LDS bank conflict */
1604       size |= 4;
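           /* size is either 4 or 8 at this point, so OR-ing with 4 yields 4 or 12 bytes,
            * i.e. an odd number of dwords (1 or 3).
            */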
1605 
1606       pervertex_lds_bytes = MAX2(pervertex_lds_bytes, size);
1607    }
1608 
1609    return pervertex_lds_bytes;
1610 }
1611 
1612 static void
1613 ngg_nogs_gather_outputs(nir_builder *b, struct exec_list *cf_list, lower_ngg_nogs_state *s)
1614 {
1615    /* Assume:
1616     * - the shader used nir_lower_io_to_temporaries
1617     * - 64-bit outputs are lowered
1618     * - no indirect indexing is present
1619     */
1620    struct nir_cf_node *first_node =
1621       exec_node_data(nir_cf_node, exec_list_get_head(cf_list), node);
1622 
1623    for (nir_block *block = nir_cf_node_cf_tree_first(first_node); block != NULL;
1624         block = nir_block_cf_tree_next(block)) {
1625       nir_foreach_instr_safe (instr, block) {
1626          if (instr->type != nir_instr_type_intrinsic)
1627             continue;
1628 
1629          nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1630          if (intrin->intrinsic != nir_intrinsic_store_output)
1631             continue;
1632 
1633          ac_nir_gather_prerast_store_output_info(b, intrin, &s->out);
1634          nir_instr_remove(instr);
1635       }
1636    }
1637 }
1638 
1639 void
1640 ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *options)
1641 {
1642    nir_function_impl *impl = nir_shader_get_entrypoint(shader);
1643    assert(impl);
1644    assert(options->max_workgroup_size && options->wave_size);
1645    assert(!(options->can_cull && options->passthrough));
1646 
1647    nir_variable *position_value_var = nir_local_variable_create(impl, glsl_vec4_type(), "position_value");
1648    nir_variable *prim_exp_arg_var = nir_local_variable_create(impl, glsl_uint_type(), "prim_exp_arg");
1649    nir_variable *es_accepted_var =
1650       options->can_cull ? nir_local_variable_create(impl, glsl_bool_type(), "es_accepted") : NULL;
1651    nir_variable *gs_accepted_var =
1652       options->can_cull ? nir_local_variable_create(impl, glsl_bool_type(), "gs_accepted") : NULL;
1653    nir_variable *gs_exported_var = nir_local_variable_create(impl, glsl_bool_type(), "gs_exported");
1654 
1655    const bool wait_attr_ring = options->has_param_exports && options->hw_info->has_attr_ring_wait_bug;
1656    bool streamout_enabled = shader->xfb_info && !options->disable_streamout;
1657    bool has_user_edgeflags =
1658       options->use_edgeflags && (shader->info.outputs_written & VARYING_BIT_EDGE);
1659    /* Streamout needs to be done before either the prim or the vertex export, because
1660     * when there are no param exports, rasterization can start right after the prim and
1661     * vertex exports, which would leave the streamout buffer writes unfinished.
1662     *
1663     * Always use late prim export when user edge flags are enabled.
1664     * This is because edge flags are written by ES threads but they
1665     * are exported by GS threads as part of the primitive export.
1666     *
1667     * When the primitive ID output is configured as per-primitive
1668     * and the shader must wait for attribute ring stores before exports,
1669     * we must always use late primitive export.
1670     */
1671    const bool early_prim_export =
1672       options->early_prim_export && !(streamout_enabled || has_user_edgeflags) &&
1673       !(wait_attr_ring && options->export_primitive_id_per_prim);
1674 
1675    lower_ngg_nogs_state state = {
1676       .options = options,
1677       .early_prim_export = early_prim_export,
1678       .streamout_enabled = streamout_enabled,
1679       .position_value_var = position_value_var,
1680       .prim_exp_arg_var = prim_exp_arg_var,
1681       .es_accepted_var = es_accepted_var,
1682       .gs_accepted_var = gs_accepted_var,
1683       .gs_exported_var = gs_exported_var,
1684       .max_num_waves = DIV_ROUND_UP(options->max_workgroup_size, options->wave_size),
1685       .has_user_edgeflags = has_user_edgeflags,
1686       .skip_primitive_id = streamout_enabled && (options->export_primitive_id || options->export_primitive_id_per_prim),
1687    };
1688 
1689    /* Can't export the primitive ID both as per-vertex and per-primitive. */
1690    assert(!options->export_primitive_id || !options->export_primitive_id_per_prim);
1691 
1692    const bool need_prim_id_store_shared =
1693       options->export_primitive_id && shader->info.stage == MESA_SHADER_VERTEX;
1694 
1695    if (options->export_primitive_id) {
1696       shader->info.outputs_written |= VARYING_BIT_PRIMITIVE_ID;
1697    }
1698 
1699    if (options->export_primitive_id_per_prim) {
1700       /* The HW preloads the primitive ID to VGPRs of GS threads for VS, but not for TES. */
1701       assert(shader->info.stage == MESA_SHADER_VERTEX);
1702       assert(options->hw_info->gfx_level >= GFX10_3);
1703    }
1704 
1705    nir_builder builder = nir_builder_create(impl);
1706    nir_builder *b = &builder; /* shorthand to avoid writing &builder everywhere */
1707 
1708    if (options->can_cull) {
1709       analyze_shader_before_culling(shader, &state);
1710       save_reusable_variables(b, &state);
1711    }
1712 
1713    nir_cf_list extracted;
1714    nir_cf_extract(&extracted, nir_before_impl(impl),
1715                   nir_after_impl(impl));
1716    b->cursor = nir_before_impl(impl);
1717 
1718    ngg_nogs_init_vertex_indices_vars(b, impl, &state);
1719 
1720    /* Emit primitives generated query code here, so that
1721     * it executes before culling and isn't in the extracted CF.
1722     */
1723    nogs_prim_gen_query(b, &state);
1724 
1725    /* Whether a shader invocation should export a primitive;
1726     * initialized to true for all invocations that have an input primitive.
1727     */
1728    nir_store_var(b, gs_exported_var, has_input_primitive(b), 0x1u);
1729 
1730    if (!options->can_cull) {
1731       /* Newer chips can use PRIMGEN_PASSTHRU_NO_MSG to skip gs_alloc_req for NGG passthrough. */
1732       if (!(options->passthrough && options->hw_info->has_ngg_passthru_no_msg)) {
1733          /* Allocate export space on wave 0 - confirm to the HW that we want to use all possible space */
1734          nir_if *if_wave_0 = nir_push_if(b, nir_ieq_imm(b, nir_load_subgroup_id(b), 0));
1735          {
1736             nir_def *vtx_cnt = nir_load_workgroup_num_input_vertices_amd(b);
1737             nir_def *prim_cnt = nir_load_workgroup_num_input_primitives_amd(b);
1738             ac_nir_ngg_alloc_vertices_and_primitives(b, vtx_cnt, prim_cnt, false);
1739          }
1740          nir_pop_if(b, if_wave_0);
1741       }
1742 
1743       /* Take care of early primitive export, otherwise just pack the primitive export argument */
1744       if (state.early_prim_export)
1745          emit_ngg_nogs_prim_export(b, &state, NULL);
1746       else
1747          nir_store_var(b, prim_exp_arg_var, emit_ngg_nogs_prim_exp_arg(b, &state), 0x1u);
1748    } else {
1749       add_deferred_attribute_culling(b, &extracted, &state);
1750       b->cursor = nir_after_impl(impl);
1751 
1752       if (state.early_prim_export)
1753          emit_ngg_nogs_prim_export(b, &state, nir_load_var(b, state.prim_exp_arg_var));
1754 
1755       /* Wait for culling to finish using LDS. */
1756       if (need_prim_id_store_shared || has_user_edgeflags) {
1757          nir_barrier(b, .execution_scope = SCOPE_WORKGROUP,
1758                                .memory_scope = SCOPE_WORKGROUP,
1759                                .memory_semantics = NIR_MEMORY_ACQ_REL,
1760                                .memory_modes = nir_var_mem_shared);
1761       }
1762    }
1763 
1764    /* determine the LDS vertex stride */
1765    state.pervertex_lds_bytes =
1766       ngg_nogs_get_pervertex_lds_size(shader->info.stage,
1767                                       shader->num_outputs,
1768                                       state.streamout_enabled,
1769                                       options->export_primitive_id,
1770                                       state.has_user_edgeflags);
1771 
1772    if (need_prim_id_store_shared) {
1773       emit_ngg_nogs_prim_id_store_shared(b, &state);
1774 
1775       /* Wait for GS threads to store primitive ID in LDS. */
1776       nir_barrier(b, .execution_scope = SCOPE_WORKGROUP, .memory_scope = SCOPE_WORKGROUP,
1777                             .memory_semantics = NIR_MEMORY_ACQ_REL, .memory_modes = nir_var_mem_shared);
1778    } else if (options->export_primitive_id_per_prim && options->hw_info->has_attr_ring) {
1779       emit_ngg_nogs_prim_id_store_per_prim_to_attr_ring(b, &state);
1780    }
1781 
1782    nir_def *es_thread =
1783       options->can_cull ? nir_load_var(b, es_accepted_var) : has_input_vertex(b);
1784 
1785    /* Calculate the bit count here instead of below for lower SGPR usage and better ALU
1786     * scheduling.
1787     */
1788    nir_def *num_es_threads = NULL;
1789    if (options->hw_info->has_attr_ring && options->can_cull) {
1790       nir_def *es_accepted_mask =
1791          nir_ballot(b, 1, options->wave_size, nir_load_var(b, es_accepted_var));
1792       num_es_threads = nir_bit_count(b, es_accepted_mask);
1793    }
1794 
1795    nir_if *if_es_thread = nir_push_if(b, es_thread);
1796    {
1797       /* Run the actual shader */
1798       nir_cf_reinsert(&extracted, b->cursor);
1799       b->cursor = nir_after_cf_list(&if_es_thread->then_list);
1800 
1801       if (options->export_primitive_id)
1802          emit_store_ngg_nogs_es_primitive_id(b, &state);
1803    }
1804    nir_pop_if(b, if_es_thread);
1805 
1806    if (options->can_cull) {
1807       /* Replace uniforms. */
1808       apply_reusable_variables(b, &state);
1809 
1810       /* Remove the redundant position output. */
1811       remove_extra_pos_outputs(shader, &state);
1812 
1813       /* After looking at performance in apps such as Doom Eternal and The Witcher 3,
1814        * it seems best to always put the position export at the end, and
1815        * then let ACO schedule it up (slightly) only when early prim export is used.
1816        */
1817       b->cursor = nir_after_cf_list(&if_es_thread->then_list);
1818 
1819       nir_def *pos_val = nir_load_var(b, state.position_value_var);
1820       for (int i = 0; i < 4; i++)
1821          state.out.outputs[VARYING_SLOT_POS][i] = nir_channel(b, pos_val, i);
1822    }
1823 
1824    /* Gather output data and types */
1825    ngg_nogs_gather_outputs(b, &if_es_thread->then_list, &state);
1826    b->cursor = nir_after_cf_list(&if_es_thread->then_list);
1827 
1828    /* This should be after streamout and before exports. */
1829    ac_nir_clamp_vertex_color_outputs(b, &state.out);
1830 
1831    if (state.has_user_edgeflags)
1832       ngg_nogs_store_edgeflag_to_lds(b, &state);
1833 
1834    if (state.streamout_enabled) {
1835       /* TODO: support culling after streamout. */
1836       assert(!options->can_cull);
1837 
1838       ngg_nogs_store_xfb_outputs_to_lds(b, &state);
1839 
1840       b->cursor = nir_after_impl(impl);
1841       ngg_nogs_build_streamout(b, &state);
1842    }
1843 
1844    /* Take care of late primitive export */
1845    nir_if *if_late_prim_export = NULL;
1846    if (!state.early_prim_export) {
1847       b->cursor = nir_after_impl(impl);
1848 
1849       if (wait_attr_ring && options->export_primitive_id_per_prim) {
1850          /* Wait for the per-primitive primitive ID store to finish. */
1851          nir_barrier(b, .execution_scope = SCOPE_SUBGROUP,
1852                         .memory_scope = SCOPE_DEVICE,
1853                         .memory_semantics = NIR_MEMORY_RELEASE,
1854                         .memory_modes = nir_var_mem_ssbo | nir_var_shader_out | nir_var_mem_global | nir_var_image);
1855       }
1856 
1857       if_late_prim_export = emit_ngg_nogs_prim_export(b, &state, nir_load_var(b, prim_exp_arg_var));
1858    }
1859 
1860    uint64_t export_outputs = shader->info.outputs_written | VARYING_BIT_POS;
1861    if (options->kill_pointsize)
1862       export_outputs &= ~VARYING_BIT_PSIZ;
1863    if (options->kill_layer)
1864       export_outputs &= ~VARYING_BIT_LAYER;
1865 
1866    /* If streamout is enabled, export positions after streamout. This increases streamout performance
1867     * for up to 4 vec4 xfb outputs on GFX12 because the streamout code doesn't have to go through
1868     * the export allocation bottleneck. With more xfb outputs, performance starts to be limited by
1869     * memory bandwidth instead.
1870     */
1871    const bool pos_exports_in_cf = state.streamout_enabled || wait_attr_ring;
1872 
1873    nir_if *if_pos_exports = NULL;
1874    if (pos_exports_in_cf) {
1875       b->cursor = nir_after_cf_node(&if_es_thread->cf_node);
1876       ac_nir_create_output_phis(b, b->shader->info.outputs_written, b->shader->info.outputs_written_16bit, &state.out);
1877 
1878       b->cursor = nir_after_impl(impl);
1879       if_pos_exports = nir_push_if(b, es_thread);
1880    } else {
1881       b->cursor = nir_after_cf_list(&if_es_thread->then_list);
1882    }
1883 
1884    ac_nir_export_position(b, options->hw_info->gfx_level,
1885                           options->clip_cull_dist_mask,
1886                           !options->has_param_exports,
1887                           options->force_vrs, true,
1888                           export_outputs, &state.out, NULL);
1889 
1890    if (options->has_param_exports && !options->hw_info->has_attr_ring) {
1891       ac_nir_export_parameters(b, options->vs_output_param_offset,
1892                                b->shader->info.outputs_written,
1893                                b->shader->info.outputs_written_16bit,
1894                                &state.out);
1895    }
1896 
1897    if (if_pos_exports)
1898       nir_pop_if(b, if_pos_exports);
1899 
1900    if (options->has_param_exports && options->hw_info->has_attr_ring) {
1901       if (!pos_exports_in_cf) {
1902          b->cursor = nir_after_cf_node(&if_es_thread->cf_node);
1903          ac_nir_create_output_phis(b, b->shader->info.outputs_written, b->shader->info.outputs_written_16bit, &state.out);
1904       }
1905 
1906       if (!wait_attr_ring)
1907          b->cursor = nir_after_impl(impl);
1908       else if (if_late_prim_export)
1909          b->cursor = nir_after_cf_node_and_phis(&if_late_prim_export->cf_node);
1910       else
1911          b->cursor = nir_after_cf_node_and_phis(&if_es_thread->cf_node);
1912 
1913       if (!num_es_threads)
1914          num_es_threads = nir_load_merged_wave_info_amd(b);
1915 
1916       ac_nir_store_parameters_to_attr_ring(b, options->vs_output_param_offset,
1917                                           b->shader->info.outputs_written,
1918                                           b->shader->info.outputs_written_16bit,
1919                                           &state.out, num_es_threads);
1920 
1921       if (wait_attr_ring) {
1922          /* Wait for attribute ring stores to finish. */
1923          nir_barrier(b, .execution_scope = SCOPE_SUBGROUP,
1924                         .memory_scope = SCOPE_DEVICE,
1925                         .memory_semantics = NIR_MEMORY_RELEASE,
1926                         .memory_modes = nir_var_mem_ssbo | nir_var_shader_out | nir_var_mem_global | nir_var_image);
1927       }
1928    }
1929 
1930    nir_metadata_preserve(impl, nir_metadata_none);
1931    nir_validate_shader(shader, "after emitting NGG VS/TES");
1932 
1933    /* Cleanup */
1934    nir_opt_dead_write_vars(shader);
1935    nir_lower_vars_to_ssa(shader);
1936    nir_remove_dead_variables(shader, nir_var_function_temp, NULL);
1937    nir_lower_alu_to_scalar(shader, NULL, NULL);
1938    nir_lower_phis_to_scalar(shader, true);
1939 
1940    if (options->can_cull) {
1941       /* It's beneficial to redo these opts after splitting the shader. */
1942       nir_opt_sink(shader, nir_move_load_input | nir_move_const_undef | nir_move_copies);
1943       nir_opt_move(shader, nir_move_load_input | nir_move_copies | nir_move_const_undef);
1944    }
1945 
1946    bool progress;
1947    do {
1948       progress = false;
1949       NIR_PASS(progress, shader, nir_opt_undef);
1950       NIR_PASS(progress, shader, nir_opt_dce);
1951       NIR_PASS(progress, shader, nir_opt_dead_cf);
1952 
1953       if (options->can_cull)
1954          progress |= cleanup_culling_shader_after_dce(shader, b->impl, &state);
1955    } while (progress);
1956 }
1957 
1958 unsigned
1959 ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
1960                                    unsigned shader_num_outputs,
1961                                    bool streamout_enabled,
1962                                    bool export_prim_id,
1963                                    bool has_user_edgeflags,
1964                                    bool can_cull,
1965                                    bool uses_instance_id,
1966                                    bool uses_primitive_id)
1967 {
1968    /* LDS layout used only during culling */
1969    unsigned culling_pervertex_lds_bytes = can_cull ?
1970       ngg_nogs_get_culling_pervertex_lds_size(
1971          stage, uses_instance_id, uses_primitive_id, NULL) : 0;
1972 
1973    unsigned pervertex_lds_bytes =
1974       ngg_nogs_get_pervertex_lds_size(stage, shader_num_outputs, streamout_enabled,
1975                                       export_prim_id, has_user_edgeflags);
1976 
1977    return MAX2(culling_pervertex_lds_bytes, pervertex_lds_bytes);
1978 }
1979 
1980 unsigned
1981 ac_ngg_get_scratch_lds_size(gl_shader_stage stage,
1982                             unsigned workgroup_size,
1983                             unsigned wave_size,
1984                             bool streamout_enabled,
1985                             bool can_cull,
1986                             bool compact_primitives)
1987 {
1988    unsigned scratch_lds_size = 0;
1989    unsigned max_num_waves = DIV_ROUND_UP(workgroup_size, wave_size);
1990 
1991    if (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL) {
1992       if (streamout_enabled) {
1993          /* 4 dwords for the 4 streamout buffer offsets, 1 dword for the emitted primitive count */
1994          scratch_lds_size = 20;
1995       } else if (can_cull) {
1996          /* 1 byte per wave per repack, max 8 waves */
1997          unsigned num_rep = compact_primitives ? 2 : 1;
1998          scratch_lds_size = ALIGN(max_num_waves, 4u) * num_rep;
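              /* For example, a 256-thread workgroup with wave64 has max_num_waves = 4, giving
               * ALIGN(4, 4) = 4 bytes per repack, or 8 bytes when primitives are also compacted.
               */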
1999       }
2000    } else {
2001       assert(stage == MESA_SHADER_GEOMETRY);
2002 
2003       scratch_lds_size = ALIGN(max_num_waves, 4u);
2004       /* streamout takes 8 dwords for the buffer offsets and emitted vertex counts per stream */
2005       if (streamout_enabled)
2006          scratch_lds_size = MAX2(scratch_lds_size, 32);
2007    }
2008 
2009    return scratch_lds_size;
2010 }
2011