• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2017 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "si_pipe.h"
8 #include "si_query.h"
9 #include "si_shader_internal.h"
10 
gfx10_ngg_writes_user_edgeflags(struct si_shader * shader)11 static bool gfx10_ngg_writes_user_edgeflags(struct si_shader *shader)
12 {
13    return gfx10_has_variable_edgeflags(shader) &&
14           shader->selector->info.writes_edgeflag;
15 }
16 
gfx10_ngg_export_prim_early(struct si_shader * shader)17 bool gfx10_ngg_export_prim_early(struct si_shader *shader)
18 {
19    struct si_shader_selector *sel = shader->selector;
20 
21    assert(shader->key.ge.as_ngg && !shader->key.ge.as_es);
22 
23    return sel->stage != MESA_SHADER_GEOMETRY &&
24           !gfx10_ngg_writes_user_edgeflags(shader) &&
25           sel->screen->info.gfx_level < GFX11;
26 }
27 
clamp_gsprims_to_esverts(unsigned * max_gsprims,unsigned max_esverts,unsigned min_verts_per_prim,bool use_adjacency)28 static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts,
29                                      unsigned min_verts_per_prim, bool use_adjacency)
30 {
31    unsigned max_reuse = max_esverts - min_verts_per_prim;
32    if (use_adjacency)
33       max_reuse /= 2;
34    *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
35 }
36 
gfx10_ngg_get_scratch_dw_size(struct si_shader * shader)37 unsigned gfx10_ngg_get_scratch_dw_size(struct si_shader *shader)
38 {
39    const struct si_shader_selector *sel = shader->selector;
40 
41    return ac_ngg_get_scratch_lds_size(sel->stage,
42                                       si_get_max_workgroup_size(shader),
43                                       shader->wave_size,
44                                       si_shader_uses_streamout(shader),
45                                       si_shader_culling_enabled(shader),
46                                       false) / 4;
47 }
48 
49 /**
50  * Determine subgroup information like maximum number of vertices and prims.
51  *
52  * This happens before the shader is uploaded, since LDS relocations during
53  * upload depend on the subgroup size.
54  */
gfx10_ngg_calculate_subgroup_info(struct si_shader * shader)55 bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
56 {
57    const struct si_shader_selector *gs_sel = shader->selector;
58    const struct si_shader_selector *es_sel =
59       shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel;
60    const gl_shader_stage gs_stage = gs_sel->stage;
61    const unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1);
62    const unsigned input_prim = si_get_input_prim(gs_sel, &shader->key, false);
63    const bool use_adjacency = mesa_prim_has_adjacency(input_prim);
64    const unsigned max_verts_per_prim = mesa_vertices_per_prim(input_prim);
65    const unsigned min_verts_per_prim = gs_stage == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1;
66 
67    /* All these are in dwords. The maximum is 16K dwords (64KB) of LDS per workgroup. */
68    const unsigned scratch_lds_size = gfx10_ngg_get_scratch_dw_size(shader);
69    /* Scratch is at last of LDS space and 2 dwords aligned, so it may cost more for alignment. */
70    const unsigned max_lds_size = 16 * 1024 - ALIGN(scratch_lds_size, 2);
71    const unsigned target_lds_size = max_lds_size;
72    unsigned esvert_lds_size = 0;
73    unsigned gsprim_lds_size = 0;
74 
75    /* All these are per subgroup: */
76    const unsigned min_esverts =
77       gs_sel->screen->info.gfx_level >= GFX11 ? max_verts_per_prim : /* gfx11 requires at least 1 primitive per TG */
78       gs_sel->screen->info.gfx_level >= GFX10_3 ? 29 : (24 - 1 + max_verts_per_prim);
79    bool max_vert_out_per_gs_instance = false;
80    unsigned max_gsprims_base, max_esverts_base;
81 
82    max_gsprims_base = max_esverts_base = si_get_max_workgroup_size(shader);
83 
84    if (gs_stage == MESA_SHADER_GEOMETRY) {
85       bool force_multi_cycling = false;
86       unsigned max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out * gs_num_invocations;
87 
88 retry_select_mode:
89       if (max_out_verts_per_gsprim <= 256 && !force_multi_cycling) {
90          if (max_out_verts_per_gsprim) {
91             max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
92          }
93       } else {
94          /* Use special multi-cycling mode in which each GS
95           * instance gets its own subgroup. Does not work with
96           * tessellation. */
97          max_vert_out_per_gs_instance = true;
98          max_gsprims_base = 1;
99          max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out;
100       }
101 
102       esvert_lds_size = es_sel->info.esgs_vertex_stride / 4;
103       gsprim_lds_size = (gs_sel->info.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
104 
105       if (gsprim_lds_size > target_lds_size && !force_multi_cycling) {
106          if (gs_sel->tess_turns_off_ngg || es_sel->stage != MESA_SHADER_TESS_EVAL) {
107             force_multi_cycling = true;
108             goto retry_select_mode;
109          }
110       }
111    } else {
112       /* VS and TES. */
113 
114       bool uses_instance_id = gs_sel->info.uses_instanceid;
115       bool uses_primitive_id = gs_sel->info.uses_primid;
116       if (gs_stage == MESA_SHADER_VERTEX) {
117          uses_instance_id |=
118             shader->key.ge.mono.instance_divisor_is_one ||
119             shader->key.ge.mono.instance_divisor_is_fetched;
120       } else {
121          uses_primitive_id |= shader->key.ge.mono.u.vs_export_prim_id;
122       }
123 
124       esvert_lds_size = ac_ngg_nogs_get_pervertex_lds_size(
125          gs_stage, gs_sel->info.num_outputs,
126          si_shader_uses_streamout(shader),
127          shader->key.ge.mono.u.vs_export_prim_id,
128          gfx10_ngg_writes_user_edgeflags(shader),
129          si_shader_culling_enabled(shader),
130          uses_instance_id,
131          uses_primitive_id) / 4;
132    }
133 
134    unsigned max_gsprims = max_gsprims_base;
135    unsigned max_esverts = max_esverts_base;
136 
137    if (esvert_lds_size)
138       max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
139    if (gsprim_lds_size)
140       max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
141 
142    max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
143    clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
144    assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
145 
146    if (esvert_lds_size || gsprim_lds_size) {
147       /* Now that we have a rough proportionality between esverts
148        * and gsprims based on the primitive type, scale both of them
149        * down simultaneously based on required LDS space.
150        *
151        * We could be smarter about this if we knew how much vertex
152        * reuse to expect.
153        */
154       unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
155       if (lds_total > target_lds_size) {
156          max_esverts = max_esverts * target_lds_size / lds_total;
157          max_gsprims = max_gsprims * target_lds_size / lds_total;
158 
159          max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
160          clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
161          assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
162       }
163    }
164 
165    /* Round up towards full wave sizes for better ALU utilization. */
166    if (!max_vert_out_per_gs_instance) {
167       unsigned orig_max_esverts;
168       unsigned orig_max_gsprims;
169       do {
170          orig_max_esverts = max_esverts;
171          orig_max_gsprims = max_gsprims;
172 
173          max_esverts = align(max_esverts, shader->wave_size);
174          max_esverts = MIN2(max_esverts, max_esverts_base);
175          if (esvert_lds_size)
176             max_esverts =
177                MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
178          max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
179 
180          /* Hardware restriction: minimum value of max_esverts */
181          max_esverts = MAX2(max_esverts, min_esverts);
182 
183          max_gsprims = align(max_gsprims, shader->wave_size);
184          max_gsprims = MIN2(max_gsprims, max_gsprims_base);
185          if (gsprim_lds_size) {
186             /* Don't count unusable vertices to the LDS size. Those are vertices above
187              * the maximum number of vertices that can occur in the workgroup,
188              * which is e.g. max_gsprims * 3 for triangles.
189              */
190             unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
191             max_gsprims =
192                MIN2(max_gsprims, (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
193          }
194          clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
195          assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
196       } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
197 
198       /* Verify the restriction. */
199       assert(max_esverts >= min_esverts);
200    } else {
201       max_esverts = MAX2(max_esverts, min_esverts);
202    }
203 
204    unsigned max_out_vertices =
205       max_vert_out_per_gs_instance
206          ? gs_sel->info.base.gs.vertices_out
207          : gs_stage == MESA_SHADER_GEOMETRY
208               ? max_gsprims * gs_num_invocations * gs_sel->info.base.gs.vertices_out
209               : max_esverts;
210    assert(max_out_vertices <= 256);
211 
212    shader->ngg.hw_max_esverts = max_esverts;
213    shader->ngg.max_gsprims = max_gsprims;
214    shader->ngg.max_out_verts = max_out_vertices;
215    shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
216 
217    /* Don't count unusable vertices. */
218    shader->gs_info.esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) *
219                                     esvert_lds_size;
220    shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;
221 
222    assert(shader->ngg.hw_max_esverts >= min_esverts); /* HW limitation */
223 
224    /* If asserts are disabled, we use the same conditions to return false */
225    return max_esverts >= max_verts_per_prim && max_gsprims >= 1 &&
226           max_out_vertices <= 256 &&
227           shader->ngg.hw_max_esverts >= min_esverts;
228 }
229