1 /*
2 * Copyright 2017 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "si_pipe.h"
8 #include "si_query.h"
9 #include "si_shader_internal.h"
10
gfx10_ngg_writes_user_edgeflags(struct si_shader * shader)11 static bool gfx10_ngg_writes_user_edgeflags(struct si_shader *shader)
12 {
13 return gfx10_has_variable_edgeflags(shader) &&
14 shader->selector->info.writes_edgeflag;
15 }
16
gfx10_ngg_export_prim_early(struct si_shader * shader)17 bool gfx10_ngg_export_prim_early(struct si_shader *shader)
18 {
19 struct si_shader_selector *sel = shader->selector;
20
21 assert(shader->key.ge.as_ngg && !shader->key.ge.as_es);
22
23 return sel->stage != MESA_SHADER_GEOMETRY &&
24 !gfx10_ngg_writes_user_edgeflags(shader);
25 }
26
clamp_gsprims_to_esverts(unsigned * max_gsprims,unsigned max_esverts,unsigned min_verts_per_prim,bool use_adjacency)27 static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts,
28 unsigned min_verts_per_prim, bool use_adjacency)
29 {
30 unsigned max_reuse = max_esverts - min_verts_per_prim;
31 if (use_adjacency)
32 max_reuse /= 2;
33 *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
34 }
35
gfx10_ngg_get_scratch_dw_size(struct si_shader * shader)36 unsigned gfx10_ngg_get_scratch_dw_size(struct si_shader *shader)
37 {
38 const struct si_shader_selector *sel = shader->selector;
39
40 return ac_ngg_get_scratch_lds_size(sel->stage,
41 si_get_max_workgroup_size(shader),
42 shader->wave_size,
43 si_shader_uses_streamout(shader),
44 si_shader_culling_enabled(shader),
45 false) / 4;
46 }
47
48 /**
49 * Determine subgroup information like maximum number of vertices and prims.
50 *
51 * This happens before the shader is uploaded, since LDS relocations during
52 * upload depend on the subgroup size.
53 */
gfx10_ngg_calculate_subgroup_info(struct si_shader * shader)54 bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
55 {
56 const struct si_shader_selector *gs_sel = shader->selector;
57 const struct si_shader_selector *es_sel =
58 shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel;
59 const gl_shader_stage gs_stage = gs_sel->stage;
60 const unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1);
61 const unsigned input_prim = si_get_input_prim(gs_sel, &shader->key, false);
62 const bool use_adjacency = mesa_prim_has_adjacency(input_prim);
63 const unsigned max_verts_per_prim = mesa_vertices_per_prim(input_prim);
64 const unsigned min_verts_per_prim = gs_stage == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1;
65
66 /* All these are in dwords. The maximum is 16K dwords (64KB) of LDS per workgroup. */
67 const unsigned scratch_lds_size = gfx10_ngg_get_scratch_dw_size(shader);
68 /* Scratch is at last of LDS space and 2 dwords aligned, so it may cost more for alignment. */
69 const unsigned max_lds_size = 16 * 1024 - ALIGN(scratch_lds_size, 2);
70 const unsigned target_lds_size = max_lds_size;
71 unsigned esvert_lds_size = 0;
72 unsigned gsprim_lds_size = 0;
73
74 /* All these are per subgroup: */
75 const unsigned min_esverts =
76 gs_sel->screen->info.gfx_level >= GFX11 ? max_verts_per_prim : /* gfx11 requires at least 1 primitive per TG */
77 gs_sel->screen->info.gfx_level >= GFX10_3 ? 29 : (24 - 1 + max_verts_per_prim);
78 bool max_vert_out_per_gs_instance = false;
79 unsigned max_gsprims_base, max_esverts_base;
80
81 max_gsprims_base = max_esverts_base = si_get_max_workgroup_size(shader);
82
83 if (gs_stage == MESA_SHADER_GEOMETRY) {
84 bool force_multi_cycling = false;
85 unsigned max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out * gs_num_invocations;
86
87 retry_select_mode:
88 if (max_out_verts_per_gsprim <= 256 && !force_multi_cycling) {
89 if (max_out_verts_per_gsprim) {
90 max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
91 }
92 } else {
93 /* Use special multi-cycling mode in which each GS
94 * instance gets its own subgroup. Does not work with
95 * tessellation. */
96 max_vert_out_per_gs_instance = true;
97 max_gsprims_base = 1;
98 max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out;
99 }
100
101 esvert_lds_size = es_sel->info.esgs_vertex_stride / 4;
102 gsprim_lds_size = (gs_sel->info.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
103
104 if (gsprim_lds_size > target_lds_size && !force_multi_cycling) {
105 if (gs_sel->tess_turns_off_ngg || es_sel->stage != MESA_SHADER_TESS_EVAL) {
106 force_multi_cycling = true;
107 goto retry_select_mode;
108 }
109 }
110 } else {
111 /* VS and TES. */
112
113 bool uses_instance_id = gs_sel->info.uses_instanceid;
114 bool uses_primitive_id = gs_sel->info.uses_primid;
115 if (gs_stage == MESA_SHADER_VERTEX) {
116 uses_instance_id |=
117 shader->key.ge.mono.instance_divisor_is_one ||
118 shader->key.ge.mono.instance_divisor_is_fetched;
119 } else {
120 uses_primitive_id |= shader->key.ge.mono.u.vs_export_prim_id;
121 }
122
123 esvert_lds_size = ac_ngg_nogs_get_pervertex_lds_size(
124 gs_stage, gs_sel->info.num_outputs,
125 si_shader_uses_streamout(shader),
126 shader->key.ge.mono.u.vs_export_prim_id,
127 gfx10_ngg_writes_user_edgeflags(shader),
128 si_shader_culling_enabled(shader),
129 uses_instance_id,
130 uses_primitive_id) / 4;
131 }
132
133 unsigned max_gsprims = max_gsprims_base;
134 unsigned max_esverts = max_esverts_base;
135
136 if (esvert_lds_size)
137 max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
138 if (gsprim_lds_size)
139 max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
140
141 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
142 clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
143 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
144
145 if (esvert_lds_size || gsprim_lds_size) {
146 /* Now that we have a rough proportionality between esverts
147 * and gsprims based on the primitive type, scale both of them
148 * down simultaneously based on required LDS space.
149 *
150 * We could be smarter about this if we knew how much vertex
151 * reuse to expect.
152 */
153 unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
154 if (lds_total > target_lds_size) {
155 max_esverts = max_esverts * target_lds_size / lds_total;
156 max_gsprims = max_gsprims * target_lds_size / lds_total;
157
158 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
159 clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
160 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
161 }
162 }
163
164 /* Round up towards full wave sizes for better ALU utilization. */
165 if (!max_vert_out_per_gs_instance) {
166 unsigned orig_max_esverts;
167 unsigned orig_max_gsprims;
168 do {
169 orig_max_esverts = max_esverts;
170 orig_max_gsprims = max_gsprims;
171
172 max_esverts = align(max_esverts, shader->wave_size);
173 max_esverts = MIN2(max_esverts, max_esverts_base);
174 if (esvert_lds_size)
175 max_esverts =
176 MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
177 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
178
179 /* Hardware restriction: minimum value of max_esverts */
180 max_esverts = MAX2(max_esverts, min_esverts);
181
182 max_gsprims = align(max_gsprims, shader->wave_size);
183 max_gsprims = MIN2(max_gsprims, max_gsprims_base);
184 if (gsprim_lds_size) {
185 /* Don't count unusable vertices to the LDS size. Those are vertices above
186 * the maximum number of vertices that can occur in the workgroup,
187 * which is e.g. max_gsprims * 3 for triangles.
188 */
189 unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
190 max_gsprims =
191 MIN2(max_gsprims, (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
192 }
193 clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
194 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
195 } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
196
197 /* Verify the restriction. */
198 assert(max_esverts >= min_esverts);
199 } else {
200 max_esverts = MAX2(max_esverts, min_esverts);
201 }
202
203 unsigned max_out_vertices =
204 max_vert_out_per_gs_instance
205 ? gs_sel->info.base.gs.vertices_out
206 : gs_stage == MESA_SHADER_GEOMETRY
207 ? max_gsprims * gs_num_invocations * gs_sel->info.base.gs.vertices_out
208 : max_esverts;
209 assert(max_out_vertices <= 256);
210
211 shader->ngg.hw_max_esverts = max_esverts;
212 shader->ngg.max_gsprims = max_gsprims;
213 shader->ngg.max_out_verts = max_out_vertices;
214 shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
215
216 /* Don't count unusable vertices. */
217 shader->gs_info.esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) *
218 esvert_lds_size;
219 shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;
220
221 assert(shader->ngg.hw_max_esverts >= min_esverts); /* HW limitation */
222
223 /* If asserts are disabled, we use the same conditions to return false */
224 return max_esverts >= max_verts_per_prim && max_gsprims >= 1 &&
225 max_out_vertices <= 256 &&
226 shader->ngg.hw_max_esverts >= min_esverts;
227 }
228