1 /*
2 * Copyright 2019 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 */
25
26 #include "ac_llvm_cull.h"
27 #include "si_build_pm4.h"
28 #include "si_pipe.h"
29 #include "si_shader_internal.h"
30 #include "sid.h"
31 #include "util/fast_idiv_by_const.h"
32 #include "util/u_prim.h"
33 #include "util/u_suballoc.h"
34 #include "util/u_upload_mgr.h"
35
36 /* Based on:
37 * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf
38 */
39
40 /* This file implements primitive culling using asynchronous compute.
41 * It's written to be GL conformant.
42 *
43 * It takes a monolithic VS in LLVM IR returning gl_Position and invokes it
44 * in a compute shader. The shader processes 1 primitive/thread by invoking
45 * the VS for each vertex to get the positions, decomposes strips and fans
46 * into triangles (if needed), eliminates primitive restart (if needed),
47 * does (W<0) culling, face culling, view XY culling, zero-area and
48 * small-primitive culling, and generates a new index buffer that doesn't
49 * contain culled primitives.
50 *
51 * The index buffer is generated using the Ordered Count feature of GDS,
52 * which is an atomic counter that is incremented in the wavefront launch
53 * order, so that the original primitive order is preserved.
54 *
55 * Another GDS ordered counter is used to eliminate primitive restart indices.
56 * If a restart index lands on an even thread ID, the compute shader has to flip
57 * the primitive orientation of the whole following triangle strip. The primitive
58 * orientation has to be correct after strip and fan decomposition for two-sided
59 * shading to behave correctly. The decomposition also needs to be aware of
60 * which vertex is the provoking vertex for flat shading to behave correctly.
61 *
62 * IB = a GPU command buffer
63 *
64 * Both the compute and gfx IBs run in parallel sort of like CE and DE.
65 * The gfx IB has a CP barrier (REWIND packet) before a draw packet. REWIND
66 * doesn't continue if its word isn't 0x80000000. Once compute shaders are
67 * finished culling, the last wave will write the final primitive count from
68 * GDS directly into the count word of the draw packet in the gfx IB, and
69 * a CS_DONE event will signal the REWIND packet to continue. It's really
70 * a direct draw with command buffer patching from the compute queue.
71 *
72 * The compute IB doesn't have to start when its corresponding gfx IB starts,
73 * but can start sooner. The compute IB is signaled to start after the last
74 * execution barrier in the *previous* gfx IB. This is handled as follows.
75 * The kernel GPU scheduler starts the compute IB after the previous gfx IB has
76 * started. The compute IB then waits (WAIT_REG_MEM) for a mid-IB fence that
77 * represents the barrier in the previous gfx IB.
78 *
79 * Features:
80 * - Triangle strips and fans are decomposed into an indexed triangle list.
81 * The decomposition differs based on the provoking vertex state.
82 * - Instanced draws are converted into non-instanced draws for 16-bit indices.
83 * (InstanceID is stored in the high bits of VertexID and unpacked by VS)
84 * - Primitive restart is fully supported with triangle strips, including
85 * correct primitive orientation across multiple waves. (restart indices
86 * reset primitive orientation)
87 * - W<0 culling (W<0 is behind the viewer, sort of like near Z culling).
88 * - Back face culling, incl. culling zero-area / degenerate primitives.
89 * - View XY culling.
90 * - View Z culling (disabled due to limited impact with perspective projection).
91 * - Small primitive culling for all MSAA modes and all quant modes.
92 *
93 * The following are not implemented:
94 * - ClipVertex/ClipDistance/CullDistance-based culling.
95 * - Scissor culling.
96 * - HiZ culling.
97 *
98 * Limitations (and unimplemented features that may be possible to implement):
99 * - Only triangles, triangle strips, and triangle fans are supported.
100 * - Primitive restart is only supported with triangle strips.
101 * - Instancing and primitive restart can't be used together.
102 * - Instancing is only supported with 16-bit indices and instance count <= 2^16.
103 * - The instance divisor buffer is unavailable, so all divisors must be
104 * either 0 or 1.
105 * - Multidraws where the vertex shader reads gl_DrawID are unsupported.
106 * - No support for tessellation and geometry shaders.
107 * (patch elimination where tess factors are 0 would be possible to implement)
108 * - The vertex shader must not contain memory stores.
109 * - All VS resources must not have a write usage in the command buffer.
110 * - Bindless textures and images must not occur in the vertex shader.
111 *
112 * User data SGPR layout:
113 * INDEX_BUFFERS: pointer to constants
114 * 0..3: input index buffer - typed buffer view
115 * 4..7: output index buffer - typed buffer view
116 * 8..11: viewport state - scale.xy, translate.xy
117 * VERTEX_COUNTER: counter address or first primitive ID
118 * - If unordered memory counter: address of "count" in the draw packet
119 * and is incremented atomically by the shader.
120 * - If unordered GDS counter: address of "count" in GDS starting from 0,
121 * must be initialized to 0 before the dispatch.
122 * - If ordered GDS counter: the primitive ID that should reset the vertex
123 * counter to 0 in GDS
124 * LAST_WAVE_PRIM_ID: the primitive ID that should write the final vertex
125 * count to memory if using GDS ordered append
126 * VERTEX_COUNT_ADDR: where the last wave should write the vertex count if
127 * using GDS ordered append
128 * VS.VERTEX_BUFFERS: same value as VS
129 * VS.CONST_AND_SHADER_BUFFERS: same value as VS
130 * VS.SAMPLERS_AND_IMAGES: same value as VS
131 * VS.BASE_VERTEX: same value as VS
132 * VS.START_INSTANCE: same value as VS
133 * NUM_PRIMS_UDIV_MULTIPLIER: For fast 31-bit division by the number of primitives
134 * per instance for instancing.
135 * NUM_PRIMS_UDIV_TERMS:
136 * - Bits [0:4]: "post_shift" for fast 31-bit division for instancing.
137 * - Bits [5:31]: The number of primitives per instance for computing the remainder.
138 * PRIMITIVE_RESTART_INDEX
139 * SMALL_PRIM_CULLING_PRECISION: Scale the primitive bounding box by this number.
140 *
141 *
142 * The code contains 3 codepaths:
143 * - Unordered memory counter (for debugging, random primitive order, no primitive restart)
144 * - Unordered GDS counter (for debugging, random primitive order, no primitive restart)
145 * - Ordered GDS counter (it preserves the primitive order)
146 *
147 * How to test primitive restart (the most complicated part because it needs
148 * to get the primitive orientation right):
149 * Set THREADGROUP_SIZE to 2 to exercise both intra-wave and inter-wave
150 * primitive orientation flips with small draw calls, which is what most tests use.
151 * You can also enable draw call splitting into draw calls with just 2 primitives.
152 */
153
/* At least 256 is needed for the fastest wave launch rate from compute queues
 * due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */
#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */
#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */
#define MAX_WAVES_PER_SH 0 /* no limit */
#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */
/* Don't cull Z. We already do (W < 0) culling for primitives behind the viewer. */
#define CULL_Z 0
/* 0 = unordered memory counter, 1 = unordered GDS counter, 2 = ordered GDS counter */
#define VERTEX_COUNTER_GDS_MODE 2
#define GDS_SIZE_UNORDERED (4 * 1024) /* only for the unordered GDS counter */

/* Grouping compute dispatches for small draw calls: How many primitives from multiple
 * draw calls to process by compute before signaling the gfx IB. This reduces the number
 * of EOP events + REWIND packets, because they decrease performance. */
#define PRIMS_PER_BATCH (512 * 1024)
/* Draw call splitting at the packet level. This allows signaling the gfx IB
 * for big draw calls sooner, but doesn't allow context flushes between packets.
 * Primitive restart is supported. Only implemented for ordered append. */
#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH
/* If there is not enough ring buffer space for the current IB, split draw calls into
 * this number of primitives, so that we can flush the context and get free ring space. */
#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH

/* Derived values. */
/* THREADGROUP_SIZE divided by 64, rounded up. */
#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64)
/* Effective packet-level split size: the configured value when the ordered GDS
 * counter (mode 2) is selected; otherwise UINT_MAX with the bits below
 * THREADGROUP_SIZE cleared (a multiple of THREADGROUP_SIZE when that is a
 * power of two). */
#define SPLIT_PRIMS_PACKET_LEVEL \
   (VERTEX_COUNTER_GDS_MODE == 2 ? SPLIT_PRIMS_PACKET_LEVEL_VALUE \
                                 : UINT_MAX & ~(THREADGROUP_SIZE - 1))

/* 0x80000000 is the word value REWIND waits for, per the file overview
 * ("REWIND doesn't continue if its word isn't 0x80000000"). */
#define REWIND_SIGNAL_BIT 0x80000000
/* For emulating the rewind packet on CI. */
#define FORCE_REWIND_EMULATION 0
187
si_initialize_prim_discard_tunables(struct si_screen * sscreen,bool is_aux_context,unsigned * prim_discard_vertex_count_threshold,unsigned * index_ring_size_per_ib)188 void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context,
189 unsigned *prim_discard_vertex_count_threshold,
190 unsigned *index_ring_size_per_ib)
191 {
192 *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */
193
194 if (sscreen->info.chip_class == GFX6 || /* SI support is not implemented */
195 !sscreen->info.has_gds_ordered_append || sscreen->debug_flags & DBG(NO_PD) || is_aux_context)
196 return;
197
198 /* TODO: enable this after the GDS kernel memory management is fixed */
199 bool enable_on_pro_graphics_by_default = false;
200
201 if (sscreen->debug_flags & DBG(ALWAYS_PD) || sscreen->debug_flags & DBG(PD) ||
202 (enable_on_pro_graphics_by_default && sscreen->info.is_pro_graphics &&
203 (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_HAWAII ||
204 sscreen->info.family == CHIP_TONGA || sscreen->info.family == CHIP_FIJI ||
205 sscreen->info.family == CHIP_POLARIS10 || sscreen->info.family == CHIP_POLARIS11 ||
206 sscreen->info.family == CHIP_VEGA10 || sscreen->info.family == CHIP_VEGA20))) {
207 *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */
208
209 if (sscreen->debug_flags & DBG(ALWAYS_PD))
210 *prim_discard_vertex_count_threshold = 0; /* always enable */
211
212 const uint32_t MB = 1024 * 1024;
213 const uint64_t GB = 1024 * 1024 * 1024;
214
215 /* The total size is double this per context.
216 * Greater numbers allow bigger gfx IBs.
217 */
218 if (sscreen->info.vram_size <= 2 * GB)
219 *index_ring_size_per_ib = 64 * MB;
220 else if (sscreen->info.vram_size <= 4 * GB)
221 *index_ring_size_per_ib = 128 * MB;
222 else
223 *index_ring_size_per_ib = 256 * MB;
224 }
225 }
226
227 /* Opcode can be "add" or "swap". */
si_build_ds_ordered_op(struct si_shader_context * ctx,const char * opcode,LLVMValueRef m0,LLVMValueRef value,unsigned ordered_count_index,bool release,bool done)228 static LLVMValueRef si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode,
229 LLVMValueRef m0, LLVMValueRef value,
230 unsigned ordered_count_index, bool release, bool done)
231 {
232 if (ctx->screen->info.chip_class >= GFX10)
233 ordered_count_index |= 1 << 24; /* number of dwords == 1 */
234
235 LLVMValueRef args[] = {
236 LLVMBuildIntToPtr(ctx->ac.builder, m0, LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""),
237 value,
238 LLVMConstInt(ctx->ac.i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */
239 ctx->ac.i32_0, /* scope */
240 ctx->ac.i1false, /* volatile */
241 LLVMConstInt(ctx->ac.i32, ordered_count_index, 0),
242 LLVMConstInt(ctx->ac.i1, release, 0),
243 LLVMConstInt(ctx->ac.i1, done, 0),
244 };
245
246 char intrinsic[64];
247 snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode);
248 return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->ac.i32, args, ARRAY_SIZE(args), 0);
249 }
250
/**
 * Turn a 32-bit address value into a global-address-space i32 pointer.
 *
 * The value is zero-extended to 64 bits and OR'ed with the screen's
 * address32_hi shifted into the upper 32 bits.
 */
static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr)
{
   LLVMValueRef high_bits =
      LLVMConstInt(ctx->ac.i64, (uint64_t)ctx->screen->info.address32_hi << 32, 0);
   LLVMValueRef low_bits = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->ac.i64, "");
   LLVMValueRef address = LLVMBuildOr(ctx->ac.builder, low_bits, high_bits, "");

   return LLVMBuildIntToPtr(ctx->ac.builder, address,
                            LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GLOBAL), "");
}
259
/* State shared between si_enter_thread0_section and si_exit_thread0_section. */
struct si_thread0_section {
   struct si_shader_context *ctx; /* shader context recorded when the section is entered */
   LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */
   /* NOTE(review): not written or read by the enter/exit helpers next to this
    * struct; confirm whether it is still needed. */
   LLVMValueRef saved_exec;
};
265
266 /* Enter a section that only executes on thread 0. */
si_enter_thread0_section(struct si_shader_context * ctx,struct si_thread0_section * section,LLVMValueRef thread_id)267 static void si_enter_thread0_section(struct si_shader_context *ctx,
268 struct si_thread0_section *section, LLVMValueRef thread_id)
269 {
270 section->ctx = ctx;
271 section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "result0");
272
273 /* This IF has 4 instructions:
274 * v_and_b32_e32 v, 63, v ; get the thread ID
275 * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0
276 * s_and_saveexec_b64 s, vcc
277 * s_cbranch_execz BB0_4
278 *
279 * It could just be s_and_saveexec_b64 s, 1.
280 */
281 ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, ctx->ac.i32_0, ""),
282 12601);
283 }
284
285 /* Exit a section that only executes on thread 0 and broadcast the result
286 * to all threads. */
si_exit_thread0_section(struct si_thread0_section * section,LLVMValueRef * result)287 static void si_exit_thread0_section(struct si_thread0_section *section, LLVMValueRef *result)
288 {
289 struct si_shader_context *ctx = section->ctx;
290
291 LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result);
292
293 ac_build_endif(&ctx->ac, 12601);
294
295 /* Broadcast the result from thread 0 to all threads. */
296 *result =
297 ac_build_readlane(&ctx->ac, LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL);
298 }
299
si_build_prim_discard_compute_shader(struct si_shader_context * ctx)300 void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
301 {
302 struct si_shader_key *key = &ctx->shader->key;
303 LLVMBuilderRef builder = ctx->ac.builder;
304 LLVMValueRef vs = ctx->main_fn;
305
306 /* Always inline the VS function. */
307 ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE);
308 LLVMSetLinkage(vs, LLVMPrivateLinkage);
309
310 enum ac_arg_type const_desc_type;
311 if (ctx->shader->selector->info.base.num_ubos == 1 &&
312 ctx->shader->selector->info.base.num_ssbos == 0)
313 const_desc_type = AC_ARG_CONST_FLOAT_PTR;
314 else
315 const_desc_type = AC_ARG_CONST_DESC_PTR;
316
317 memset(&ctx->args, 0, sizeof(ctx->args));
318
319 struct ac_arg param_index_buffers_and_constants, param_vertex_counter;
320 struct ac_arg param_vb_desc, param_const_desc;
321 struct ac_arg param_base_vertex, param_start_instance;
322 struct ac_arg param_block_id, param_local_id, param_ordered_wave_id;
323 struct ac_arg param_restart_index, param_smallprim_precision;
324 struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms;
325 struct ac_arg param_sampler_desc, param_last_wave_prim_id, param_vertex_count_addr;
326
327 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
328 ¶m_index_buffers_and_constants);
329 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_counter);
330 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_last_wave_prim_id);
331 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_count_addr);
332 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, ¶m_vb_desc);
333 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type, ¶m_const_desc);
334 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, ¶m_sampler_desc);
335 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_base_vertex);
336 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_start_instance);
337 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_multiplier);
338 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_terms);
339 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_restart_index);
340 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, ¶m_smallprim_precision);
341
342 /* Block ID and thread ID inputs. */
343 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_block_id);
344 if (VERTEX_COUNTER_GDS_MODE == 2)
345 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_ordered_wave_id);
346 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, ¶m_local_id);
347
348 /* Create the compute shader function. */
349 gl_shader_stage old_stage = ctx->stage;
350 ctx->stage = MESA_SHADER_COMPUTE;
351 si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE);
352 ctx->stage = old_stage;
353
354 if (VERTEX_COUNTER_GDS_MODE == 2) {
355 ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256);
356 } else if (VERTEX_COUNTER_GDS_MODE == 1) {
357 ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", GDS_SIZE_UNORDERED);
358 }
359
360 /* Assemble parameters for VS. */
361 LLVMValueRef vs_params[16];
362 unsigned num_vs_params = 0;
363 unsigned param_vertex_id, param_instance_id;
364
365 vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */
366 vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */
367 vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc);
368 vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc);
369 vs_params[num_vs_params++] =
370 LLVMConstInt(ctx->ac.i32, S_VS_STATE_INDEXED(key->opt.cs_indexed), 0);
371 vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex);
372 vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance);
373 vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */
374 vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc);
375
376 vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */
377 vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */
378 vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */
379 vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */
380
381 assert(num_vs_params <= ARRAY_SIZE(vs_params));
382 assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs))));
383
384 /* Load descriptors. (load 8 dwords at once) */
385 LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8];
386
387 LLVMValueRef index_buffers_and_constants =
388 ac_get_arg(&ctx->ac, param_index_buffers_and_constants);
389 tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants,
390 ac_array_in_const32_addr_space(ctx->ac.v8i32), "");
391 tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0);
392
393 for (unsigned i = 0; i < 8; i++)
394 desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i);
395
396 input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4);
397 output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4);
398
399 /* Compute PrimID and InstanceID. */
400 LLVMValueRef global_thread_id = ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id),
401 LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0),
402 ac_get_arg(&ctx->ac, param_local_id));
403 LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */
404 LLVMValueRef instance_id = ctx->ac.i32_0;
405
406 if (key->opt.cs_instancing) {
407 LLVMValueRef num_prims_udiv_terms = ac_get_arg(&ctx->ac, param_num_prims_udiv_terms);
408 LLVMValueRef num_prims_udiv_multiplier =
409 ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier);
410 /* Unpack num_prims_udiv_terms. */
411 LLVMValueRef post_shift =
412 LLVMBuildAnd(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 0x1f, 0), "");
413 LLVMValueRef prims_per_instance =
414 LLVMBuildLShr(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 5, 0), "");
415 /* Divide the total prim_id by the number of prims per instance. */
416 instance_id =
417 ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, num_prims_udiv_multiplier, post_shift);
418 /* Compute the remainder. */
419 prim_id = LLVMBuildSub(builder, prim_id,
420 LLVMBuildMul(builder, instance_id, prims_per_instance, ""), "");
421 }
422
423 /* Generate indices (like a non-indexed draw call). */
424 LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)};
425 unsigned vertices_per_prim = 3;
426
427 switch (key->opt.cs_prim_type) {
428 case PIPE_PRIM_TRIANGLES:
429 for (unsigned i = 0; i < 3; i++) {
430 index[i] = ac_build_imad(&ctx->ac, prim_id, LLVMConstInt(ctx->ac.i32, 3, 0),
431 LLVMConstInt(ctx->ac.i32, i, 0));
432 }
433 break;
434 case PIPE_PRIM_TRIANGLE_STRIP:
435 for (unsigned i = 0; i < 3; i++) {
436 index[i] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, i, 0), "");
437 }
438 break;
439 case PIPE_PRIM_TRIANGLE_FAN:
440 /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper
441 * and rasterizer as a normal triangle, so we need to put the provoking
442 * vertex into the correct index variable and preserve orientation at the same time.
443 * gl_VertexID is preserved, because it's equal to the index.
444 */
445 if (key->opt.cs_provoking_vertex_first) {
446 index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), "");
447 index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), "");
448 index[2] = ctx->ac.i32_0;
449 } else {
450 index[0] = ctx->ac.i32_0;
451 index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), "");
452 index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), "");
453 }
454 break;
455 default:
456 unreachable("unexpected primitive type");
457 }
458
459 /* Fetch indices. */
460 if (key->opt.cs_indexed) {
461 for (unsigned i = 0; i < 3; i++) {
462 index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, index[i], ctx->ac.i32_0,
463 1, 0, true, false);
464 index[i] = ac_to_integer(&ctx->ac, index[i]);
465 }
466 }
467
468 LLVMValueRef ordered_wave_id = NULL;
469
470 /* Extract the ordered wave ID. */
471 if (VERTEX_COUNTER_GDS_MODE == 2) {
472 ordered_wave_id = ac_get_arg(&ctx->ac, param_ordered_wave_id);
473 ordered_wave_id =
474 LLVMBuildLShr(builder, ordered_wave_id, LLVMConstInt(ctx->ac.i32, 6, 0), "");
475 ordered_wave_id =
476 LLVMBuildAnd(builder, ordered_wave_id, LLVMConstInt(ctx->ac.i32, 0xfff, 0), "");
477 }
478 LLVMValueRef thread_id = LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id),
479 LLVMConstInt(ctx->ac.i32, 63, 0), "");
480
481 /* Every other triangle in a strip has a reversed vertex order, so we
482 * need to swap vertices of odd primitives to get the correct primitive
483 * orientation when converting triangle strips to triangles. Primitive
484 * restart complicates it, because a strip can start anywhere.
485 */
486 LLVMValueRef prim_restart_accepted = ctx->ac.i1true;
487 LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter);
488
489 if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) {
490 /* Without primitive restart, odd primitives have reversed orientation.
491 * Only primitive restart can flip it with respect to the first vertex
492 * of the draw call.
493 */
494 LLVMValueRef first_is_odd = ctx->ac.i1false;
495
496 /* Handle primitive restart. */
497 if (key->opt.cs_primitive_restart) {
498 /* Get the GDS primitive restart continue flag and clear
499 * the flag in vertex_counter. This flag is used when the draw
500 * call was split and we need to load the primitive orientation
501 * flag from GDS for the first wave too.
502 */
503 LLVMValueRef gds_prim_restart_continue =
504 LLVMBuildLShr(builder, vertex_counter, LLVMConstInt(ctx->ac.i32, 31, 0), "");
505 gds_prim_restart_continue =
506 LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->ac.i1, "");
507 vertex_counter =
508 LLVMBuildAnd(builder, vertex_counter, LLVMConstInt(ctx->ac.i32, 0x7fffffff, 0), "");
509
510 LLVMValueRef index0_is_reset;
511
512 for (unsigned i = 0; i < 3; i++) {
513 LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i],
514 ac_get_arg(&ctx->ac, param_restart_index), "");
515 if (i == 0)
516 index0_is_reset = LLVMBuildNot(builder, not_reset, "");
517 prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted, not_reset, "");
518 }
519
520 /* If the previous waves flip the primitive orientation
521 * of the current triangle strip, it will be stored in GDS.
522 *
523 * Sometimes the correct orientation is not needed, in which case
524 * we don't need to execute this.
525 */
526 if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) {
527 /* If there are reset indices in this wave, get the thread index
528 * where the most recent strip starts relative to each thread.
529 */
530 LLVMValueRef preceding_threads_mask =
531 LLVMBuildSub(builder,
532 LLVMBuildShl(builder, ctx->ac.i64_1,
533 LLVMBuildZExt(builder, thread_id, ctx->ac.i64, ""), ""),
534 ctx->ac.i64_1, "");
535
536 LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset);
537 LLVMValueRef preceding_reset_threadmask =
538 LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, "");
539 LLVMValueRef strip_start = ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL);
540 strip_start = LLVMBuildAdd(builder, strip_start, ctx->ac.i32_1, "");
541
            /* This flips the orientation based on reset indices within this wave only. */
543 first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->ac.i1, "");
544
545 LLVMValueRef last_strip_start, prev_wave_state, ret, tmp;
546 LLVMValueRef is_first_wave, current_wave_resets_index;
547
548 /* Get the thread index where the last strip starts in this wave.
549 *
550 * If the last strip doesn't start in this wave, the thread index
551 * will be 0.
552 *
553 * If the last strip starts in the next wave, the thread index will
554 * be 64.
555 */
556 last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL);
557 last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->ac.i32_1, "");
558
559 struct si_thread0_section section;
560 si_enter_thread0_section(ctx, §ion, thread_id);
561
562 /* This must be done in the thread 0 section, because
563 * we expect PrimID to be 0 for the whole first wave
564 * in this expression.
565 *
566 * NOTE: This will need to be different if we wanna support
567 * instancing with primitive restart.
568 */
569 is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->ac.i32_0, "");
570 is_first_wave = LLVMBuildAnd(builder, is_first_wave,
571 LLVMBuildNot(builder, gds_prim_restart_continue, ""), "");
572 current_wave_resets_index =
573 LLVMBuildICmp(builder, LLVMIntNE, last_strip_start, ctx->ac.i32_0, "");
574
575 ret = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "prev_state");
576
577 /* Save the last strip start primitive index in GDS and read
578 * the value that previous waves stored.
579 *
580 * if (is_first_wave || current_wave_resets_strip)
581 * // Read the value that previous waves stored and store a new one.
582 * first_is_odd = ds.ordered.swap(last_strip_start);
583 * else
584 * // Just read the value that previous waves stored.
585 * first_is_odd = ds.ordered.add(0);
586 */
587 ac_build_ifcc(
588 &ctx->ac, LLVMBuildOr(builder, is_first_wave, current_wave_resets_index, ""), 12602);
589 {
590 /* The GDS address is always 0 with ordered append. */
591 tmp = si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, last_strip_start, 1, true,
592 false);
593 LLVMBuildStore(builder, tmp, ret);
594 }
595 ac_build_else(&ctx->ac, 12603);
596 {
597 /* Just read the value from GDS. */
598 tmp = si_build_ds_ordered_op(ctx, "add", ordered_wave_id, ctx->ac.i32_0, 1, true,
599 false);
600 LLVMBuildStore(builder, tmp, ret);
601 }
602 ac_build_endif(&ctx->ac, 12602);
603
604 prev_wave_state = LLVMBuildLoad(builder, ret, "");
605 /* Ignore the return value if this is the first wave. */
606 prev_wave_state =
607 LLVMBuildSelect(builder, is_first_wave, ctx->ac.i32_0, prev_wave_state, "");
608 si_exit_thread0_section(§ion, &prev_wave_state);
609 prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->ac.i1, "");
610
611 /* If the strip start appears to be on thread 0 for the current primitive
612 * (meaning the reset index is not present in this wave and might have
613 * appeared in previous waves), use the value from GDS to determine
614 * primitive orientation.
615 *
616 * If the strip start is in this wave for the current primitive, use
617 * the value from the current wave to determine primitive orientation.
618 */
619 LLVMValueRef strip_start_is0 =
620 LLVMBuildICmp(builder, LLVMIntEQ, strip_start, ctx->ac.i32_0, "");
621 first_is_odd =
622 LLVMBuildSelect(builder, strip_start_is0, prev_wave_state, first_is_odd, "");
623 }
624 }
625 /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */
626 LLVMValueRef prim_is_odd = LLVMBuildXor(
627 builder, first_is_odd, LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), "");
628
629 /* Convert triangle strip indices to triangle indices. */
630 ac_build_triangle_strip_indices_to_triangle(
631 &ctx->ac, prim_is_odd, LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0),
632 index);
633 }
634
635 /* Execute the vertex shader for each vertex to get vertex positions. */
636 LLVMValueRef pos[3][4];
637 for (unsigned i = 0; i < vertices_per_prim; i++) {
638 vs_params[param_vertex_id] = index[i];
639 vs_params[param_instance_id] = instance_id;
640
641 LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params);
642 for (unsigned chan = 0; chan < 4; chan++)
643 pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, "");
644 }
645
646 /* Divide XYZ by W. */
647 for (unsigned i = 0; i < vertices_per_prim; i++) {
648 for (unsigned chan = 0; chan < 3; chan++)
649 pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]);
650 }
651
652 /* Load the viewport state. */
653 LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants,
654 LLVMConstInt(ctx->ac.i32, 2, 0));
655 vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
656 LLVMValueRef vp_scale[2], vp_translate[2];
657 vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
658 vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
659 vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
660 vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
661
662 /* Do culling. */
663 struct ac_cull_options options = {};
664 options.cull_front = key->opt.cs_cull_front;
665 options.cull_back = key->opt.cs_cull_back;
666 options.cull_view_xy = true;
667 options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z;
668 options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z;
669 options.cull_small_prims = true;
670 options.cull_zero_area = true;
671 options.cull_w = true;
672 options.use_halfz_clip_space = key->opt.cs_halfz_clip_space;
673
674 LLVMValueRef accepted =
675 ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, vp_scale, vp_translate,
676 ac_get_arg(&ctx->ac, param_smallprim_precision), &options);
677
678 ac_build_optimization_barrier(&ctx->ac, &accepted);
679 LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);
680
681 /* Count the number of active threads by doing bitcount(accepted). */
682 LLVMValueRef num_prims_accepted = ac_build_intrinsic(
683 &ctx->ac, "llvm.ctpop.i64", ctx->ac.i64, &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE);
684 num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, "");
685
686 LLVMValueRef start;
687
688 /* Execute atomic_add on the vertex count. */
689 struct si_thread0_section section;
690 si_enter_thread0_section(ctx, §ion, thread_id);
691 {
692 if (VERTEX_COUNTER_GDS_MODE == 0) {
693 LLVMValueRef num_indices = LLVMBuildMul(
694 builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
695 vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
696 start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices,
697 LLVMAtomicOrderingMonotonic, false);
698 } else if (VERTEX_COUNTER_GDS_MODE == 1) {
699 LLVMValueRef num_indices = LLVMBuildMul(
700 builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
701 vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter,
702 LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), "");
703 start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices,
704 LLVMAtomicOrderingMonotonic, false);
705 } else if (VERTEX_COUNTER_GDS_MODE == 2) {
706 LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
707
708 /* If the draw call was split into multiple subdraws, each using
709 * a separate draw packet, we need to start counting from 0 for
710 * the first compute wave of the subdraw.
711 *
712 * vertex_counter contains the primitive ID of the first thread
713 * in the first wave.
714 *
715 * This is only correct with VERTEX_COUNTER_GDS_MODE == 2:
716 */
717 LLVMValueRef is_first_wave =
718 LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, vertex_counter, "");
719
720 /* Store the primitive count for ordered append, not vertex count.
721 * The idea is to avoid GDS initialization via CP DMA. The shader
722 * effectively stores the first count using "swap".
723 *
724 * if (first_wave) {
725 * ds.ordered.swap(num_prims_accepted); // store the first primitive count
726 * previous = 0;
727 * } else {
728 * previous = ds.ordered.add(num_prims_accepted) // add the primitive count
729 * }
730 */
731 ac_build_ifcc(&ctx->ac, is_first_wave, 12604);
732 {
733 /* The GDS address is always 0 with ordered append. */
734 si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, num_prims_accepted, 0, true, true);
735 LLVMBuildStore(builder, ctx->ac.i32_0, tmp_store);
736 }
737 ac_build_else(&ctx->ac, 12605);
738 {
739 LLVMBuildStore(builder,
740 si_build_ds_ordered_op(ctx, "add", ordered_wave_id, num_prims_accepted,
741 0, true, true),
742 tmp_store);
743 }
744 ac_build_endif(&ctx->ac, 12604);
745
746 start = LLVMBuildLoad(builder, tmp_store, "");
747 }
748 }
749 si_exit_thread0_section(§ion, &start);
750
751 /* Write the final vertex count to memory. An EOS/EOP event could do this,
752 * but those events are super slow and should be avoided if performance
753 * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE
754 * event like this.
755 */
756 if (VERTEX_COUNTER_GDS_MODE == 2) {
757 ac_build_ifcc(&ctx->ac,
758 LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
759 ac_get_arg(&ctx->ac, param_last_wave_prim_id), ""),
760 12606);
761 LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, "");
762 count = LLVMBuildMul(builder, count, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
763
764 /* GFX8 needs to disable caching, so that the CP can see the stored value.
765 * MTYPE=3 bypasses TC L2.
766 */
767 if (ctx->screen->info.chip_class <= GFX8) {
768 LLVMValueRef desc[] = {
769 ac_get_arg(&ctx->ac, param_vertex_count_addr),
770 LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0),
771 LLVMConstInt(ctx->ac.i32, 4, 0),
772 LLVMConstInt(
773 ctx->ac.i32,
774 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | S_008F0C_MTYPE(3 /* uncached */),
775 0),
776 };
777 LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4);
778 ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->ac.i32_0, ctx->ac.i32_0, 0,
779 ac_glc | ac_slc);
780 } else {
781 LLVMBuildStore(
782 builder, count,
783 si_expand_32bit_pointer(ctx, ac_get_arg(&ctx->ac, param_vertex_count_addr)));
784 }
785 ac_build_endif(&ctx->ac, 12606);
786 } else {
787 /* For unordered modes that increment a vertex count instead of
788 * primitive count, convert it into the primitive index.
789 */
790 start = LLVMBuildUDiv(builder, start, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
791 }
792
793 /* Now we need to store the indices of accepted primitives into
794 * the output index buffer.
795 */
796 ac_build_ifcc(&ctx->ac, accepted, 16607);
797 {
798 /* Get the number of bits set before the index of this thread. */
799 LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask);
800
801 /* We have lowered instancing. Pack the instance ID into vertex ID. */
802 if (key->opt.cs_instancing) {
803 instance_id = LLVMBuildShl(builder, instance_id, LLVMConstInt(ctx->ac.i32, 16, 0), "");
804
805 for (unsigned i = 0; i < vertices_per_prim; i++)
806 index[i] = LLVMBuildOr(builder, index[i], instance_id, "");
807 }
808
809 if (VERTEX_COUNTER_GDS_MODE == 2) {
810 /* vertex_counter contains the first primitive ID
811 * for this dispatch. If the draw call was split into
812 * multiple subdraws, the first primitive ID is > 0
813 * for subsequent subdraws. Each subdraw uses a different
814 * portion of the output index buffer. Offset the store
815 * vindex by the first primitive ID to get the correct
816 * store address for the subdraw.
817 */
818 start = LLVMBuildAdd(builder, start, vertex_counter, "");
819 }
820
821 /* Write indices for accepted primitives. */
822 LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, "");
823 LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3);
824
825 if (!ac_has_vec3_support(ctx->ac.chip_class, true))
826 vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3);
827
828 ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, vindex, ctx->ac.i32_0,
829 ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0));
830 }
831 ac_build_endif(&ctx->ac, 16607);
832
833 LLVMBuildRetVoid(builder);
834 }
835
836 /* Return false if the shader isn't ready. */
si_shader_select_prim_discard_cs(struct si_context * sctx,const struct pipe_draw_info * info,bool primitive_restart)837 static bool si_shader_select_prim_discard_cs(struct si_context *sctx,
838 const struct pipe_draw_info *info,
839 bool primitive_restart)
840 {
841 struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
842 struct si_shader_key key;
843
844 /* Primitive restart needs ordered counters. */
845 assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2);
846 assert(!primitive_restart || info->instance_count == 1);
847
848 memset(&key, 0, sizeof(key));
849 si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog);
850 assert(!key.part.vs.prolog.instance_divisor_is_fetched);
851
852 key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0;
853 key.opt.vs_as_prim_discard_cs = 1;
854 key.opt.cs_prim_type = info->mode;
855 key.opt.cs_indexed = info->index_size != 0;
856 key.opt.cs_instancing = info->instance_count > 1;
857 key.opt.cs_primitive_restart = primitive_restart;
858 key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first;
859
860 /* Primitive restart with triangle strips needs to preserve primitive
861 * orientation for cases where front and back primitive orientation matters.
862 */
863 if (primitive_restart) {
864 struct si_shader_selector *ps = sctx->ps_shader.cso;
865
866 key.opt.cs_need_correct_orientation = rs->cull_front != rs->cull_back ||
867 ps->info.uses_frontface ||
868 (rs->two_side && ps->info.colors_read);
869 }
870
871 if (rs->rasterizer_discard) {
872 /* Just for performance testing and analysis of trivial bottlenecks.
873 * This should result in a very short compute shader. */
874 key.opt.cs_cull_front = 1;
875 key.opt.cs_cull_back = 1;
876 } else {
877 key.opt.cs_cull_front = sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front;
878 key.opt.cs_cull_back = sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back;
879 }
880
881 if (!rs->depth_clamp_any && CULL_Z) {
882 key.opt.cs_cull_z = 1;
883 key.opt.cs_halfz_clip_space = rs->clip_halfz;
884 }
885
886 sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso;
887 sctx->cs_prim_discard_state.current = NULL;
888
889 if (!sctx->compiler.passes)
890 si_init_compiler(sctx->screen, &sctx->compiler);
891
892 struct si_compiler_ctx_state compiler_state;
893 compiler_state.compiler = &sctx->compiler;
894 compiler_state.debug = sctx->debug;
895 compiler_state.is_debug_context = sctx->is_debug;
896
897 return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, &compiler_state,
898 &key, -1, true) == 0 &&
899 /* Disallow compute shaders using the scratch buffer. */
900 sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0;
901 }
902
si_initialize_prim_discard_cmdbuf(struct si_context * sctx)903 static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx)
904 {
905 if (sctx->index_ring)
906 return true;
907
908 if (!sctx->prim_discard_compute_cs) {
909 struct radeon_winsys *ws = sctx->ws;
910 unsigned gds_size =
911 VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED : VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0;
912 unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 2 : 0;
913
914 if (gds_size) {
915 sctx->gds = ws->buffer_create(ws, gds_size, 4, RADEON_DOMAIN_GDS,
916 RADEON_FLAG_DRIVER_INTERNAL);
917 if (!sctx->gds)
918 return false;
919
920 ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, RADEON_USAGE_READWRITE, 0, 0);
921 }
922 if (num_oa_counters) {
923 assert(gds_size);
924 sctx->gds_oa = ws->buffer_create(ws, num_oa_counters, 1, RADEON_DOMAIN_OA,
925 RADEON_FLAG_DRIVER_INTERNAL);
926 if (!sctx->gds_oa)
927 return false;
928
929 ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, RADEON_USAGE_READWRITE, 0, 0);
930 }
931
932 sctx->prim_discard_compute_cs =
933 ws->cs_add_parallel_compute_ib(sctx->gfx_cs, num_oa_counters > 0);
934 if (!sctx->prim_discard_compute_cs)
935 return false;
936 }
937
938 if (!sctx->index_ring) {
939 sctx->index_ring = si_aligned_buffer_create(
940 sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL,
941 PIPE_USAGE_DEFAULT,
942 sctx->index_ring_size_per_ib * 2, sctx->screen->info.pte_fragment_size);
943 if (!sctx->index_ring)
944 return false;
945 }
946 return true;
947 }
948
si_check_ring_space(struct si_context * sctx,unsigned out_indexbuf_size)949 static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size)
950 {
951 return sctx->index_ring_offset +
952 align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <=
953 sctx->index_ring_size_per_ib;
954 }
955
956 enum si_prim_discard_outcome
si_prepare_prim_discard_or_split_draw(struct si_context * sctx,const struct pipe_draw_info * info,const struct pipe_draw_start_count * draws,unsigned num_draws,bool primitive_restart,unsigned total_count)957 si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info,
958 const struct pipe_draw_start_count *draws,
959 unsigned num_draws, bool primitive_restart,
960 unsigned total_count)
961 {
962 /* If the compute shader compilation isn't finished, this returns false. */
963 if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart))
964 return SI_PRIM_DISCARD_DISABLED;
965
966 if (!si_initialize_prim_discard_cmdbuf(sctx))
967 return SI_PRIM_DISCARD_DISABLED;
968
969 struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
970 unsigned prim = info->mode;
971 unsigned count = total_count;
972 unsigned instance_count = info->instance_count;
973 unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count);
974 unsigned num_prims = num_prims_per_instance * instance_count;
975 unsigned out_indexbuf_size = num_prims * 12;
976 bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size);
977 const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL;
978
979 /* Split draws at the draw call level if the ring is full. This makes
980 * better use of the ring space.
981 */
982 if (ring_full && num_prims > split_prims_draw_level &&
983 instance_count == 1 && /* TODO: support splitting instanced draws */
984 (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP))) {
985 unsigned vert_count_per_subdraw = 0;
986
987 if (prim == PIPE_PRIM_TRIANGLES)
988 vert_count_per_subdraw = split_prims_draw_level * 3;
989 else if (prim == PIPE_PRIM_TRIANGLE_STRIP)
990 vert_count_per_subdraw = split_prims_draw_level;
991 else
992 unreachable("shouldn't get here");
993
994 /* Split multi draws first. */
995 if (num_draws > 1) {
996 unsigned count = 0;
997 unsigned first_draw = 0;
998 unsigned num_draws_split = 0;
999
1000 for (unsigned i = 0; i < num_draws; i++) {
1001 if (count && count + draws[i].count > vert_count_per_subdraw) {
1002 /* Submit previous draws. */
1003 sctx->b.multi_draw(&sctx->b, info, draws + first_draw, num_draws_split);
1004 count = 0;
1005 first_draw = i;
1006 num_draws_split = 0;
1007 }
1008
1009 if (draws[i].count > vert_count_per_subdraw) {
1010 /* Submit just 1 draw. It will be split. */
1011 sctx->b.multi_draw(&sctx->b, info, draws + i, 1);
1012 assert(count == 0);
1013 assert(first_draw == i);
1014 assert(num_draws_split == 0);
1015 first_draw = i + 1;
1016 continue;
1017 }
1018
1019 count += draws[i].count;
1020 num_draws_split++;
1021 }
1022 return SI_PRIM_DISCARD_MULTI_DRAW_SPLIT;
1023 }
1024
1025 /* Split single draws if splitting multi draws isn't enough. */
1026 struct pipe_draw_info split_draw = *info;
1027 struct pipe_draw_start_count split_draw_range = draws[0];
1028 unsigned base_start = split_draw_range.start;
1029
1030 split_draw.primitive_restart = primitive_restart;
1031
1032 if (prim == PIPE_PRIM_TRIANGLES) {
1033 assert(vert_count_per_subdraw < count);
1034
1035 for (unsigned start = 0; start < count; start += vert_count_per_subdraw) {
1036 split_draw_range.start = base_start + start;
1037 split_draw_range.count = MIN2(count - start, vert_count_per_subdraw);
1038
1039 sctx->b.multi_draw(&sctx->b, &split_draw, &split_draw_range, 1);
1040 }
1041 } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) {
1042 /* No primitive pair can be split, because strips reverse orientation
1043 * for odd primitives. */
1044 STATIC_ASSERT(split_prims_draw_level % 2 == 0);
1045
1046 for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) {
1047 split_draw_range.start = base_start + start;
1048 split_draw_range.count = MIN2(count - start, vert_count_per_subdraw + 2);
1049
1050 sctx->b.multi_draw(&sctx->b, &split_draw, &split_draw_range, 1);
1051
1052 if (start == 0 && primitive_restart &&
1053 sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation)
1054 sctx->preserve_prim_restart_gds_at_flush = true;
1055 }
1056 sctx->preserve_prim_restart_gds_at_flush = false;
1057 }
1058
1059 return SI_PRIM_DISCARD_DRAW_SPLIT;
1060 }
1061
1062 /* Just quit if the draw call doesn't fit into the ring and can't be split. */
1063 if (out_indexbuf_size > sctx->index_ring_size_per_ib) {
1064 if (SI_PRIM_DISCARD_DEBUG)
1065 puts("PD failed: draw call too big, can't be split");
1066 return SI_PRIM_DISCARD_DISABLED;
1067 }
1068
1069 unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL) * num_draws;
1070 unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ +
1071 24 * (num_subdraws - 1) + /* subdraws */
1072 30; /* leave some space at the end */
1073 unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx, 0);
1074
1075 if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION)
1076 need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */
1077 else
1078 need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
1079
1080 if (ring_full ||
1081 (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) ||
1082 !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) {
1083 /* If the current IB is empty but the size is too small, add a NOP
1084 * packet to force a flush and get a bigger IB.
1085 */
1086 if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) &&
1087 gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) {
1088 radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
1089 radeon_emit(gfx_cs, 0);
1090 }
1091
1092 si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
1093 }
1094
1095 /* The compute IB is always chained, but we need to call cs_check_space to add more space. */
1096 struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
1097 ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false);
1098 assert(compute_has_space);
1099 assert(si_check_ring_space(sctx, out_indexbuf_size));
1100 return SI_PRIM_DISCARD_ENABLED;
1101 }
1102
si_compute_signal_gfx(struct si_context * sctx)1103 void si_compute_signal_gfx(struct si_context *sctx)
1104 {
1105 struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
1106 unsigned writeback_L2_flags = 0;
1107
1108 /* The writeback L2 flags vary with each chip generation. */
1109 /* CI needs to flush vertex indices to memory. */
1110 if (sctx->chip_class <= GFX7)
1111 writeback_L2_flags = EVENT_TC_WB_ACTION_ENA;
1112 else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0)
1113 writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA;
1114
1115 if (!sctx->compute_num_prims_in_batch)
1116 return;
1117
1118 assert(sctx->compute_rewind_va);
1119
1120 /* After the queued dispatches are done and vertex counts are written to
1121 * the gfx IB, signal the gfx IB to continue. CP doesn't wait for
1122 * the dispatches to finish, it only adds the CS_DONE event into the event
1123 * queue.
1124 */
1125 si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags,
1126 sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
1127 writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : EOP_INT_SEL_NONE,
1128 EOP_DATA_SEL_VALUE_32BIT, NULL,
1129 sctx->compute_rewind_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
1130 REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */
1131 SI_NOT_QUERY);
1132
1133 sctx->compute_rewind_va = 0;
1134 sctx->compute_num_prims_in_batch = 0;
1135 }
1136
1137 /* Dispatch a primitive discard compute shader. */
si_dispatch_prim_discard_cs_and_draw(struct si_context * sctx,const struct pipe_draw_info * info,unsigned count,unsigned index_size,unsigned base_vertex,uint64_t input_indexbuf_va,unsigned input_indexbuf_num_elements)1138 void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
1139 const struct pipe_draw_info *info,
1140 unsigned count, unsigned index_size,
1141 unsigned base_vertex, uint64_t input_indexbuf_va,
1142 unsigned input_indexbuf_num_elements)
1143 {
1144 struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
1145 struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
1146 unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, count);
1147 if (!num_prims_per_instance)
1148 return;
1149
1150 unsigned num_prims = num_prims_per_instance * info->instance_count;
1151 unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format;
1152
1153 switch (info->mode) {
1154 case PIPE_PRIM_TRIANGLES:
1155 case PIPE_PRIM_TRIANGLE_STRIP:
1156 case PIPE_PRIM_TRIANGLE_FAN:
1157 vertices_per_prim = 3;
1158 output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32;
1159 gfx10_output_indexbuf_format = V_008F0C_IMG_FORMAT_32_32_32_UINT;
1160 break;
1161 default:
1162 unreachable("unsupported primitive type");
1163 return;
1164 }
1165
1166 unsigned out_indexbuf_offset;
1167 uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4;
1168 bool first_dispatch = !sctx->prim_discard_compute_ib_initialized;
1169
1170 /* Initialize the compute IB if it's empty. */
1171 if (!sctx->prim_discard_compute_ib_initialized) {
1172 /* 1) State initialization. */
1173 sctx->compute_gds_offset = 0;
1174 sctx->compute_ib_last_shader = NULL;
1175
1176 if (sctx->last_ib_barrier_fence) {
1177 assert(!sctx->last_ib_barrier_buf);
1178 sctx->ws->cs_add_fence_dependency(gfx_cs, sctx->last_ib_barrier_fence,
1179 RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY);
1180 }
1181
1182 /* 2) IB initialization. */
1183
1184 /* This needs to be done at the beginning of IBs due to possible
1185 * TTM buffer moves in the kernel.
1186 */
1187 if (sctx->chip_class >= GFX10) {
1188 radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
1189 radeon_emit(cs, 0); /* CP_COHER_CNTL */
1190 radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
1191 radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */
1192 radeon_emit(cs, 0); /* CP_COHER_BASE */
1193 radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
1194 radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
1195 radeon_emit(cs, /* GCR_CNTL */
1196 S_586_GLI_INV(V_586_GLI_ALL) | S_586_GLK_INV(1) | S_586_GLV_INV(1) |
1197 S_586_GL1_INV(1) | S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) |
1198 S_586_GLM_WB(1) | S_586_SEQ(V_586_SEQ_FORWARD));
1199 } else {
1200 si_emit_surface_sync(sctx, cs,
1201 S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
1202 S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) |
1203 S_0085F0_SH_ICACHE_ACTION_ENA(1) |
1204 S_0085F0_SH_KCACHE_ACTION_ENA(1));
1205 }
1206
1207 /* Restore the GDS prim restart counter if needed. */
1208 if (sctx->preserve_prim_restart_gds_at_flush) {
1209 si_cp_copy_data(sctx, cs, COPY_DATA_GDS, NULL, 4, COPY_DATA_SRC_MEM,
1210 sctx->wait_mem_scratch, 4);
1211 }
1212
1213 si_emit_initial_compute_regs(sctx, cs);
1214
1215 radeon_set_sh_reg(
1216 cs, R_00B860_COMPUTE_TMPRING_SIZE,
1217 S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(0)); /* no scratch */
1218
1219 /* Only 1D grids are launched. */
1220 radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2);
1221 radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | S_00B820_NUM_THREAD_PARTIAL(1));
1222 radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | S_00B824_NUM_THREAD_PARTIAL(1));
1223
1224 radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2);
1225 radeon_emit(cs, 0);
1226 radeon_emit(cs, 0);
1227
1228 /* Disable ordered alloc for OA resources. */
1229 for (unsigned i = 0; i < 2; i++) {
1230 radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3);
1231 radeon_emit(cs, S_031074_INDEX(i));
1232 radeon_emit(cs, 0);
1233 radeon_emit(cs, S_03107C_ENABLE(0));
1234 }
1235
1236 if (sctx->last_ib_barrier_buf) {
1237 assert(!sctx->last_ib_barrier_fence);
1238 radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, RADEON_USAGE_READ,
1239 RADEON_PRIO_FENCE);
1240 si_cp_wait_mem(sctx, cs,
1241 sctx->last_ib_barrier_buf->gpu_address + sctx->last_ib_barrier_buf_offset,
1242 1, 1, WAIT_REG_MEM_EQUAL);
1243 }
1244
1245 sctx->prim_discard_compute_ib_initialized = true;
1246 }
1247
1248 /* Allocate the output index buffer. */
1249 output_indexbuf_size = align(output_indexbuf_size, sctx->screen->info.tcc_cache_line_size);
1250 assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib);
1251 out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset;
1252 sctx->index_ring_offset += output_indexbuf_size;
1253
1254 radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE,
1255 RADEON_PRIO_SHADER_RW_BUFFER);
1256 uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset;
1257
1258 /* Prepare index buffer descriptors. */
1259 struct si_resource *indexbuf_desc = NULL;
1260 unsigned indexbuf_desc_offset;
1261 unsigned desc_size = 12 * 4;
1262 uint32_t *desc;
1263
1264 u_upload_alloc(sctx->b.const_uploader, 0, desc_size, si_optimal_tcc_alignment(sctx, desc_size),
1265 &indexbuf_desc_offset, (struct pipe_resource **)&indexbuf_desc, (void **)&desc);
1266 radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ,
1267 RADEON_PRIO_DESCRIPTORS);
1268
1269 /* Input index buffer. */
1270 desc[0] = input_indexbuf_va;
1271 desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | S_008F04_STRIDE(index_size);
1272 desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1);
1273
1274 if (sctx->chip_class >= GFX10) {
1275 desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1276 S_008F0C_FORMAT(index_size == 1 ? V_008F0C_IMG_FORMAT_8_UINT
1277 : index_size == 2 ? V_008F0C_IMG_FORMAT_16_UINT
1278 : V_008F0C_IMG_FORMAT_32_UINT) |
1279 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
1280 S_008F0C_RESOURCE_LEVEL(1);
1281 } else {
1282 desc[3] =
1283 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
1284 S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8
1285 : index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16
1286 : V_008F0C_BUF_DATA_FORMAT_32);
1287 }
1288
1289 /* Output index buffer. */
1290 desc[4] = out_indexbuf_va;
1291 desc[5] =
1292 S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | S_008F04_STRIDE(vertices_per_prim * 4);
1293 desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1);
1294
1295 if (sctx->chip_class >= GFX10) {
1296 desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1297 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
1298 S_008F0C_FORMAT(gfx10_output_indexbuf_format) |
1299 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
1300 S_008F0C_RESOURCE_LEVEL(1);
1301 } else {
1302 desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1303 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
1304 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
1305 S_008F0C_DATA_FORMAT(output_indexbuf_format);
1306 }
1307
1308 /* Viewport state. */
1309 struct si_small_prim_cull_info cull_info;
1310 si_get_small_prim_cull_info(sctx, &cull_info);
1311
1312 desc[8] = fui(cull_info.scale[0]);
1313 desc[9] = fui(cull_info.scale[1]);
1314 desc[10] = fui(cull_info.translate[0]);
1315 desc[11] = fui(cull_info.translate[1]);
1316
1317 /* Set user data SGPRs. */
1318 /* This can't be greater than 14 if we want the fastest launch rate. */
1319 unsigned user_sgprs = 13;
1320
1321 uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset;
1322 unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX);
1323 unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX);
1324 uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address;
1325 uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address;
1326 uint64_t vb_desc_va = sctx->vb_descriptors_buffer
1327 ? sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset
1328 : 0;
1329 unsigned gds_offset, gds_size;
1330 struct si_fast_udiv_info32 num_prims_udiv = {};
1331
1332 if (info->instance_count > 1)
1333 num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31);
1334
1335 /* Limitations on how these two are packed in the user SGPR. */
1336 assert(num_prims_udiv.post_shift < 32);
1337 assert(num_prims_per_instance < 1 << 27);
1338
1339 si_resource_reference(&indexbuf_desc, NULL);
1340
1341 bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart;
1342
1343 if (VERTEX_COUNTER_GDS_MODE == 1) {
1344 gds_offset = sctx->compute_gds_offset;
1345 gds_size = primitive_restart ? 8 : 4;
1346 sctx->compute_gds_offset += gds_size;
1347
1348 /* Reset the counters in GDS for the first dispatch using WRITE_DATA.
1349 * The remainder of the GDS will be cleared after the dispatch packet
1350 * in parallel with compute shaders.
1351 */
1352 if (first_dispatch) {
1353 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size / 4, 0));
1354 radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1));
1355 radeon_emit(cs, gds_offset);
1356 radeon_emit(cs, 0);
1357 radeon_emit(cs, 0); /* value to write */
1358 if (gds_size == 8)
1359 radeon_emit(cs, 0);
1360 }
1361 }
1362
1363 /* Set shader registers. */
1364 struct si_shader *shader = sctx->cs_prim_discard_state.current;
1365
1366 if (shader != sctx->compute_ib_last_shader) {
1367 radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ,
1368 RADEON_PRIO_SHADER_BINARY);
1369 uint64_t shader_va = shader->bo->gpu_address;
1370
1371 assert(shader->config.scratch_bytes_per_wave == 0);
1372 assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4);
1373
1374 radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
1375 radeon_emit(cs, shader_va >> 8);
1376 radeon_emit(cs, S_00B834_DATA(shader_va >> 40));
1377
1378 radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
1379 radeon_emit(
1380 cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
1381 S_00B848_SGPRS(sctx->chip_class <= GFX9 ? (shader->config.num_sgprs - 1) / 8 : 0) |
1382 S_00B848_FLOAT_MODE(shader->config.float_mode) | S_00B848_DX10_CLAMP(1) |
1383 S_00B848_MEM_ORDERED(sctx->chip_class >= GFX10) |
1384 S_00B848_WGP_MODE(sctx->chip_class >= GFX10));
1385 radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | S_00B84C_USER_SGPR(user_sgprs) |
1386 S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) |
1387 S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) |
1388 S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) |
1389 S_00B84C_LDS_SIZE(shader->config.lds_size));
1390
1391 radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
1392 ac_get_compute_resource_limits(&sctx->screen->info, WAVES_PER_TG,
1393 MAX_WAVES_PER_SH, THREADGROUPS_PER_CU));
1394 sctx->compute_ib_last_shader = shader;
1395 }
1396
1397 STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0);
1398
1399 /* Big draw calls are split into smaller dispatches and draw packets. */
1400 for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) {
1401 unsigned num_subdraw_prims;
1402
1403 if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims)
1404 num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL;
1405 else
1406 num_subdraw_prims = num_prims - start_prim;
1407
1408 /* Small dispatches are executed back to back until a specific primitive
1409 * count is reached. Then, a CS_DONE is inserted to signal the gfx IB
1410 * to start drawing the batch. This batching adds latency to the gfx IB,
1411 * but CS_DONE and REWIND are too slow.
1412 */
1413 if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH)
1414 si_compute_signal_gfx(sctx);
1415
1416 if (sctx->compute_num_prims_in_batch == 0) {
1417 assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
1418 sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;
1419
1420 if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) {
1421 radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
1422 radeon_emit(gfx_cs, 0);
1423
1424 si_cp_wait_mem(
1425 sctx, gfx_cs,
1426 sctx->compute_rewind_va | (uint64_t)sctx->screen->info.address32_hi << 32,
1427 REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP);
1428
1429 /* Use INDIRECT_BUFFER to chain to a different buffer
1430 * to discard the CP prefetch cache.
1431 */
1432 sctx->ws->cs_check_space(gfx_cs, 0, true);
1433 } else {
1434 radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
1435 radeon_emit(gfx_cs, 0);
1436 }
1437 }
1438
1439 sctx->compute_num_prims_in_batch += num_subdraw_prims;
1440
1441 uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
1442 uint64_t index_va = out_indexbuf_va + start_prim * 12;
1443
1444 /* Emit the draw packet into the gfx IB. */
1445 radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
1446 radeon_emit(gfx_cs, num_prims * vertices_per_prim);
1447 radeon_emit(gfx_cs, index_va);
1448 radeon_emit(gfx_cs, index_va >> 32);
1449 radeon_emit(gfx_cs, 0);
1450 radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);
1451
1452 /* Continue with the compute IB. */
1453 if (start_prim == 0) {
1454 uint32_t gds_prim_restart_continue_bit = 0;
1455
1456 if (sctx->preserve_prim_restart_gds_at_flush) {
1457 assert(primitive_restart && info->mode == PIPE_PRIM_TRIANGLE_STRIP);
1458 assert(start_prim < 1 << 31);
1459 gds_prim_restart_continue_bit = 1 << 31;
1460 }
1461
1462 radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs);
1463 radeon_emit(cs, index_buffers_va);
1464 radeon_emit(cs, VERTEX_COUNTER_GDS_MODE == 0
1465 ? count_va
1466 : VERTEX_COUNTER_GDS_MODE == 1
1467 ? gds_offset
1468 : start_prim | gds_prim_restart_continue_bit);
1469 radeon_emit(cs, start_prim + num_subdraw_prims - 1);
1470 radeon_emit(cs, count_va);
1471 radeon_emit(cs, vb_desc_va);
1472 radeon_emit(cs, vs_const_desc_va);
1473 radeon_emit(cs, vs_sampler_desc_va);
1474 radeon_emit(cs, base_vertex);
1475 radeon_emit(cs, info->start_instance);
1476 radeon_emit(cs, num_prims_udiv.multiplier);
1477 radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
1478 radeon_emit(cs, info->restart_index);
1479 /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
1480 radeon_emit(cs, fui(cull_info.small_prim_precision));
1481 } else {
1482 assert(VERTEX_COUNTER_GDS_MODE == 2);
1483 /* Only update the SGPRs that changed. */
1484 radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3);
1485 radeon_emit(cs, start_prim);
1486 radeon_emit(cs, start_prim + num_subdraw_prims - 1);
1487 radeon_emit(cs, count_va);
1488 }
1489
1490 /* Set grid dimensions. */
1491 unsigned start_block = start_prim / THREADGROUP_SIZE;
1492 unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
1493 unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;
1494
1495 radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block);
1496 radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X,
1497 S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) |
1498 S_00B81C_NUM_THREAD_PARTIAL(partial_block_size));
1499
1500 radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1));
1501 radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size);
1502 radeon_emit(cs, 1);
1503 radeon_emit(cs, 1);
1504 radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
1505 S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) |
1506 S_00B800_ORDER_MODE(0 /* launch in order */));
1507
1508 /* This is only for unordered append. Ordered append writes this from
1509 * the shader.
1510 *
1511 * Note that EOP and EOS events are super slow, so emulating the event
1512 * in a shader is an important optimization.
1513 */
1514 if (VERTEX_COUNTER_GDS_MODE == 1) {
1515 si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0,
1516 sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
1517 EOP_INT_SEL_NONE, EOP_DATA_SEL_GDS, NULL,
1518 count_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
1519 EOP_DATA_GDS(gds_offset / 4, 1), SI_NOT_QUERY);
1520
1521 /* Now that compute shaders are running, clear the remainder of GDS. */
1522 if (first_dispatch) {
1523 unsigned offset = gds_offset + gds_size;
1524 si_cp_dma_clear_buffer(
1525 sctx, cs, NULL, offset, GDS_SIZE_UNORDERED - offset, 0,
1526 SI_CPDMA_SKIP_CHECK_CS_SPACE | SI_CPDMA_SKIP_GFX_SYNC | SI_CPDMA_SKIP_SYNC_BEFORE,
1527 SI_COHERENCY_NONE, L2_BYPASS);
1528 }
1529 }
1530 first_dispatch = false;
1531
1532 assert(cs->current.cdw <= cs->current.max_dw);
1533 assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw);
1534 }
1535 }
1536