/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
24
25 #include "si_pipe.h"
26 #include "si_shader_internal.h"
27 #include "sid.h"
28 #include "util/u_memory.h"
29
si_is_es_thread(struct si_shader_context * ctx)30 LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
31 {
32 /* Return true if the current thread should execute an ES thread. */
33 return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
34 si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), "");
35 }
36
si_is_gs_thread(struct si_shader_context * ctx)37 LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
38 {
39 /* Return true if the current thread should execute a GS thread. */
40 return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
41 si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), "");
42 }
43
si_llvm_load_input_gs(struct ac_shader_abi * abi,unsigned input_index,unsigned vtx_offset_param,LLVMTypeRef type,unsigned swizzle)44 static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned input_index,
45 unsigned vtx_offset_param, LLVMTypeRef type,
46 unsigned swizzle)
47 {
48 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
49 struct si_shader *shader = ctx->shader;
50 LLVMValueRef vtx_offset, soffset;
51 struct si_shader_info *info = &shader->selector->info;
52 unsigned param;
53 LLVMValueRef value;
54
55 param = si_shader_io_get_unique_index(info->input_semantic[input_index], false);
56
57 /* GFX9 has the ESGS ring in LDS. */
58 if (ctx->screen->info.chip_class >= GFX9) {
59 unsigned index = vtx_offset_param;
60
61 switch (index / 2) {
62 case 0:
63 vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset, index % 2 ? 16 : 0, 16);
64 break;
65 case 1:
66 vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset, index % 2 ? 16 : 0, 16);
67 break;
68 case 2:
69 vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset, index % 2 ? 16 : 0, 16);
70 break;
71 default:
72 assert(0);
73 return NULL;
74 }
75
76 unsigned offset = param * 4 + swizzle;
77 vtx_offset =
78 LLVMBuildAdd(ctx->ac.builder, vtx_offset, LLVMConstInt(ctx->ac.i32, offset, false), "");
79
80 LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset);
81 LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, "");
82 return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
83 }
84
85 /* GFX6: input load from the ESGS ring in memory. */
86 /* Get the vertex offset parameter on GFX6. */
87 LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac, ctx->gs_vtx_offset[vtx_offset_param]);
88
89 vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
90
91 soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0);
92
93 value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset, 0,
94 ac_glc, true, false);
95 return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
96 }
97
si_nir_load_input_gs(struct ac_shader_abi * abi,unsigned driver_location,unsigned component,unsigned num_components,unsigned vertex_index,LLVMTypeRef type)98 static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi,
99 unsigned driver_location, unsigned component,
100 unsigned num_components, unsigned vertex_index,
101 LLVMTypeRef type)
102 {
103 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
104
105 LLVMValueRef value[4];
106 for (unsigned i = component; i < component + num_components; i++) {
107 value[i] = si_llvm_load_input_gs(&ctx->abi, driver_location,
108 vertex_index, type, i);
109 }
110
111 return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
112 }
113
114 /* Pass GS inputs from ES to GS on GFX9. */
si_set_es_return_value_for_gs(struct si_shader_context * ctx)115 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
116 {
117 LLVMValueRef ret = ctx->return_value;
118
119 ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
120 ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
121 if (ctx->shader->key.as_ngg)
122 ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2);
123 else
124 ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2);
125 ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
126 ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);
127
128 ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS);
129 ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
130 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
131 if (ctx->screen->use_ngg) {
132 ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
133 }
134
135 unsigned vgpr;
136 if (ctx->stage == MESA_SHADER_VERTEX)
137 vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
138 else
139 vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
140
141 ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++);
142 ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++);
143 ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
144 ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
145 ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++);
146 ctx->return_value = ret;
147 }
148
/* ES epilogue: write all ES outputs to the ESGS ring (LDS on GFX9+, a memory
 * ring buffer before that), then set up the ES->GS return value on GFX9+. */
void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader *es = ctx->shader;
   struct si_shader_info *info = &es->selector->info;
   LLVMValueRef lds_base = NULL;

   if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
      /* LDS base (in dwords) = (wave_idx * wave_size | thread_id) * itemsize. */
      unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
      LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
      LLVMValueRef wave_base =
         LLVMBuildMul(ctx->ac.builder, wave_idx,
                      LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
      LLVMValueRef vertex_idx =
         LLVMBuildOr(ctx->ac.builder, ac_get_thread_id(&ctx->ac), wave_base, "");
      lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
                              LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
   }

   for (int i = 0; i < info->num_outputs; i++) {
      /* These outputs are not forwarded through the ESGS ring. */
      if (info->output_semantic[i] == VARYING_SLOT_VIEWPORT ||
          info->output_semantic[i] == VARYING_SLOT_LAYER)
         continue;

      int param = si_shader_io_get_unique_index(info->output_semantic[i], false);

      for (unsigned chan = 0; chan < 4; chan++) {
         if (!(info->output_usagemask[i] & (1 << chan)))
            continue;

         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
         out_val = ac_to_integer(&ctx->ac, out_val);

         if (ctx->screen->info.chip_class >= GFX9) {
            /* GFX9 has the ESGS ring in LDS. */
            LLVMValueRef idx = LLVMConstInt(ctx->ac.i32, param * 4 + chan, false);
            idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, "");
            ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val);
         } else {
            /* GFX6-8: swizzled store to the ESGS ring buffer in memory. */
            ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, out_val, 1, NULL,
                                        ac_get_arg(&ctx->ac, ctx->es2gs_offset),
                                        (4 * param + chan) * 4, ac_glc | ac_slc | ac_swizzled);
         }
      }
   }

   if (ctx->screen->info.chip_class >= GFX9)
      si_set_es_return_value_for_gs(ctx);
}
204
si_get_gs_wave_id(struct si_shader_context * ctx)205 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
206 {
207 if (ctx->screen->info.chip_class >= GFX9)
208 return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8);
209 else
210 return ac_get_arg(&ctx->ac, ctx->gs_wave_id);
211 }
212
emit_gs_epilogue(struct si_shader_context * ctx)213 static void emit_gs_epilogue(struct si_shader_context *ctx)
214 {
215 if (ctx->shader->key.as_ngg) {
216 gfx10_ngg_gs_emit_epilogue(ctx);
217 return;
218 }
219
220 if (ctx->screen->info.chip_class >= GFX10)
221 LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
222
223 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, si_get_gs_wave_id(ctx));
224
225 if (ctx->screen->info.chip_class >= GFX9)
226 ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
227 }
228
/* ABI callback wrapper around emit_gs_epilogue(); the output addresses are
 * not needed here because the GS writes vertices via emit_vertex instead. */
static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
                                     LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info UNUSED *info = &ctx->shader->selector->info;

   /* Sanity check: the caller's addrs array must cover all declared outputs. */
   assert(info->num_outputs <= max_outputs);

   emit_gs_epilogue(ctx);
}
239
240 /* Emit one vertex from the geometry shader */
si_llvm_emit_vertex(struct ac_shader_abi * abi,unsigned stream,LLVMValueRef * addrs)241 static void si_llvm_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs)
242 {
243 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
244
245 if (ctx->shader->key.as_ngg) {
246 gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
247 return;
248 }
249
250 struct si_shader_info *info = &ctx->shader->selector->info;
251 struct si_shader *shader = ctx->shader;
252 LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset);
253 LLVMValueRef gs_next_vertex;
254 LLVMValueRef can_emit;
255 unsigned chan, offset;
256 int i;
257
258 /* Write vertex attribute values to GSVS ring */
259 gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, ctx->gs_next_vertex[stream], "");
260
261 /* If this thread has already emitted the declared maximum number of
262 * vertices, skip the write: excessive vertex emissions are not
263 * supposed to have any effect.
264 *
265 * If the shader has no writes to memory, kill it instead. This skips
266 * further memory loads and may allow LLVM to skip to the end
267 * altogether.
268 */
269 can_emit =
270 LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
271 LLVMConstInt(ctx->ac.i32, shader->selector->info.base.gs.vertices_out, 0), "");
272
273 bool use_kill = !info->base.writes_memory;
274 if (use_kill) {
275 ac_build_kill_if_false(&ctx->ac, can_emit);
276 } else {
277 ac_build_ifcc(&ctx->ac, can_emit, 6505);
278 }
279
280 offset = 0;
281 for (i = 0; i < info->num_outputs; i++) {
282 for (chan = 0; chan < 4; chan++) {
283 if (!(info->output_usagemask[i] & (1 << chan)) ||
284 ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
285 continue;
286
287 LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
288 LLVMValueRef voffset =
289 LLVMConstInt(ctx->ac.i32, offset * shader->selector->info.base.gs.vertices_out, 0);
290 offset++;
291
292 voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
293 voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
294
295 out_val = ac_to_integer(&ctx->ac, out_val);
296
297 ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, voffset, soffset,
298 0, ac_glc | ac_slc | ac_swizzled);
299 }
300 }
301
302 gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, "");
303 LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
304
305 /* Signal vertex emission if vertex data was written. */
306 if (offset) {
307 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
308 si_get_gs_wave_id(ctx));
309 }
310
311 if (!use_kill)
312 ac_build_endif(&ctx->ac, 6505);
313 }
314
315 /* Cut one primitive from the geometry shader */
si_llvm_emit_primitive(struct ac_shader_abi * abi,unsigned stream)316 static void si_llvm_emit_primitive(struct ac_shader_abi *abi, unsigned stream)
317 {
318 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
319
320 if (ctx->shader->key.as_ngg) {
321 LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
322 return;
323 }
324
325 /* Signal primitive cut */
326 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
327 si_get_gs_wave_id(ctx));
328 }
329
si_preload_esgs_ring(struct si_shader_context * ctx)330 void si_preload_esgs_ring(struct si_shader_context *ctx)
331 {
332 if (ctx->screen->info.chip_class <= GFX8) {
333 unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? SI_GS_RING_ESGS : SI_ES_RING_ESGS;
334 LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, 0);
335 LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
336
337 ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
338 } else {
339 if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
340 /* Declare the ESGS ring as an explicit LDS symbol. */
341 si_llvm_declare_esgs_ring(ctx);
342 } else {
343 ac_declare_lds_as_pointer(&ctx->ac);
344 ctx->esgs_ring = ctx->ac.lds;
345 }
346 }
347 }
348
si_preload_gs_rings(struct si_shader_context * ctx)349 void si_preload_gs_rings(struct si_shader_context *ctx)
350 {
351 const struct si_shader_selector *sel = ctx->shader->selector;
352 LLVMBuilderRef builder = ctx->ac.builder;
353 LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_GSVS, 0);
354 LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
355 LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
356
357 /* The conceptual layout of the GSVS ring is
358 * v0c0 .. vLv0 v0c1 .. vLc1 ..
359 * but the real memory layout is swizzled across
360 * threads:
361 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
362 * t16v0c0 ..
363 * Override the buffer descriptor accordingly.
364 */
365 LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
366 uint64_t stream_offset = 0;
367
368 for (unsigned stream = 0; stream < 4; ++stream) {
369 unsigned num_components;
370 unsigned stride;
371 unsigned num_records;
372 LLVMValueRef ring, tmp;
373
374 num_components = sel->info.num_stream_output_components[stream];
375 if (!num_components)
376 continue;
377
378 stride = 4 * num_components * sel->info.base.gs.vertices_out;
379
380 /* Limit on the stride field for <= GFX7. */
381 assert(stride < (1 << 14));
382
383 num_records = ctx->ac.wave_size;
384
385 ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
386 tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_0, "");
387 tmp = LLVMBuildAdd(builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), "");
388 stream_offset += stride * ctx->ac.wave_size;
389
390 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_0, "");
391 ring = LLVMBuildBitCast(builder, ring, ctx->ac.v4i32, "");
392 tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_1, "");
393 tmp = LLVMBuildOr(
394 builder, tmp,
395 LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride) | S_008F04_SWIZZLE_ENABLE(1), 0), "");
396 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_1, "");
397 ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, num_records, 0),
398 LLVMConstInt(ctx->ac.i32, 2, 0), "");
399
400 uint32_t rsrc3 =
401 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
402 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
403 S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
404 S_008F0C_ADD_TID_ENABLE(1);
405
406 if (ctx->ac.chip_class >= GFX10) {
407 rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
408 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1);
409 } else {
410 rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
411 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
412 S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
413 }
414
415 ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, rsrc3, false),
416 LLVMConstInt(ctx->ac.i32, 3, 0), "");
417
418 ctx->gsvs_ring[stream] = ring;
419 }
420 }
421
422 /* Generate code for the hardware VS shader stage to go with a geometry shader */
si_generate_gs_copy_shader(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader_selector * gs_selector,struct pipe_debug_callback * debug)423 struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
424 struct ac_llvm_compiler *compiler,
425 struct si_shader_selector *gs_selector,
426 struct pipe_debug_callback *debug)
427 {
428 struct si_shader_context ctx;
429 struct si_shader *shader;
430 LLVMBuilderRef builder;
431 struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
432 struct si_shader_info *gsinfo = &gs_selector->info;
433 int i;
434
435 shader = CALLOC_STRUCT(si_shader);
436 if (!shader)
437 return NULL;
438
439 /* We can leave the fence as permanently signaled because the GS copy
440 * shader only becomes visible globally after it has been compiled. */
441 util_queue_fence_init(&shader->ready);
442
443 shader->selector = gs_selector;
444 shader->is_gs_copy_shader = true;
445
446 si_llvm_context_init(&ctx, sscreen, compiler,
447 si_get_wave_size(sscreen, MESA_SHADER_VERTEX,
448 false, false, false, false));
449 ctx.shader = shader;
450 ctx.stage = MESA_SHADER_VERTEX;
451
452 builder = ctx.ac.builder;
453
454 si_create_function(&ctx, false);
455
456 LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.rw_buffers);
457 ctx.gsvs_ring[0] =
458 ac_build_load_to_sgpr(&ctx.ac, buf_ptr, LLVMConstInt(ctx.ac.i32, SI_RING_GSVS, 0));
459
460 LLVMValueRef voffset =
461 LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, LLVMConstInt(ctx.ac.i32, 4, 0), "");
462
463 /* Fetch the vertex stream ID.*/
464 LLVMValueRef stream_id;
465
466 if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs)
467 stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2);
468 else
469 stream_id = ctx.ac.i32_0;
470
471 /* Fill in output information. */
472 for (i = 0; i < gsinfo->num_outputs; ++i) {
473 outputs[i].semantic = gsinfo->output_semantic[i];
474
475 for (int chan = 0; chan < 4; chan++) {
476 outputs[i].vertex_stream[chan] = (gsinfo->output_streams[i] >> (2 * chan)) & 3;
477 }
478 }
479
480 LLVMBasicBlockRef end_bb;
481 LLVMValueRef switch_inst;
482
483 end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
484 switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
485
486 for (int stream = 0; stream < 4; stream++) {
487 LLVMBasicBlockRef bb;
488 unsigned offset;
489
490 if (!gsinfo->num_stream_output_components[stream])
491 continue;
492
493 if (stream > 0 && !gs_selector->so.num_outputs)
494 continue;
495
496 bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
497 LLVMAddCase(switch_inst, LLVMConstInt(ctx.ac.i32, stream, 0), bb);
498 LLVMPositionBuilderAtEnd(builder, bb);
499
500 /* Fetch vertex data from GSVS ring */
501 offset = 0;
502 for (i = 0; i < gsinfo->num_outputs; ++i) {
503 for (unsigned chan = 0; chan < 4; chan++) {
504 if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
505 outputs[i].vertex_stream[chan] != stream) {
506 outputs[i].values[chan] = LLVMGetUndef(ctx.ac.f32);
507 continue;
508 }
509
510 LLVMValueRef soffset =
511 LLVMConstInt(ctx.ac.i32, offset * gs_selector->info.base.gs.vertices_out * 16 * 4, 0);
512 offset++;
513
514 outputs[i].values[chan] =
515 ac_build_buffer_load(&ctx.ac, ctx.gsvs_ring[0], 1, ctx.ac.i32_0, voffset, soffset, 0,
516 ac_glc | ac_slc, true, false);
517 }
518 }
519
520 /* Streamout and exports. */
521 if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) {
522 si_llvm_emit_streamout(&ctx, outputs, gsinfo->num_outputs, stream);
523 }
524
525 if (stream == 0)
526 si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs);
527
528 LLVMBuildBr(builder, end_bb);
529 }
530
531 LLVMPositionBuilderAtEnd(builder, end_bb);
532
533 LLVMBuildRetVoid(ctx.ac.builder);
534
535 ctx.stage = MESA_SHADER_GEOMETRY; /* override for shader dumping */
536 si_llvm_optimize_module(&ctx);
537
538 bool ok = false;
539 if (si_compile_llvm(sscreen, &ctx.shader->binary, &ctx.shader->config, ctx.compiler, &ctx.ac,
540 debug, MESA_SHADER_GEOMETRY, "GS Copy Shader", false)) {
541 if (si_can_dump_shader(sscreen, MESA_SHADER_GEOMETRY))
542 fprintf(stderr, "GS Copy Shader:\n");
543 si_shader_dump(sscreen, ctx.shader, debug, stderr, true);
544
545 if (!ctx.shader->config.scratch_bytes_per_wave)
546 ok = si_shader_binary_upload(sscreen, ctx.shader, 0);
547 else
548 ok = true;
549 }
550
551 si_llvm_dispose(&ctx);
552
553 if (!ok) {
554 FREE(shader);
555 shader = NULL;
556 } else {
557 si_fix_resource_usage(sscreen, shader);
558 }
559 return shader;
560 }
561
562 /**
563 * Build the GS prolog function. Rotate the input vertices for triangle strips
564 * with adjacency.
565 */
si_llvm_build_gs_prolog(struct si_shader_context * ctx,union si_shader_part_key * key)566 void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
567 {
568 unsigned num_sgprs, num_vgprs;
569 LLVMBuilderRef builder = ctx->ac.builder;
570 LLVMTypeRef returns[AC_MAX_ARGS];
571 LLVMValueRef func, ret;
572
573 memset(&ctx->args, 0, sizeof(ctx->args));
574
575 if (ctx->screen->info.chip_class >= GFX9) {
576 if (key->gs_prolog.states.gfx9_prev_is_vs)
577 num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
578 else
579 num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
580 num_vgprs = 5; /* ES inputs are not needed by GS */
581 } else {
582 num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
583 num_vgprs = 8;
584 }
585
586 for (unsigned i = 0; i < num_sgprs; ++i) {
587 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
588 returns[i] = ctx->ac.i32;
589 }
590
591 for (unsigned i = 0; i < num_vgprs; ++i) {
592 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
593 returns[num_sgprs + i] = ctx->ac.f32;
594 }
595
596 /* Create the function. */
597 si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0);
598 func = ctx->main_fn;
599
600 /* Set the full EXEC mask for the prolog, because we are only fiddling
601 * with registers here. The main shader part will set the correct EXEC
602 * mask.
603 */
604 if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
605 ac_init_exec_full_mask(&ctx->ac);
606
607 /* Copy inputs to outputs. This should be no-op, as the registers match,
608 * but it will prevent the compiler from overwriting them unintentionally.
609 */
610 ret = ctx->return_value;
611 for (unsigned i = 0; i < num_sgprs; i++) {
612 LLVMValueRef p = LLVMGetParam(func, i);
613 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
614 }
615 for (unsigned i = 0; i < num_vgprs; i++) {
616 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
617 p = ac_to_float(&ctx->ac, p);
618 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
619 }
620
621 if (key->gs_prolog.states.tri_strip_adj_fix) {
622 /* Remap the input vertices for every other primitive. */
623 const struct ac_arg gfx6_vtx_params[6] = {
624 {.used = true, .arg_index = num_sgprs}, {.used = true, .arg_index = num_sgprs + 1},
625 {.used = true, .arg_index = num_sgprs + 3}, {.used = true, .arg_index = num_sgprs + 4},
626 {.used = true, .arg_index = num_sgprs + 5}, {.used = true, .arg_index = num_sgprs + 6},
627 };
628 const struct ac_arg gfx9_vtx_params[3] = {
629 {.used = true, .arg_index = num_sgprs},
630 {.used = true, .arg_index = num_sgprs + 1},
631 {.used = true, .arg_index = num_sgprs + 4},
632 };
633 LLVMValueRef vtx_in[6], vtx_out[6];
634 LLVMValueRef prim_id, rotate;
635
636 if (ctx->screen->info.chip_class >= GFX9) {
637 for (unsigned i = 0; i < 3; i++) {
638 vtx_in[i * 2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
639 vtx_in[i * 2 + 1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
640 }
641 } else {
642 for (unsigned i = 0; i < 6; i++)
643 vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]);
644 }
645
646 prim_id = LLVMGetParam(func, num_sgprs + 2);
647 rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, "");
648
649 for (unsigned i = 0; i < 6; ++i) {
650 LLVMValueRef base, rotated;
651 base = vtx_in[i];
652 rotated = vtx_in[(i + 4) % 6];
653 vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
654 }
655
656 if (ctx->screen->info.chip_class >= GFX9) {
657 for (unsigned i = 0; i < 3; i++) {
658 LLVMValueRef hi, out;
659
660 hi = LLVMBuildShl(builder, vtx_out[i * 2 + 1], LLVMConstInt(ctx->ac.i32, 16, 0), "");
661 out = LLVMBuildOr(builder, vtx_out[i * 2], hi, "");
662 out = ac_to_float(&ctx->ac, out);
663 ret = LLVMBuildInsertValue(builder, ret, out, gfx9_vtx_params[i].arg_index, "");
664 }
665 } else {
666 for (unsigned i = 0; i < 6; i++) {
667 LLVMValueRef out;
668
669 out = ac_to_float(&ctx->ac, vtx_out[i]);
670 ret = LLVMBuildInsertValue(builder, ret, out, gfx6_vtx_params[i].arg_index, "");
671 }
672 }
673 }
674
675 LLVMBuildRet(builder, ret);
676 }
677
si_llvm_init_gs_callbacks(struct si_shader_context * ctx)678 void si_llvm_init_gs_callbacks(struct si_shader_context *ctx)
679 {
680 ctx->abi.load_inputs = si_nir_load_input_gs;
681 ctx->abi.emit_vertex = si_llvm_emit_vertex;
682 ctx->abi.emit_primitive = si_llvm_emit_primitive;
683 ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
684 }
685