/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
#include "util/u_memory.h"

LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
{
   /* Return true if the current thread should execute an ES thread. */
   return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
                        si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), "");
}

LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
{
   /* Return true if the current thread should execute a GS thread. */
   return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
                        si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), "");
}

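/* Load one 32-bit component of a GS input. On GFX9+ the ESGS ring lives in
 * LDS, so the value is read with an indexed LDS load; on GFX6-8 it is loaded
 * from the ESGS ring buffer in memory.
 */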
static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned input_index,
                                          unsigned vtx_offset_param, LLVMTypeRef type,
                                          unsigned swizzle)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader *shader = ctx->shader;
   LLVMValueRef vtx_offset, soffset;
   struct si_shader_info *info = &shader->selector->info;
   unsigned param;
   LLVMValueRef value;

   param = si_shader_io_get_unique_index(info->input_semantic[input_index], false);

   /* GFX9 has the ESGS ring in LDS. */
   if (ctx->screen->info.chip_class >= GFX9) {
      unsigned index = vtx_offset_param;

      switch (index / 2) {
      case 0:
         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset, index % 2 ? 16 : 0, 16);
         break;
      case 1:
         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset, index % 2 ? 16 : 0, 16);
         break;
      case 2:
         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset, index % 2 ? 16 : 0, 16);
         break;
      default:
         assert(0);
         return NULL;
      }

      unsigned offset = param * 4 + swizzle;
      vtx_offset =
         LLVMBuildAdd(ctx->ac.builder, vtx_offset, LLVMConstInt(ctx->ac.i32, offset, false), "");

      LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset);
      LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, "");
      return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
   }

   /* GFX6: input load from the ESGS ring in memory. */
   /* Get the vertex offset parameter on GFX6. */
   LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac, ctx->gs_vtx_offset[vtx_offset_param]);

   vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");

   soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0);

   value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset, 0,
                                ac_glc, true, false);
   return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
}

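/* ABI callback for NIR GS input loads: load the requested components of one
 * input for the given vertex and gather them into a vector.
 */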
static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi,
                                         unsigned driver_location, unsigned component,
                                         unsigned num_components, unsigned vertex_index,
                                         LLVMTypeRef type)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   LLVMValueRef value[4];
   for (unsigned i = component; i < component + num_components; i++) {
      value[i] = si_llvm_load_input_gs(&ctx->abi, driver_location,
                                       vertex_index, type, i);
   }

   return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
}

/* Pass GS inputs from ES to GS on GFX9. */
static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
{
   LLVMValueRef ret = ctx->return_value;

   ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
   ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
   if (ctx->shader->key.as_ngg)
      ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2);
   else
      ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2);
   ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
   ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);

   ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS);
   ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
                             8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
   if (ctx->screen->use_ngg) {
      ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
   }

   unsigned vgpr;
   if (ctx->stage == MESA_SHADER_VERTEX)
      vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
   else
      vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;

   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++);
   ctx->return_value = ret;
}

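/* ES epilogue: write the ES (VS or TES) outputs to the ESGS ring so the GS
 * can read them. On GFX9+ the ring is in LDS; on GFX6-8 it is a buffer
 * addressed with es2gs_offset.
 */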
void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader *es = ctx->shader;
   struct si_shader_info *info = &es->selector->info;
   LLVMValueRef lds_base = NULL;
   unsigned chan;
   int i;

   if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
      unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
      LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
      LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
      vertex_idx =
         LLVMBuildOr(ctx->ac.builder, vertex_idx,
                     LLVMBuildMul(ctx->ac.builder, wave_idx,
                                  LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""),
                     "");
      lds_base =
         LLVMBuildMul(ctx->ac.builder, vertex_idx, LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
   }

   for (i = 0; i < info->num_outputs; i++) {
      int param;

      if (info->output_semantic[i] == VARYING_SLOT_VIEWPORT ||
          info->output_semantic[i] == VARYING_SLOT_LAYER)
         continue;

      param = si_shader_io_get_unique_index(info->output_semantic[i], false);

      for (chan = 0; chan < 4; chan++) {
         if (!(info->output_usagemask[i] & (1 << chan)))
            continue;

         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
         out_val = ac_to_integer(&ctx->ac, out_val);

         /* GFX9 has the ESGS ring in LDS. */
         if (ctx->screen->info.chip_class >= GFX9) {
            LLVMValueRef idx = LLVMConstInt(ctx->ac.i32, param * 4 + chan, false);
            idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, "");
            ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val);
            continue;
         }

         ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, out_val, 1, NULL,
                                     ac_get_arg(&ctx->ac, ctx->es2gs_offset),
                                     (4 * param + chan) * 4, ac_glc | ac_slc | ac_swizzled);
      }
   }

   if (ctx->screen->info.chip_class >= GFX9)
      si_set_es_return_value_for_gs(ctx);
}

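/* Return the GS wave ID: unpacked from merged_wave_info on GFX9+ (merged
 * ES/GS waves), otherwise taken from the dedicated gs_wave_id argument.
 */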
static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
{
   if (ctx->screen->info.chip_class >= GFX9)
      return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8);
   else
      return ac_get_arg(&ctx->ac, ctx->gs_wave_id);
}

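/* GS epilogue: send the GS_DONE message for the legacy GS path, or defer to
 * the NGG epilogue when the shader is compiled as NGG.
 */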
static void emit_gs_epilogue(struct si_shader_context *ctx)
{
   if (ctx->shader->key.as_ngg) {
      gfx10_ngg_gs_emit_epilogue(ctx);
      return;
   }

   if (ctx->screen->info.chip_class >= GFX10)
      LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");

   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, si_get_gs_wave_id(ctx));

   if (ctx->screen->info.chip_class >= GFX9)
      ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
}

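/* ABI wrapper around emit_gs_epilogue, used as the emit_outputs callback. */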
static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
                                     LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info UNUSED *info = &ctx->shader->selector->info;

   assert(info->num_outputs <= max_outputs);

   emit_gs_epilogue(ctx);
}

/* Emit one vertex from the geometry shader */
static void si_llvm_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   if (ctx->shader->key.as_ngg) {
      gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
      return;
   }

   struct si_shader_info *info = &ctx->shader->selector->info;
   struct si_shader *shader = ctx->shader;
   LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset);
   LLVMValueRef gs_next_vertex;
   LLVMValueRef can_emit;
   unsigned chan, offset;
   int i;

   /* Write vertex attribute values to GSVS ring */
   gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, ctx->gs_next_vertex[stream], "");

   /* If this thread has already emitted the declared maximum number of
    * vertices, skip the write: excessive vertex emissions are not
    * supposed to have any effect.
    *
    * If the shader has no writes to memory, kill it instead. This skips
    * further memory loads and may allow LLVM to skip to the end
    * altogether.
    */
   can_emit =
      LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
                    LLVMConstInt(ctx->ac.i32, shader->selector->info.base.gs.vertices_out, 0), "");

   bool use_kill = !info->base.writes_memory;
   if (use_kill) {
      ac_build_kill_if_false(&ctx->ac, can_emit);
   } else {
      ac_build_ifcc(&ctx->ac, can_emit, 6505);
   }

   offset = 0;
   for (i = 0; i < info->num_outputs; i++) {
      for (chan = 0; chan < 4; chan++) {
         if (!(info->output_usagemask[i] & (1 << chan)) ||
             ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
            continue;

         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
         LLVMValueRef voffset =
            LLVMConstInt(ctx->ac.i32, offset * shader->selector->info.base.gs.vertices_out, 0);
         offset++;

         voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
         voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, 0), "");

         out_val = ac_to_integer(&ctx->ac, out_val);

         ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, voffset, soffset,
                                     0, ac_glc | ac_slc | ac_swizzled);
      }
   }

   gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, "");
   LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

   /* Signal vertex emission if vertex data was written. */
   if (offset) {
      ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
                       si_get_gs_wave_id(ctx));
   }

   if (!use_kill)
      ac_build_endif(&ctx->ac, 6505);
}

/* Cut one primitive from the geometry shader */
static void si_llvm_emit_primitive(struct ac_shader_abi *abi, unsigned stream)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   if (ctx->shader->key.as_ngg) {
      LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
      return;
   }

   /* Signal primitive cut */
   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
                    si_get_gs_wave_id(ctx));
}

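/* Load the ESGS ring resource descriptor (GFX6-8) or set up the LDS-based
 * ring (GFX9+) before the main shader body runs.
 */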
void si_preload_esgs_ring(struct si_shader_context *ctx)
{
   if (ctx->screen->info.chip_class <= GFX8) {
      unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? SI_GS_RING_ESGS : SI_ES_RING_ESGS;
      LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, 0);
      LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);

      ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
   } else {
      if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
         /* Declare the ESGS ring as an explicit LDS symbol. */
         si_llvm_declare_esgs_ring(ctx);
      } else {
         ac_declare_lds_as_pointer(&ctx->ac);
         ctx->esgs_ring = ctx->ac.lds;
      }
   }
}

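/* Build one GSVS ring descriptor per vertex stream, overriding the base
 * descriptor so that stride, swizzling, and the per-stream offset match the
 * swizzled layout described in the comment below.
 */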
void si_preload_gs_rings(struct si_shader_context *ctx)
{
   const struct si_shader_selector *sel = ctx->shader->selector;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_GSVS, 0);
   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
   LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);

   /* The conceptual layout of the GSVS ring is
    *   v0c0 .. vLv0 v0c1 .. vLc1 ..
    * but the real memory layout is swizzled across
    * threads:
    *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
    *   t16v0c0 ..
    * Override the buffer descriptor accordingly.
    */
   LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
   uint64_t stream_offset = 0;

   for (unsigned stream = 0; stream < 4; ++stream) {
      unsigned num_components;
      unsigned stride;
      unsigned num_records;
      LLVMValueRef ring, tmp;

      num_components = sel->info.num_stream_output_components[stream];
      if (!num_components)
         continue;

      stride = 4 * num_components * sel->info.base.gs.vertices_out;

      /* Limit on the stride field for <= GFX7. */
      assert(stride < (1 << 14));

      num_records = ctx->ac.wave_size;

      ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
      tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_0, "");
      tmp = LLVMBuildAdd(builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), "");
      stream_offset += stride * ctx->ac.wave_size;

      ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_0, "");
      ring = LLVMBuildBitCast(builder, ring, ctx->ac.v4i32, "");
      tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_1, "");
      tmp = LLVMBuildOr(
         builder, tmp,
         LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride) | S_008F04_SWIZZLE_ENABLE(1), 0), "");
      ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_1, "");
      ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, num_records, 0),
                                    LLVMConstInt(ctx->ac.i32, 2, 0), "");

      uint32_t rsrc3 =
         S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
         S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
         S_008F0C_ADD_TID_ENABLE(1);

      if (ctx->ac.chip_class >= GFX10) {
         rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
                  S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1);
      } else {
         rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
                  S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
      }

      ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, rsrc3, false),
                                    LLVMConstInt(ctx->ac.i32, 3, 0), "");

      ctx->gsvs_ring[stream] = ring;
   }
}

/* Generate code for the hardware VS shader stage to go with a geometry shader */
struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
                                             struct ac_llvm_compiler *compiler,
                                             struct si_shader_selector *gs_selector,
                                             struct pipe_debug_callback *debug)
{
   struct si_shader_context ctx;
   struct si_shader *shader;
   LLVMBuilderRef builder;
   struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
   struct si_shader_info *gsinfo = &gs_selector->info;
   int i;

   shader = CALLOC_STRUCT(si_shader);
   if (!shader)
      return NULL;

   /* We can leave the fence as permanently signaled because the GS copy
    * shader only becomes visible globally after it has been compiled. */
   util_queue_fence_init(&shader->ready);

   shader->selector = gs_selector;
   shader->is_gs_copy_shader = true;

   si_llvm_context_init(&ctx, sscreen, compiler,
                        si_get_wave_size(sscreen, MESA_SHADER_VERTEX,
                                         false, false, false, false));
   ctx.shader = shader;
   ctx.stage = MESA_SHADER_VERTEX;

   builder = ctx.ac.builder;

   si_create_function(&ctx, false);

   LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.rw_buffers);
   ctx.gsvs_ring[0] =
      ac_build_load_to_sgpr(&ctx.ac, buf_ptr, LLVMConstInt(ctx.ac.i32, SI_RING_GSVS, 0));

   LLVMValueRef voffset =
      LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, LLVMConstInt(ctx.ac.i32, 4, 0), "");

   /* Fetch the vertex stream ID. */
   LLVMValueRef stream_id;

   if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs)
      stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2);
   else
      stream_id = ctx.ac.i32_0;

   /* Fill in output information. */
   for (i = 0; i < gsinfo->num_outputs; ++i) {
      outputs[i].semantic = gsinfo->output_semantic[i];

      for (int chan = 0; chan < 4; chan++) {
         outputs[i].vertex_stream[chan] = (gsinfo->output_streams[i] >> (2 * chan)) & 3;
      }
   }

   LLVMBasicBlockRef end_bb;
   LLVMValueRef switch_inst;

   end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
   switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);

   for (int stream = 0; stream < 4; stream++) {
      LLVMBasicBlockRef bb;
      unsigned offset;

      if (!gsinfo->num_stream_output_components[stream])
         continue;

      if (stream > 0 && !gs_selector->so.num_outputs)
         continue;

      bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
      LLVMAddCase(switch_inst, LLVMConstInt(ctx.ac.i32, stream, 0), bb);
      LLVMPositionBuilderAtEnd(builder, bb);

      /* Fetch vertex data from GSVS ring */
      offset = 0;
      for (i = 0; i < gsinfo->num_outputs; ++i) {
         for (unsigned chan = 0; chan < 4; chan++) {
            if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
                outputs[i].vertex_stream[chan] != stream) {
               outputs[i].values[chan] = LLVMGetUndef(ctx.ac.f32);
               continue;
            }

            LLVMValueRef soffset =
               LLVMConstInt(ctx.ac.i32, offset * gs_selector->info.base.gs.vertices_out * 16 * 4, 0);
            offset++;

            outputs[i].values[chan] =
               ac_build_buffer_load(&ctx.ac, ctx.gsvs_ring[0], 1, ctx.ac.i32_0, voffset, soffset, 0,
                                    ac_glc | ac_slc, true, false);
         }
      }

      /* Streamout and exports. */
      if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) {
         si_llvm_emit_streamout(&ctx, outputs, gsinfo->num_outputs, stream);
      }

      if (stream == 0)
         si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs);

      LLVMBuildBr(builder, end_bb);
   }

   LLVMPositionBuilderAtEnd(builder, end_bb);

   LLVMBuildRetVoid(ctx.ac.builder);

   ctx.stage = MESA_SHADER_GEOMETRY; /* override for shader dumping */
   si_llvm_optimize_module(&ctx);

   bool ok = false;
   if (si_compile_llvm(sscreen, &ctx.shader->binary, &ctx.shader->config, ctx.compiler, &ctx.ac,
                       debug, MESA_SHADER_GEOMETRY, "GS Copy Shader", false)) {
      if (si_can_dump_shader(sscreen, MESA_SHADER_GEOMETRY))
         fprintf(stderr, "GS Copy Shader:\n");
      si_shader_dump(sscreen, ctx.shader, debug, stderr, true);

      if (!ctx.shader->config.scratch_bytes_per_wave)
         ok = si_shader_binary_upload(sscreen, ctx.shader, 0);
      else
         ok = true;
   }

   si_llvm_dispose(&ctx);

   if (!ok) {
      FREE(shader);
      shader = NULL;
   } else {
      si_fix_resource_usage(sscreen, shader);
   }
   return shader;
}

/**
 * Build the GS prolog function. Rotate the input vertices for triangle strips
 * with adjacency.
 */
void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
{
   unsigned num_sgprs, num_vgprs;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMTypeRef returns[AC_MAX_ARGS];
   LLVMValueRef func, ret;

   memset(&ctx->args, 0, sizeof(ctx->args));

   if (ctx->screen->info.chip_class >= GFX9) {
      if (key->gs_prolog.states.gfx9_prev_is_vs)
         num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
      else
         num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
      num_vgprs = 5; /* ES inputs are not needed by GS */
   } else {
      num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
      num_vgprs = 8;
   }

   for (unsigned i = 0; i < num_sgprs; ++i) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      returns[i] = ctx->ac.i32;
   }

   for (unsigned i = 0; i < num_vgprs; ++i) {
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
      returns[num_sgprs + i] = ctx->ac.f32;
   }

   /* Create the function. */
   si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0);
   func = ctx->main_fn;

   /* Set the full EXEC mask for the prolog, because we are only fiddling
    * with registers here. The main shader part will set the correct EXEC
    * mask.
    */
   if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
      ac_init_exec_full_mask(&ctx->ac);

   /* Copy inputs to outputs. This should be no-op, as the registers match,
    * but it will prevent the compiler from overwriting them unintentionally.
    */
   ret = ctx->return_value;
   for (unsigned i = 0; i < num_sgprs; i++) {
      LLVMValueRef p = LLVMGetParam(func, i);
      ret = LLVMBuildInsertValue(builder, ret, p, i, "");
   }
   for (unsigned i = 0; i < num_vgprs; i++) {
      LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
      p = ac_to_float(&ctx->ac, p);
      ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
   }

   if (key->gs_prolog.states.tri_strip_adj_fix) {
      /* Remap the input vertices for every other primitive. */
      const struct ac_arg gfx6_vtx_params[6] = {
         {.used = true, .arg_index = num_sgprs},     {.used = true, .arg_index = num_sgprs + 1},
         {.used = true, .arg_index = num_sgprs + 3}, {.used = true, .arg_index = num_sgprs + 4},
         {.used = true, .arg_index = num_sgprs + 5}, {.used = true, .arg_index = num_sgprs + 6},
      };
      const struct ac_arg gfx9_vtx_params[3] = {
         {.used = true, .arg_index = num_sgprs},
         {.used = true, .arg_index = num_sgprs + 1},
         {.used = true, .arg_index = num_sgprs + 4},
      };
      LLVMValueRef vtx_in[6], vtx_out[6];
      LLVMValueRef prim_id, rotate;

      if (ctx->screen->info.chip_class >= GFX9) {
         for (unsigned i = 0; i < 3; i++) {
            vtx_in[i * 2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
            vtx_in[i * 2 + 1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
         }
      } else {
         for (unsigned i = 0; i < 6; i++)
            vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]);
      }

      prim_id = LLVMGetParam(func, num_sgprs + 2);
      rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, "");

      for (unsigned i = 0; i < 6; ++i) {
         LLVMValueRef base, rotated;
         base = vtx_in[i];
         rotated = vtx_in[(i + 4) % 6];
         vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
      }

      if (ctx->screen->info.chip_class >= GFX9) {
         for (unsigned i = 0; i < 3; i++) {
            LLVMValueRef hi, out;

            hi = LLVMBuildShl(builder, vtx_out[i * 2 + 1], LLVMConstInt(ctx->ac.i32, 16, 0), "");
            out = LLVMBuildOr(builder, vtx_out[i * 2], hi, "");
            out = ac_to_float(&ctx->ac, out);
            ret = LLVMBuildInsertValue(builder, ret, out, gfx9_vtx_params[i].arg_index, "");
         }
      } else {
         for (unsigned i = 0; i < 6; i++) {
            LLVMValueRef out;

            out = ac_to_float(&ctx->ac, vtx_out[i]);
            ret = LLVMBuildInsertValue(builder, ret, out, gfx6_vtx_params[i].arg_index, "");
         }
      }
   }

   LLVMBuildRet(builder, ret);
}

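/* Hook up the GS-specific ABI callbacks used during shader translation. */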
void si_llvm_init_gs_callbacks(struct si_shader_context *ctx)
{
   ctx->abi.load_inputs = si_nir_load_input_gs;
   ctx->abi.emit_vertex = si_llvm_emit_vertex;
   ctx->abi.emit_primitive = si_llvm_emit_primitive;
   ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
}