• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2020 Advanced Micro Devices, Inc.
3  * All Rights Reserved.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * on the rights to use, copy, modify, merge, publish, distribute, sub
9  * license, and/or sell copies of the Software, and to permit persons to whom
10  * the Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
23  */
24 
25 #include "si_pipe.h"
26 #include "si_shader_internal.h"
27 #include "sid.h"
28 #include "util/u_memory.h"
29 
/* Extract one signed 16-bit half of a 32-bit value, sign-extended to i32.
 *
 * index 0 selects bits [15:0], index 1 selects bits [31:16].
 */
static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, LLVMValueRef i32, unsigned index)
{
   LLVMBuilderRef builder = ctx->ac.builder;

   assert(index <= 1);

   if (index == 0) {
      /* Low half: truncate to i16, then sign-extend back to i32. */
      LLVMValueRef lo = LLVMBuildTrunc(builder, i32, ctx->ac.i16, "");
      return LLVMBuildSExt(builder, lo, ctx->ac.i32, "");
   }

   /* High half: an arithmetic shift right by 16 sign-extends in one step. */
   return LLVMBuildAShr(builder, i32, LLVMConstInt(ctx->ac.i32, 16, 0), "");
}
40 
/* Load one vertex attribute for the current vertex and return its four
 * channels as f32 (or bit-equivalent) values in out[0..3].
 *
 * Two paths exist:
 *  - vs_blit shaders read their "attributes" from user SGPRs instead of
 *    vertex buffers (positions packed as sint16 pairs, then colors or
 *    texcoords);
 *  - normal shaders fetch from a vertex-buffer descriptor, with optional
 *    open-coded loads and per-format fixups described by fix_fetch.
 */
static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, LLVMValueRef out[4])
{
   const struct si_shader_info *info = &ctx->shader->selector->info;
   unsigned vs_blit_property = info->base.vs.blit_sgprs_amd;

   if (vs_blit_property) {
      /* Blit path: inputs come from SGPRs. The blit draws 3 vertices;
       * vertex 0 and 1 use (x1, ...), vertex 2 uses x2. */
      LLVMValueRef vertex_id = ctx->abi.vertex_id;
      LLVMValueRef sel_x1 =
         LLVMBuildICmp(ctx->ac.builder, LLVMIntULE, vertex_id, ctx->ac.i32_1, "");
      /* Use LLVMIntNE, because we have 3 vertices and only
       * the middle one should use y2.
       */
      LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, vertex_id, ctx->ac.i32_1, "");

      unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index;
      if (input_index == 0) {
         /* Position: two SGPRs hold (x1,y1) and (x2,y2) as packed sint16;
          * a third holds depth. W is forced to 1. */
         LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs);
         LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 1);

         LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
         LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
         LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
         LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);

         LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, "");
         LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, "");

         out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, "");
         out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, "");
         out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 2); /* depth */
         out[3] = ctx->ac.f32_1;
         return;
      }

      /* Color or texture coordinates: */
      assert(input_index == 1);

      if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
         /* One flat color for the whole blit: 4 consecutive SGPRs. */
         for (int i = 0; i < 4; i++) {
            out[i] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3 + i);
         }
      } else {
         /* Texcoord rect (x1,y1)-(x2,y2) plus 2 extra components. */
         assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
         LLVMValueRef x1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3);
         LLVMValueRef y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 4);
         LLVMValueRef x2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 5);
         LLVMValueRef y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 6);

         out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, "");
         out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, "");
         out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 7);
         out[3] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 8);
      }
      return;
   }

   unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
   union si_vs_fix_fetch fix_fetch;
   LLVMValueRef vb_desc;
   LLVMValueRef vertex_index;
   LLVMValueRef tmp;

   /* Get the vertex-buffer descriptor, either directly from user SGPRs or
    * loaded from the descriptor list. */
   if (input_index < num_vbos_in_user_sgprs) {
      vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]);
   } else {
      unsigned index = input_index - num_vbos_in_user_sgprs;
      vb_desc = ac_build_load_to_sgpr(&ctx->ac, ac_get_arg(&ctx->ac, ctx->vertex_buffers),
                                      LLVMConstInt(ctx->ac.i32, index, 0));
   }

   vertex_index = LLVMGetParam(ctx->main_fn, ctx->vertex_index0.arg_index + input_index);

   /* Use the open-coded implementation for all loads of doubles and
    * of dword-sized data that needs fixups. We need to insert conversion
    * code anyway, and the amd/common code does it for us.
    *
    * Note: On LLVM <= 8, we can only open-code formats with
    * channel size >= 4 bytes.
    */
   bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index);
   fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits;
   if (opencode || (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
       (fix_fetch.u.log_size == 2)) {
      tmp = ac_build_opencoded_load_format(&ctx->ac, fix_fetch.u.log_size,
                                           fix_fetch.u.num_channels_m1 + 1, fix_fetch.u.format,
                                           fix_fetch.u.reverse, !opencode, vb_desc, vertex_index,
                                           ctx->ac.i32_0, ctx->ac.i32_0, 0, true);
      for (unsigned i = 0; i < 4; ++i)
         out[i] =
            LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), "");
      return;
   }

   /* Only fetch the channels the shader actually reads. */
   unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]);
   if (required_channels == 0) {
      for (unsigned i = 0; i < 4; ++i)
         out[i] = LLVMGetUndef(ctx->ac.f32);
      return;
   }

   /* Do multiple loads for special formats. */
   LLVMValueRef fetches[4];
   unsigned num_fetches;
   unsigned fetch_stride;
   unsigned channels_per_fetch;

   if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
      /* 3-channel 8/16-bit formats: load each channel separately, striding
       * by the channel size. */
      num_fetches = MIN2(required_channels, 3);
      fetch_stride = 1 << fix_fetch.u.log_size;
      channels_per_fetch = 1;
   } else {
      /* Everything else: one vector load of the required channels. */
      num_fetches = 1;
      fetch_stride = 0;
      channels_per_fetch = required_channels;
   }

   for (unsigned i = 0; i < num_fetches; ++i) {
      LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0);
      fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
                                               channels_per_fetch, 0, true, false);
   }

   /* Scalarize a single vector fetch so "fetches" uniformly holds scalars. */
   if (num_fetches == 1 && channels_per_fetch > 1) {
      LLVMValueRef fetch = fetches[0];
      for (unsigned i = 0; i < channels_per_fetch; ++i) {
         tmp = LLVMConstInt(ctx->ac.i32, i, false);
         fetches[i] = LLVMBuildExtractElement(ctx->ac.builder, fetch, tmp, "");
      }
      num_fetches = channels_per_fetch;
      channels_per_fetch = 1;
   }

   for (unsigned i = num_fetches; i < 4; ++i)
      fetches[i] = LLVMGetUndef(ctx->ac.f32);

   if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && required_channels == 4) {
      /* 3-channel formats with a requested 4th channel: synthesize W = 1
       * in the matching type (integer for UINT/SINT, float otherwise). */
      if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
         fetches[3] = ctx->ac.i32_1;
      else
         fetches[3] = ctx->ac.f32_1;
   } else if (fix_fetch.u.log_size == 3 &&
              (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
               fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
               fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
              required_channels == 4) {
      /* For 2_10_10_10, the hardware returns an unsigned value;
       * convert it to a signed one.
       */
      LLVMValueRef tmp = fetches[3];
      LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);

      /* First, recover the sign-extended signed integer value. */
      if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
         tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->ac.i32, "");
      else
         tmp = ac_to_integer(&ctx->ac, tmp);

      /* For the integer-like cases, do a natural sign extension.
       *
       * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
       * and happen to contain 0, 1, 2, 3 as the two LSBs of the
       * exponent.
       */
      tmp = LLVMBuildShl(
         ctx->ac.builder, tmp,
         fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
      tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");

      /* Convert back to the right type. */
      if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
         LLVMValueRef clamp;
         LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
         /* Clamp -2 (the sign-extended minimum of a 2-bit field) to -1. */
         clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
         tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
      } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
      }

      fetches[3] = tmp;
   }

   for (unsigned i = 0; i < 4; ++i)
      out[i] = ac_to_float(&ctx->ac, fetches[i]);
}
227 
si_llvm_load_vs_inputs(struct si_shader_context * ctx,struct nir_shader * nir)228 void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir)
229 {
230    const struct si_shader_info *info = &ctx->shader->selector->info;
231 
232    for (unsigned i = 0; i < info->num_inputs; i++) {
233       LLVMValueRef values[4];
234 
235       load_input_vs(ctx, i, values);
236 
237       for (unsigned chan = 0; chan < 4; chan++) {
238          ctx->inputs[i * 4 + chan] =
239             LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i32, "");
240       }
241    }
242 }
243 
/* Store one shader output to its transform-feedback (streamout) buffer.
 *
 * so_buffers/so_write_offsets are indexed by streamout buffer; stream_out
 * selects the buffer, starting component, and component count; shader_out
 * provides the output values and their vertex streams.
 */
void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers,
                                    LLVMValueRef const *so_write_offsets,
                                    struct pipe_stream_output *stream_out,
                                    struct si_shader_output_values *shader_out)
{
   unsigned buf_idx = stream_out->output_buffer;
   unsigned start = stream_out->start_component;
   unsigned num_comps = stream_out->num_components;
   LLVMValueRef out[4];

   /* Defensive: the release build bails out instead of writing garbage. */
   assert(num_comps && num_comps <= 4);
   if (!num_comps || num_comps > 4)
      return;

   /* Load the output as int. */
   for (int j = 0; j < num_comps; j++) {
      assert(stream_out->stream == shader_out->vertex_stream[start + j]);

      out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
   }

   /* Pack the output. */
   LLVMValueRef vdata = NULL;

   switch (num_comps) {
   case 1: /* as i32 */
      vdata = out[0];
      break;
   case 2: /* as v2i32 */
   case 3: /* as v3i32 */
      if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) {
         vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
         break;
      }
      /* as v4i32 (aligned to 4): pad the 3rd component with undef when the
       * chip has no v3i32 support, then fall into the v4i32 path. */
      out[3] = LLVMGetUndef(ctx->ac.i32);
      /* fall through */
   case 4: /* as v4i32 */
      vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
      break;
   }

   ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], vdata, num_comps,
                               so_write_offsets[buf_idx], ctx->ac.i32_0, stream_out->dst_offset * 4,
                               ac_glc | ac_slc);
}
290 
291 /**
292  * Write streamout data to buffers for vertex stream @p stream (different
293  * vertex streams can occur for GS copy shaders).
294  */
/**
 * Write streamout data to buffers for vertex stream @p stream (different
 * vertex streams can occur for GS copy shaders).
 *
 * outputs/noutput are the shader's output values; each enabled streamout
 * target gets its descriptor loaded and a per-thread write offset computed
 * before the per-output stores are emitted.
 */
void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs,
                            unsigned noutput, unsigned stream)
{
   struct si_shader_selector *sel = ctx->shader->selector;
   struct pipe_stream_output_info *so = &sel->so;
   LLVMBuilderRef builder = ctx->ac.builder;
   int i;

   /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
   LLVMValueRef so_vtx_count = si_unpack_param(ctx, ctx->streamout_config, 16, 7);

   LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

   /* can_emit = tid < so_vtx_count; */
   LLVMValueRef can_emit = LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

   /* Emit the streamout code conditionally. This actually avoids
    * out-of-bounds buffer access. The hw tells us via the SGPR
    * (so_vtx_count) which threads are allowed to emit streamout data. */
   ac_build_ifcc(&ctx->ac, can_emit, 6501);
   {
      /* The buffer offset is computed as follows:
       *   ByteOffset = streamout_offset[buffer_id]*4 +
       *                (streamout_write_index + thread_id)*stride[buffer_id] +
       *                attrib_offset
       */

      LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->streamout_write_index);

      /* Compute (streamout_write_index + thread_id). */
      so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

      /* Load the descriptor and compute the write offset for each
       * enabled buffer. */
      LLVMValueRef so_write_offset[4] = {};
      LLVMValueRef so_buffers[4];
      LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);

      for (i = 0; i < 4; i++) {
         /* A zero stride means the streamout target is unused. */
         if (!so->stride[i])
            continue;

         LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + i, 0);

         so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);

         /* The SGPR holds the offset in dwords; convert to bytes. */
         LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->streamout_offset[i]);
         so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");

         so_write_offset[i] = ac_build_imad(
            &ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, so->stride[i] * 4, 0), so_offset);
      }

      /* Write streamout data. */
      for (i = 0; i < so->num_outputs; i++) {
         unsigned reg = so->output[i].register_index;

         /* Skip outputs the shader doesn't actually write. */
         if (reg >= noutput)
            continue;

         /* Only emit outputs belonging to the requested vertex stream. */
         if (stream != so->output[i].stream)
            continue;

         si_llvm_streamout_store_output(ctx, so_buffers, so_write_offset, &so->output[i],
                                        &outputs[reg]);
      }
   }
   ac_build_endif(&ctx->ac, 6501);
}
364 
/* Compute clip distances from a written CLIP_VERTEX output and fill the
 * corresponding position-export slots (pos[2] and pos[3]).
 *
 * Each enabled clip distance is the dot product of the clip-vertex position
 * (out_elts[0..3]) with a user clip plane read from the clip-planes constant
 * buffer. Distances masked out by kill_clip_distances are skipped.
 */
static void si_llvm_emit_clipvertex(struct si_shader_context *ctx, struct ac_export_args *pos,
                                    LLVMValueRef *out_elts)
{
   unsigned reg_index;
   unsigned chan;
   unsigned const_chan;
   LLVMValueRef base_elt;
   LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
   LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_CLIP_PLANES, 0);
   LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
   unsigned clipdist_mask = ctx->shader->selector->clipdist_mask &
                            ~ctx->shader->key.opt.kill_clip_distances;

   /* Two exports of 4 clip distances each (CLIP_DIST0 / CLIP_DIST1). */
   for (reg_index = 0; reg_index < 2; reg_index++) {
      struct ac_export_args *args = &pos[2 + reg_index];

      /* Skip the export entirely if none of its 4 distances are enabled. */
      if (!(clipdist_mask & BITFIELD_RANGE(reg_index * 4, 4)))
         continue;

      args->out[0] = args->out[1] = args->out[2] = args->out[3] = LLVMGetUndef(ctx->ac.f32);

      /* Compute dot products of position and user clip plane vectors */
      for (chan = 0; chan < 4; chan++) {
         if (!(clipdist_mask & BITFIELD_BIT(reg_index * 4 + chan)))
            continue;

         for (const_chan = 0; const_chan < 4; const_chan++) {
            /* Byte offset of plane[reg_index*4 + chan] component const_chan
             * in the constant buffer (4 bytes per float). */
            LLVMValueRef addr =
               LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 + const_chan) * 4, 0);
            base_elt = si_buffer_load_const(ctx, const_resource, addr);
            /* Accumulate the dot product; the first term starts from 0. */
            args->out[chan] =
               ac_build_fmad(&ctx->ac, base_elt, out_elts[const_chan],
                             const_chan == 0 ? ctx->ac.f32_0 : args->out[chan]);
         }
      }

      args->enabled_channels = 0xf;
      args->valid_mask = 0;
      args->done = 0;
      args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
      args->compr = 0;
   }
}
408 
409 /* Initialize arguments for the shader export intrinsic */
si_llvm_init_vs_export_args(struct si_shader_context * ctx,LLVMValueRef * values,unsigned target,struct ac_export_args * args)410 static void si_llvm_init_vs_export_args(struct si_shader_context *ctx, LLVMValueRef *values,
411                                         unsigned target, struct ac_export_args *args)
412 {
413    args->enabled_channels = 0xf; /* writemask - default is 0xf */
414    args->valid_mask = 0;         /* Specify whether the EXEC mask represents the valid mask */
415    args->done = 0;               /* Specify whether this is the last export */
416    args->target = target;        /* Specify the target we are exporting */
417    args->compr = false;
418 
419    memcpy(&args->out[0], values, sizeof(values[0]) * 4);
420 }
421 
/* Emit one generic (PARAM) export at the given parameter index. */
static void si_export_param(struct si_shader_context *ctx, unsigned index, LLVMValueRef *values)
{
   struct ac_export_args export_args;

   si_llvm_init_vs_export_args(ctx, values, V_008DFC_SQ_EXP_PARAM + index, &export_args);
   ac_build_export(&ctx->ac, &export_args);
}
429 
/* Emit PARAM exports for all outputs that the fragment shader may read,
 * assign each exported output a parameter slot in
 * shader->info.vs_output_param_offset, and record the export count.
 *
 * Outputs that only feed non-zero vertex streams, semantics that cannot be
 * FS inputs, and outputs killed by the shader key are skipped.
 */
static void si_build_param_exports(struct si_shader_context *ctx,
                                   struct si_shader_output_values *outputs, unsigned noutput)
{
   struct si_shader *shader = ctx->shader;
   unsigned param_count = 0;

   for (unsigned i = 0; i < noutput; i++) {
      unsigned semantic = outputs[i].semantic;

      /* Skip outputs that go to no channel of vertex stream 0 (only stream 0
       * is rasterized). */
      if (outputs[i].vertex_stream[0] != 0 && outputs[i].vertex_stream[1] != 0 &&
          outputs[i].vertex_stream[2] != 0 && outputs[i].vertex_stream[3] != 0)
         continue;

      /* Only semantics that a fragment shader can consume are exported as
       * parameters; everything else is skipped. */
      switch (semantic) {
      case VARYING_SLOT_LAYER:
      case VARYING_SLOT_VIEWPORT:
      case VARYING_SLOT_CLIP_DIST0:
      case VARYING_SLOT_CLIP_DIST1:
      case VARYING_SLOT_COL0:
      case VARYING_SLOT_COL1:
      case VARYING_SLOT_BFC0:
      case VARYING_SLOT_BFC1:
      case VARYING_SLOT_PRIMITIVE_ID:
      case VARYING_SLOT_FOGC:
         break;
      default:
         if ((semantic >= VARYING_SLOT_TEX0 && semantic <= VARYING_SLOT_TEX7) ||
             semantic >= VARYING_SLOT_VAR0)
            break;
         else
            continue;
      }

      /* Skip outputs the shader key marks as unused by the next stage. */
      if (semantic < VARYING_SLOT_VAR0 + SI_MAX_IO_GENERIC &&
          shader->key.opt.kill_outputs &
             (1ull << si_shader_io_get_unique_index(semantic, true)))
         continue;

      si_export_param(ctx, param_count, outputs[i].values);

      assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
      shader->info.vs_output_param_offset[i] = param_count++;
   }

   shader->info.nr_param_exports = param_count;
}
476 
477 /**
478  * Vertex color clamping.
479  *
480  * This uses a state constant loaded in a user data SGPR and
481  * an IF statement is added that clamps all colors if the constant
482  * is true.
483  */
si_vertex_color_clamping(struct si_shader_context * ctx,struct si_shader_output_values * outputs,unsigned noutput)484 static void si_vertex_color_clamping(struct si_shader_context *ctx,
485                                      struct si_shader_output_values *outputs, unsigned noutput)
486 {
487    LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4];
488    bool has_colors = false;
489 
490    /* Store original colors to alloca variables. */
491    for (unsigned i = 0; i < noutput; i++) {
492       if (outputs[i].semantic != VARYING_SLOT_COL0 &&
493           outputs[i].semantic != VARYING_SLOT_COL1 &&
494           outputs[i].semantic != VARYING_SLOT_BFC0 &&
495           outputs[i].semantic != VARYING_SLOT_BFC1)
496          continue;
497 
498       for (unsigned j = 0; j < 4; j++) {
499          addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
500          LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]);
501       }
502       has_colors = true;
503    }
504 
505    if (!has_colors)
506       return;
507 
508    /* The state is in the first bit of the user SGPR. */
509    LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
510    cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, "");
511 
512    ac_build_ifcc(&ctx->ac, cond, 6502);
513 
514    /* Store clamped colors to alloca variables within the conditional block. */
515    for (unsigned i = 0; i < noutput; i++) {
516       if (outputs[i].semantic != VARYING_SLOT_COL0 &&
517           outputs[i].semantic != VARYING_SLOT_COL1 &&
518           outputs[i].semantic != VARYING_SLOT_BFC0 &&
519           outputs[i].semantic != VARYING_SLOT_BFC1)
520          continue;
521 
522       for (unsigned j = 0; j < 4; j++) {
523          LLVMBuildStore(ctx->ac.builder, ac_build_clamp(&ctx->ac, outputs[i].values[j]),
524                         addr[i][j]);
525       }
526    }
527    ac_build_endif(&ctx->ac, 6502);
528 
529    /* Load clamped colors */
530    for (unsigned i = 0; i < noutput; i++) {
531       if (outputs[i].semantic != VARYING_SLOT_COL0 &&
532           outputs[i].semantic != VARYING_SLOT_COL1 &&
533           outputs[i].semantic != VARYING_SLOT_BFC0 &&
534           outputs[i].semantic != VARYING_SLOT_BFC1)
535          continue;
536 
537       for (unsigned j = 0; j < 4; j++) {
538          outputs[i].values[j] = LLVMBuildLoad(ctx->ac.builder, addr[i][j], "");
539       }
540    }
541 }
542 
/* Generate export instructions for hardware VS shader stage or NGG GS stage
 * (position and parameter data only).
 *
 * Builds up to four position exports (POS, the misc vector, and two
 * clip-distance vectors), fixes them up for chip-specific quirks, emits
 * them, and then emits the parameter exports.
 */
void si_llvm_build_vs_exports(struct si_shader_context *ctx,
                              struct si_shader_output_values *outputs, unsigned noutput)
{
   struct si_shader *shader = ctx->shader;
   struct ac_export_args pos_args[4] = {};
   LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL,
                viewport_index_value = NULL;
   unsigned pos_idx, index;
   /* Clip distances to export: selector mask minus key-killed ones, plus
    * cull distances. */
   unsigned clipdist_mask = (shader->selector->clipdist_mask &
                             ~shader->key.opt.kill_clip_distances) |
                            shader->selector->culldist_mask;
   int i;

   si_vertex_color_clamping(ctx, outputs, noutput);

   /* Build position exports. */
   for (i = 0; i < noutput; i++) {
      switch (outputs[i].semantic) {
      case VARYING_SLOT_POS:
         si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS, &pos_args[0]);
         break;
      case VARYING_SLOT_PSIZ:
         psize_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_LAYER:
         layer_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_VIEWPORT:
         viewport_index_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_EDGE:
         edgeflag_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_CLIP_DIST0:
      case VARYING_SLOT_CLIP_DIST1:
         index = outputs[i].semantic - VARYING_SLOT_CLIP_DIST0;
         if (clipdist_mask & BITFIELD_RANGE(index * 4, 4)) {
            si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS + 2 + index,
                                        &pos_args[2 + index]);
         }
         break;
      case VARYING_SLOT_CLIP_VERTEX:
         si_llvm_emit_clipvertex(ctx, pos_args, outputs[i].values);
         break;
      }
   }

   /* We need to add the position output manually if it's missing. */
   if (!pos_args[0].out[0]) {
      pos_args[0].enabled_channels = 0xf; /* writemask */
      pos_args[0].valid_mask = 0;         /* EXEC mask */
      pos_args[0].done = 0;               /* last export? */
      pos_args[0].target = V_008DFC_SQ_EXP_POS;
      pos_args[0].compr = 0;              /* COMPR flag */
      pos_args[0].out[0] = ctx->ac.f32_0; /* X */
      pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
      pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
      pos_args[0].out[3] = ctx->ac.f32_1; /* W */
   }

   bool writes_psize = shader->selector->info.writes_psize && !shader->key.opt.kill_pointsize;
   bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag && !shader->key.as_ngg;

   /* Write the misc vector (point size, edgeflag, layer, viewport). */
   if (writes_psize || pos_writes_edgeflag ||
       shader->selector->info.writes_viewport_index || shader->selector->info.writes_layer) {
      /* Channel mask: X = psize, Y = edgeflag, Z = layer. The viewport
       * channel is OR'ed in below depending on the chip generation. */
      pos_args[1].enabled_channels = writes_psize |
                                     (pos_writes_edgeflag << 1) |
                                     (shader->selector->info.writes_layer << 2);

      pos_args[1].valid_mask = 0; /* EXEC mask */
      pos_args[1].done = 0;       /* last export? */
      pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
      pos_args[1].compr = 0;              /* COMPR flag */
      pos_args[1].out[0] = ctx->ac.f32_0; /* X */
      pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
      pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
      pos_args[1].out[3] = ctx->ac.f32_0; /* W */

      if (writes_psize)
         pos_args[1].out[0] = psize_value;

      if (pos_writes_edgeflag) {
         /* The output is a float, but the hw expects an integer
          * with the first bit containing the edge flag. */
         edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, edgeflag_value, ctx->ac.i32, "");
         edgeflag_value = ac_build_umin(&ctx->ac, edgeflag_value, ctx->ac.i32_1);

         /* The LLVM intrinsic expects a float. */
         pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
      }

      if (ctx->screen->info.chip_class >= GFX9) {
         /* GFX9 has the layer in out.z[10:0] and the viewport
          * index in out.z[19:16].
          */
         if (shader->selector->info.writes_layer)
            pos_args[1].out[2] = layer_value;

         if (shader->selector->info.writes_viewport_index) {
            LLVMValueRef v = viewport_index_value;

            /* Merge the viewport index into bits [19:16] of channel Z. */
            v = ac_to_integer(&ctx->ac, v);
            v = LLVMBuildShl(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 16, 0), "");
            v = LLVMBuildOr(ctx->ac.builder, v, ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
            pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
            pos_args[1].enabled_channels |= 1 << 2;
         }
      } else {
         /* Pre-GFX9: layer in Z, viewport index in W. */
         if (shader->selector->info.writes_layer)
            pos_args[1].out[2] = layer_value;

         if (shader->selector->info.writes_viewport_index) {
            pos_args[1].out[3] = viewport_index_value;
            pos_args[1].enabled_channels |= 1 << 3;
         }
      }
   }

   /* Count the populated position exports (out[0] is NULL for unused ones). */
   for (i = 0; i < 4; i++)
      if (pos_args[i].out[0])
         shader->info.nr_pos_exports++;

   /* GFX10 (Navi1x) skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
    * Setting valid_mask=1 prevents it and has no other effect.
    */
   if (ctx->screen->info.chip_class == GFX10)
      pos_args[0].valid_mask = 1;

   /* Emit the populated position exports with sequential POS targets,
    * flagging the last one as "done". */
   pos_idx = 0;
   for (i = 0; i < 4; i++) {
      if (!pos_args[i].out[0])
         continue;

      /* Specify the target we are exporting */
      pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;

      if (pos_idx == shader->info.nr_pos_exports)
         /* Specify that this is the last export */
         pos_args[i].done = 1;

      ac_build_export(&ctx->ac, &pos_args[i]);
   }

   /* Build parameter exports. */
   si_build_param_exports(ctx, outputs, noutput);
}
693 
/* VS epilogue: gather all output values from their allocas, optionally emit
 * streamout, optionally append a PrimitiveID output, and emit all exports.
 *
 * addrs holds one alloca per output channel (4 per output). One extra
 * outputs[] slot is allocated for the optional PrimitiveID export.
 */
void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info *info = &ctx->shader->selector->info;
   struct si_shader_output_values *outputs = NULL;
   int i, j;

   assert(!ctx->shader->is_gs_copy_shader);
   assert(info->num_outputs <= max_outputs);

   /* +1 leaves room for the optional PrimitiveID output appended below. */
   outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

   for (i = 0; i < info->num_outputs; i++) {
      outputs[i].semantic = info->output_semantic[i];

      for (j = 0; j < 4; j++) {
         outputs[i].values[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
         /* 2 bits per channel encode the vertex stream of each component. */
         outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3;
      }
   }

   /* Legacy (non-NGG) streamout is emitted here; note that i still holds
    * the output count after the loop above. */
   if (!ctx->screen->use_ngg_streamout && ctx->shader->selector->so.num_outputs)
      si_llvm_emit_streamout(ctx, outputs, i, 0);

   /* Export PrimitiveID. */
   if (ctx->shader->key.mono.u.vs_export_prim_id) {
      outputs[i].semantic = VARYING_SLOT_PRIMITIVE_ID;
      outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
      for (j = 1; j < 4; j++)
         outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0);

      memset(outputs[i].vertex_stream, 0, sizeof(outputs[i].vertex_stream));
      i++;
   }

   si_llvm_build_vs_exports(ctx, outputs, i);
   FREE(outputs);
}
732 
static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
                                                  LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info *info = &ctx->shader->selector->info;
   LLVMValueRef pos[4] = {};

   assert(info->num_outputs <= max_outputs);

   /* Find the position output and load its 4 channels. */
   for (unsigned out = 0; out < info->num_outputs; out++) {
      if (info->output_semantic[out] == VARYING_SLOT_POS) {
         for (unsigned chan = 0; chan < 4; chan++)
            pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * out + chan], "");
         break;
      }
   }
   assert(pos[0] != NULL);

   /* Return the position in the first 4 elements of the return value. */
   LLVMValueRef ret = ctx->return_value;
   for (unsigned chan = 0; chan < 4; chan++)
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
   ctx->return_value = ret;
}
758 
759 /**
760  * Build the vertex shader prolog function.
761  *
762  * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
763  * All inputs are returned unmodified. The vertex load indices are
764  * stored after them, which will be used by the API VS for fetching inputs.
765  *
766  * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
767  *   input_v0,
768  *   input_v1,
769  *   input_v2,
770  *   input_v3,
771  *   (VertexID + BaseVertex),
772  *   (InstanceID + StartInstance),
773  *   (InstanceID / 2 + StartInstance)
774  */
void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
{
   LLVMTypeRef *returns;
   LLVMValueRef ret, func;
   int num_returns, i;
   unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
   /* 4 VS system-value VGPRs follow any VGPRs owned by a merged next stage. */
   unsigned num_input_vgprs =
      key->vs_prolog.num_merged_next_stage_vgprs + 4;
   struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
   struct ac_arg input_vgpr_param[10];
   LLVMValueRef input_vgprs[10];
   unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + num_input_vgprs;
   /* With merged shader stages, the first 8 SGPRs are reserved, so user
    * SGPRs start at 8; otherwise they start at 0.
    */
   unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;

   /* Start from a clean argument list for this standalone prolog function. */
   memset(&ctx->args, 0, sizeof(ctx->args));

   /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
   returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) * sizeof(LLVMTypeRef));
   num_returns = 0;

   /* Declare input and output SGPRs. */
   for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &input_sgpr_param[i]);
      returns[num_returns++] = ctx->ac.i32;
   }

   /* SGPR3 holds the merged wave info when stages are merged — see the
    * si_init_exec_from_input use below. (Only meaningful in that case.)
    */
   struct ac_arg merged_wave_info = input_sgpr_param[3];

   /* Preloaded VGPRs (outputs must be floats) */
   for (i = 0; i < num_input_vgprs; i++) {
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
      returns[num_returns++] = ctx->ac.f32;
   }

   /* Vertex load indices. */
   for (i = 0; i < key->vs_prolog.num_inputs; i++)
      returns[num_returns++] = ctx->ac.f32;

   /* Create the function. */
   si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0);
   func = ctx->main_fn;

   for (i = 0; i < num_input_vgprs; i++) {
      input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]);
   }

   if (key->vs_prolog.num_merged_next_stage_vgprs) {
      /* For non-monolithic merged shaders, initialize EXEC from the wave
       * info SGPR before touching per-lane values.
       */
      if (!key->vs_prolog.is_monolithic)
         si_init_exec_from_input(ctx, merged_wave_info, 0);

      if (key->vs_prolog.as_ls && ctx->screen->info.has_ls_vgpr_init_bug) {
         /* If there are no HS threads, SPI loads the LS VGPRs
          * starting at VGPR 0. Shift them back to where they
          * belong.
          */
         /* Bits [8..15] of the merged wave info = HS thread count. */
         LLVMValueRef has_hs_threads =
            LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
                          si_unpack_param(ctx, input_sgpr_param[3], 8, 8), ctx->ac.i32_0, "");

         /* Shift VGPRs [0..3] up into [2..5]. Iterating downward writes
          * slots 5,4,3,2 while reading 3,2,1,0, so each slot is read
          * before it is overwritten.
          */
         for (i = 4; i > 0; --i) {
            input_vgprs[i + 1] = LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
                                                 input_vgprs[i + 1], input_vgprs[i - 1], "");
         }
      }
   }

   if (key->vs_prolog.gs_fast_launch_tri_list || key->vs_prolog.gs_fast_launch_tri_strip) {
      LLVMValueRef wave_id, thread_id_in_tg;

      /* Bits [24..27] of SGPR3 = wave ID within the threadgroup. */
      wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4);
      /* thread_id_in_tg = wave_id * wave_size + lane_id */
      thread_id_in_tg =
         ac_build_imad(&ctx->ac, wave_id, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false),
                       ac_get_thread_id(&ctx->ac));

      /* The GS fast launch initializes all VGPRs to the value of
       * the first thread, so we have to add the thread ID.
       *
       * Only these are initialized by the hw:
       *   VGPR2: Base Primitive ID
       *   VGPR5: Base Vertex ID
       *   VGPR6: Instance ID
       */

      /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
       * The NGG cull shader will read them from there.
       */
      if (key->vs_prolog.gs_fast_launch_tri_list) {
         /* Triangle list: vertex i of triangle t is thread 3*t + i. */
         input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg,       /* gs_vtx01_offset */
                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */
                                        LLVMConstInt(ctx->ac.i32, 0, 0));
         input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg,       /* gs_vtx23_offset */
                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */
                                        LLVMConstInt(ctx->ac.i32, 1, 0));
         input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg,       /* gs_vtx45_offset */
                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */
                                        LLVMConstInt(ctx->ac.i32, 2, 0));
      } else {
         assert(key->vs_prolog.gs_fast_launch_tri_strip);
         LLVMBuilderRef builder = ctx->ac.builder;
         /* Triangle indices: */
         LLVMValueRef index[3] = {
            thread_id_in_tg,
            LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 1, 0), ""),
            LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 2, 0), ""),
         };
         /* Odd strip triangles have reversed winding; the helper fixes
          * the vertex order. Bits [4..5] of vs_state_bits == 0 means
          * flat-shading provoking vertex is the first vertex.
          */
         LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder, thread_id_in_tg, ctx->ac.i1, "");
         LLVMValueRef flatshade_first = LLVMBuildICmp(
            builder, LLVMIntEQ, si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), ctx->ac.i32_0, "");

         ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, index);
         input_vgprs[0] = index[0];
         input_vgprs[1] = index[1];
         input_vgprs[4] = index[2];
      }

      /* Triangles always have all edge flags set initially. */
      input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0);

      /* Hw only loaded the base IDs (first thread); make them per-thread. */
      input_vgprs[2] =
         LLVMBuildAdd(ctx->ac.builder, input_vgprs[2], thread_id_in_tg, ""); /* PrimID */
      input_vgprs[5] =
         LLVMBuildAdd(ctx->ac.builder, input_vgprs[5], thread_id_in_tg, ""); /* VertexID */
      input_vgprs[8] = input_vgprs[6];                                       /* InstanceID */
   }

   /* VGPR layout differs by chip and LS mode: VertexID is the first VS
    * VGPR; InstanceID is at +3 on GFX10+, else +2 (LS) or +1.
    */
   unsigned vertex_id_vgpr = first_vs_vgpr;
   unsigned instance_id_vgpr = ctx->screen->info.chip_class >= GFX10
                                  ? first_vs_vgpr + 3
                                  : first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);

   ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
   ctx->abi.instance_id = input_vgprs[instance_id_vgpr];

   /* InstanceID = VertexID >> 16;
    * VertexID   = VertexID & 0xffff;
    */
   if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) {
      ctx->abi.instance_id =
         LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id, LLVMConstInt(ctx->ac.i32, 16, 0), "");
      ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id,
                                        LLVMConstInt(ctx->ac.i32, 0xffff, 0), "");
   }

   /* Copy inputs to outputs. This should be no-op, as the registers match,
    * but it will prevent the compiler from overwriting them unintentionally.
    */
   ret = ctx->return_value;
   for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
      LLVMValueRef p = LLVMGetParam(func, i);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
   }
   for (i = 0; i < num_input_vgprs; i++) {
      LLVMValueRef p = input_vgprs[i];

      /* Return the possibly unpacked/overridden IDs instead of the raw VGPRs. */
      if (i == vertex_id_vgpr)
         p = ctx->abi.vertex_id;
      else if (i == instance_id_vgpr)
         p = ctx->abi.instance_id;

      p = ac_to_float(&ctx->ac, p);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, key->vs_prolog.num_input_sgprs + i, "");
   }

   /* Compute vertex load indices from instance divisors. */
   LLVMValueRef instance_divisor_constbuf = NULL;

   /* Divisors that are neither 0 nor 1 are fetched from a constant buffer. */
   if (key->vs_prolog.states.instance_divisor_is_fetched) {
      LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
      LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
      instance_divisor_constbuf = ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
   }

   for (i = 0; i < key->vs_prolog.num_inputs; i++) {
      bool divisor_is_one = key->vs_prolog.states.instance_divisor_is_one & (1u << i);
      bool divisor_is_fetched = key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
      LLVMValueRef index = NULL;

      if (divisor_is_one) {
         index = ctx->abi.instance_id;
      } else if (divisor_is_fetched) {
         /* Each divisor occupies 16 bytes: 4 precomputed udiv factors. */
         LLVMValueRef udiv_factors[4];

         for (unsigned j = 0; j < 4; j++) {
            udiv_factors[j] = si_buffer_load_const(ctx, instance_divisor_constbuf,
                                                   LLVMConstInt(ctx->ac.i32, i * 16 + j * 4, 0));
            udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
         }
         /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
          * Such InstanceID might not be achievable in a reasonable time though.
          */
         index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, udiv_factors[0],
                                        udiv_factors[1], udiv_factors[2], udiv_factors[3]);
      }

      if (divisor_is_one || divisor_is_fetched) {
         /* Add StartInstance. */
         index =
            LLVMBuildAdd(ctx->ac.builder, index,
                         LLVMGetParam(ctx->main_fn, user_sgpr_base + SI_SGPR_START_INSTANCE), "");
      } else {
         /* VertexID + BaseVertex */
         index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
                              LLVMGetParam(func, user_sgpr_base + SI_SGPR_BASE_VERTEX), "");
      }

      /* Outputs are declared f32; bitcast the integer index. */
      index = ac_to_float(&ctx->ac, index);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, ctx->args.arg_count + i, "");
   }

   si_llvm_build_ret(ctx, ret);
}
986 
get_base_vertex(struct ac_shader_abi * abi)987 static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi)
988 {
989    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
990 
991    /* For non-indexed draws, the base vertex set by the driver
992     * (for direct draws) or the CP (for indirect draws) is the
993     * first vertex ID, but GLSL expects 0 to be returned.
994     */
995    LLVMValueRef vs_state = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
996    LLVMValueRef indexed;
997 
998    indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->ac.i32_1, "");
999    indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, "");
1000 
1001    return LLVMBuildSelect(ctx->ac.builder, indexed, ac_get_arg(&ctx->ac, ctx->args.base_vertex),
1002                           ctx->ac.i32_0, "");
1003 }
1004 
void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
{
   struct si_shader *shader = ctx->shader;
   void (*emit_outputs)(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);

   /* Pick the epilogue matching the stage the VS is compiled as. The order
    * matters: LS/ES (merged stages) take precedence, then the primitive
    * discard compute shader, then NGG variants, then plain VS.
    */
   if (shader->key.as_ls)
      emit_outputs = si_llvm_emit_ls_epilogue;
   else if (shader->key.as_es)
      emit_outputs = si_llvm_emit_es_epilogue;
   else if (shader->key.opt.vs_as_prim_discard_cs)
      emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
   else if (ngg_cull_shader)
      emit_outputs = gfx10_emit_ngg_culling_epilogue;
   else if (shader->key.as_ngg)
      emit_outputs = gfx10_emit_ngg_epilogue;
   else
      emit_outputs = si_llvm_emit_vs_epilogue;

   ctx->abi.emit_outputs = emit_outputs;
   ctx->abi.load_base_vertex = get_base_vertex;
}
1024