• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2020 Advanced Micro Devices, Inc.
3  * All Rights Reserved.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * on the rights to use, copy, modify, merge, publish, distribute, sub
9  * license, and/or sell copies of the Software, and to permit persons to whom
10  * the Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
23  */
24 
25 #include "si_pipe.h"
26 #include "si_shader_internal.h"
27 #include "sid.h"
28 
si_get_rel_patch_id(struct si_shader_context * ctx)29 LLVMValueRef si_get_rel_patch_id(struct si_shader_context *ctx)
30 {
31    switch (ctx->stage) {
32    case MESA_SHADER_TESS_CTRL:
33       return si_unpack_param(ctx, ctx->args.tcs_rel_ids, 0, 8);
34 
35    case MESA_SHADER_TESS_EVAL:
36       return ac_get_arg(&ctx->ac, ctx->args.tes_rel_patch_id);
37 
38    default:
39       assert(0);
40       return NULL;
41    }
42 }
43 
44 /* Tessellation shaders pass outputs to the next shader using LDS.
45  *
46  * LS outputs = TCS inputs
47  * TCS outputs = TES inputs
48  *
49  * The LDS layout is:
50  * - TCS inputs for patch 0
51  * - TCS inputs for patch 1
52  * - TCS inputs for patch 2             = get_tcs_in_current_patch_offset (if RelPatchID==2)
53  * - ...
54  * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
55  * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
56  * - TCS outputs for patch 1
57  * - Per-patch TCS outputs for patch 1
58  * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
59  * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
60  * - ...
61  *
62  * All three shaders VS(LS), TCS, TES share the same LDS space.
63  */
64 
get_tcs_out_vertex_dw_stride_constant(struct si_shader_context * ctx)65 static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx)
66 {
67    assert(ctx->stage == MESA_SHADER_TESS_CTRL);
68 
69    return util_last_bit64(ctx->shader->selector->info.outputs_written) * 4;
70 }
71 
get_tcs_out_patch_stride(struct si_shader_context * ctx)72 static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx)
73 {
74    const struct si_shader_info *info = &ctx->shader->selector->info;
75    unsigned tcs_out_vertices = info->base.tess.tcs_vertices_out;
76    unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx);
77    unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->info.patch_outputs_written);
78    unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride + num_patch_outputs * 4;
79    return LLVMConstInt(ctx->ac.i32, patch_dw_stride, 0);
80 }
81 
get_tcs_out_patch0_patch_data_offset(struct si_shader_context * ctx)82 static LLVMValueRef get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
83 {
84    return si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 16, 16);
85 }
86 
get_tcs_out_current_patch_data_offset(struct si_shader_context * ctx)87 static LLVMValueRef get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
88 {
89    LLVMValueRef patch0_patch_data_offset = get_tcs_out_patch0_patch_data_offset(ctx);
90    LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
91    LLVMValueRef rel_patch_id = si_get_rel_patch_id(ctx);
92 
93    return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset);
94 }
95 
si_get_num_tcs_out_vertices(struct si_shader_context * ctx)96 LLVMValueRef si_get_num_tcs_out_vertices(struct si_shader_context *ctx)
97 {
98    unsigned tcs_out_vertices =
99       ctx->shader->selector ? ctx->shader->selector->info.base.tess.tcs_vertices_out
100                             : 0;
101 
102    /* If !tcs_out_vertices, it's the TCS epilog. */
103    if (ctx->stage == MESA_SHADER_TESS_CTRL && tcs_out_vertices)
104       return LLVMConstInt(ctx->ac.i32, tcs_out_vertices, 0);
105 
106    return LLVMBuildAdd(ctx->ac.builder,
107                        si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 5), ctx->ac.i32_1, "");
108 }
109 
si_get_tcs_in_vertex_dw_stride(struct si_shader_context * ctx)110 LLVMValueRef si_get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
111 {
112    unsigned stride;
113 
114    switch (ctx->stage) {
115    case MESA_SHADER_VERTEX:
116       stride = ctx->shader->selector->info.lshs_vertex_stride / 4;
117       return LLVMConstInt(ctx->ac.i32, stride, 0);
118 
119    case MESA_SHADER_TESS_CTRL:
120       if (ctx->screen->info.gfx_level >= GFX9 && ctx->shader->is_monolithic) {
121          stride = ctx->shader->key.ge.part.tcs.ls->info.lshs_vertex_stride / 4;
122          return LLVMConstInt(ctx->ac.i32, stride, 0);
123       }
124       return GET_FIELD(ctx, VS_STATE_LS_OUT_VERTEX_SIZE);
125 
126    default:
127       assert(0);
128       return NULL;
129    }
130 }
131 
132 /* The offchip buffer layout for TCS->TES is
133  *
134  * - attribute 0 of patch 0 vertex 0
135  * - attribute 0 of patch 0 vertex 1
136  * - attribute 0 of patch 0 vertex 2
137  *   ...
138  * - attribute 0 of patch 1 vertex 0
139  * - attribute 0 of patch 1 vertex 1
140  *   ...
141  * - attribute 1 of patch 0 vertex 0
142  * - attribute 1 of patch 0 vertex 1
143  *   ...
144  * - per patch attribute 0 of patch 0
145  * - per patch attribute 0 of patch 1
146  *   ...
147  *
148  * Note that every attribute has 4 components.
149  */
/**
 * Compute the byte address of an attribute in the TCS->TES offchip buffer.
 *
 * \param rel_patch_id  relative patch ID
 * \param vertex_index  vertex within the patch, or NULL for a per-patch attribute
 * \param param_index   attribute slot
 *
 * Per-vertex:  ((rel_patch_id * vertices_per_patch + vertex_index) +
 *               param_index * total_vertices) * 16
 * Per-patch:   (rel_patch_id + param_index * num_patches) * 16 +
 *              the value read from tcs_offchip_layout with
 *              si_unpack_param(..., 11, 21)
 */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
                                               LLVMValueRef rel_patch_id, LLVMValueRef vertex_index,
                                               LLVMValueRef param_index)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   const bool per_vertex = vertex_index != NULL;

   LLVMValueRef verts_per_patch = si_get_num_tcs_out_vertices(ctx);
   LLVMValueRef patch_count = si_unpack_param(ctx, ctx->tcs_offchip_layout, 0, 6);
   patch_count = LLVMBuildAdd(builder, patch_count, ctx->ac.i32_1, "");
   LLVMValueRef vert_count = LLVMBuildMul(builder, verts_per_patch, patch_count, "");

   /* Element index inside one attribute, and the element count per attribute. */
   LLVMValueRef slot, attrib_stride;
   if (per_vertex) {
      slot = ac_build_imad(&ctx->ac, rel_patch_id, verts_per_patch, vertex_index);
      attrib_stride = vert_count;
   } else {
      slot = rel_patch_id;
      attrib_stride = patch_count;
   }

   /* Every element is a vec4, i.e. 16 bytes. */
   LLVMValueRef addr = ac_build_imad(&ctx->ac, param_index, attrib_stride, slot);
   addr = LLVMBuildMul(builder, addr, LLVMConstInt(ctx->ac.i32, 16, 0), "");

   if (per_vertex)
      return addr;

   LLVMValueRef patch_data_offset = si_unpack_param(ctx, ctx->tcs_offchip_layout, 11, 21);

   return LLVMBuildAdd(builder, addr, patch_data_offset, "");
}
181 
182 /**
183  * Load from LSHS LDS storage.
184  *
185  * \param type     output value type
186  * \param swizzle  offset (typically 0..3); it can be ~0, which loads a vec4
187  * \param dw_addr  address in dwords
188  */
lshs_lds_load(struct si_shader_context * ctx,LLVMTypeRef type,unsigned swizzle,LLVMValueRef dw_addr)189 static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx, LLVMTypeRef type, unsigned swizzle,
190                                   LLVMValueRef dw_addr)
191 {
192    LLVMValueRef value;
193 
194    if (swizzle == ~0) {
195       LLVMValueRef values[4];
196 
197       for (unsigned chan = 0; chan < 4; chan++)
198          values[chan] = lshs_lds_load(ctx, type, chan, dw_addr);
199 
200       return ac_build_gather_values(&ctx->ac, values, 4);
201    }
202 
203    dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, LLVMConstInt(ctx->ac.i32, swizzle, 0), "");
204    value = ac_lds_load(&ctx->ac, dw_addr);
205    return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
206 }
207 
/* Selects which tessellation ring get_tess_ring_descriptor() describes. */
enum si_tess_ring {
   TCS_FACTOR_RING = 0,       /* tess factor ring, addressed from the TCS */
   TESS_OFFCHIP_RING_TCS = 1, /* offchip ring, addressed from the TCS */
   TESS_OFFCHIP_RING_TES = 2, /* offchip ring, addressed from the TES */
};
214 
get_tess_ring_descriptor(struct si_shader_context * ctx,enum si_tess_ring ring)215 static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx, enum si_tess_ring ring)
216 {
217    LLVMBuilderRef builder = ctx->ac.builder;
218    LLVMValueRef addr = ac_get_arg(
219       &ctx->ac, ring == TESS_OFFCHIP_RING_TES ? ctx->tes_offchip_addr : ctx->tcs_out_lds_layout);
220 
221    /* TCS only receives high 13 bits of the address. */
222    if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) {
223       addr = LLVMBuildAnd(builder, addr, LLVMConstInt(ctx->ac.i32, 0xfff80000, 0), "");
224    }
225 
226    if (ring == TCS_FACTOR_RING) {
227       unsigned tf_offset = ctx->screen->hs.tess_offchip_ring_size;
228       addr = LLVMBuildAdd(builder, addr, LLVMConstInt(ctx->ac.i32, tf_offset, 0), "");
229    }
230 
231    uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
232                     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
233 
234    if (ctx->screen->info.gfx_level >= GFX11)
235       rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) |
236                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
237    else if (ctx->screen->info.gfx_level >= GFX10)
238       rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
239                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
240    else
241       rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
242                S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
243 
244    LLVMValueRef desc[4];
245    desc[0] = addr;
246    desc[1] = LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
247    desc[2] = LLVMConstInt(ctx->ac.i32, 0xffffffff, 0);
248    desc[3] = LLVMConstInt(ctx->ac.i32, rsrc3, false);
249 
250    return ac_build_gather_values(&ctx->ac, desc, 4);
251 }
252 
si_llvm_preload_tess_rings(struct si_shader_context * ctx)253 void si_llvm_preload_tess_rings(struct si_shader_context *ctx)
254 {
255    ctx->tess_offchip_ring = get_tess_ring_descriptor(
256       ctx, ctx->stage == MESA_SHADER_TESS_CTRL ? TESS_OFFCHIP_RING_TCS : TESS_OFFCHIP_RING_TES);
257 }
258 
/**
 * ABI callback that loads a TCS input varying.
 *
 * Only the variant where the TCS inputs arrive as function parameters is
 * implemented: key.ge.opt.same_patch_vertices must be set and indirect
 * indexing (a non-NULL param_index) is not supported; both are asserted.
 *
 * \param abi              shader ABI, used to recover the si_shader_context
 * \param type             LLVM type every loaded channel is bitcast to
 * \param vertex_index     not used by this implementation
 * \param param_index      must be NULL
 * \param driver_location  index into info->input[] used to look up the semantic
 * \param component        first channel to load
 * \param num_components   number of consecutive channels to load;
 *                         component + num_components must not exceed 4
 * \param load_input       not used by this implementation
 * \return the channels combined by ac_build_varying_gather_values
 */
static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, LLVMTypeRef type,
                                             LLVMValueRef vertex_index, LLVMValueRef param_index,
                                             unsigned driver_location, unsigned component,
                                             unsigned num_components, bool load_input)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info *info = &ctx->shader->selector->info;

   assert(ctx->shader->key.ge.opt.same_patch_vertices && !param_index);
   /* value[] below holds exactly one vec4; catch ranges that would overrun it. */
   assert(component + num_components <= 4);

   ubyte semantic = info->input[driver_location].semantic;
   /* Load the TCS input from a VGPR: the inputs follow the tcs_rel_ids
    * argument, four parameters per unique IO index.
    */
   unsigned func_param = ctx->args.tcs_rel_ids.arg_index + 1 +
      si_shader_io_get_unique_index(semantic, false) * 4;

   LLVMValueRef value[4];
   for (unsigned i = component; i < component + num_components; i++) {
      value[i] = LLVMGetParam(ctx->main_fn, func_param + i);
      value[i] = LLVMBuildBitCast(ctx->ac.builder, value[i], type, "");
   }

   return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
}
282 
/**
 * Emit the code that writes the tessellation factors of a patch to memory.
 *
 * The factors are taken either from the invoc0_tf_* values (when the shader
 * key says invocation 0 defines them) or loaded from LDS at
 * tcs_out_current_patch_data_offset. They are stored to the tess factor ring
 * and, if the key says the TES reads them, also to the offchip ring. All loads
 * and stores are wrapped in an "invocation_id == 0" conditional.
 *
 * \param key                                epilog key (only noop_s_barrier is read here)
 * \param rel_patch_id                       relative patch ID
 * \param invocation_id                      invocation ID within the patch
 * \param tcs_out_current_patch_data_offset  LDS dword offset of the per-patch outputs
 * \param invoc0_tf_outer                    outer factors from invocation 0
 * \param invoc0_tf_inner                    inner factors from invocation 0
 */
static void si_write_tess_factors(struct si_shader_context *ctx, union si_shader_part_key *key,
                                  LLVMValueRef rel_patch_id, LLVMValueRef invocation_id,
                                  LLVMValueRef tcs_out_current_patch_data_offset,
                                  LLVMValueRef invoc0_tf_outer[4], LLVMValueRef invoc0_tf_inner[2])
{
   struct si_shader *shader = ctx->shader;
   unsigned tess_inner_index, tess_outer_index;
   LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
   LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
   unsigned stride, outer_comps, inner_comps, i, offset;

   /* Determine the layout of one tess factor element in the buffer.
    *
    * This is done before any IR is emitted so that the early return for an
    * unknown primitive mode cannot leave a conditional block open (which
    * would produce malformed IR in builds where assert() is compiled out).
    */
   switch (shader->key.ge.part.tcs.epilog.prim_mode) {
   case TESS_PRIMITIVE_ISOLINES:
      stride = 2; /* 2 dwords, 1 vec2 store */
      outer_comps = 2;
      inner_comps = 0;
      break;
   case TESS_PRIMITIVE_TRIANGLES:
      stride = 4; /* 4 dwords, 1 vec4 store */
      outer_comps = 3;
      inner_comps = 1;
      break;
   case TESS_PRIMITIVE_QUADS:
      stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
      outer_comps = 4;
      inner_comps = 2;
      break;
   default:
      assert(0);
      return;
   }

   /* Add a barrier before loading tess factors from LDS. */
   if (!shader->key.ge.part.tcs.epilog.invoc0_tess_factors_are_def) {
      ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);

      if (!key->tcs_epilog.noop_s_barrier)
         ac_build_s_barrier(&ctx->ac, ctx->stage);
   }

   /* Do this only for invocation 0, because the tess levels are per-patch,
    * not per-vertex.
    *
    * This can't jump, because invocation 0 executes this. It should
    * at least mask out the loads and stores for other invocations.
    */
   ac_build_ifcc(&ctx->ac,
                 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, invocation_id, ctx->ac.i32_0, ""), 6503);

   for (i = 0; i < 4; i++) {
      inner[i] = LLVMGetUndef(ctx->ac.i32);
      outer[i] = LLVMGetUndef(ctx->ac.i32);
   }

   if (shader->key.ge.part.tcs.epilog.invoc0_tess_factors_are_def) {
      /* Tess factors are in VGPRs. */
      for (i = 0; i < outer_comps; i++)
         outer[i] = out[i] = invoc0_tf_outer[i];
      for (i = 0; i < inner_comps; i++)
         inner[i] = out[outer_comps + i] = invoc0_tf_inner[i];
   } else {
      /* Load tess_inner and tess_outer from LDS.
       * Any invocation can write them, so we can't get them from a temporary.
       */
      tess_inner_index = si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_INNER);
      tess_outer_index = si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_OUTER);

      lds_base = tcs_out_current_patch_data_offset;
      lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
                               LLVMConstInt(ctx->ac.i32, tess_inner_index * 4, 0), "");
      lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
                               LLVMConstInt(ctx->ac.i32, tess_outer_index * 4, 0), "");

      for (i = 0; i < outer_comps; i++) {
         outer[i] = out[i] = lshs_lds_load(ctx, ctx->ac.i32, i, lds_outer);
      }
      for (i = 0; i < inner_comps; i++) {
         inner[i] = out[outer_comps + i] = lshs_lds_load(ctx, ctx->ac.i32, i, lds_inner);
      }
   }

   if (shader->key.ge.part.tcs.epilog.prim_mode == TESS_PRIMITIVE_ISOLINES) {
      /* For isolines, the hardware expects tess factors in the
       * reverse order from what NIR specifies.
       */
      LLVMValueRef tmp = out[0];
      out[0] = out[1];
      out[1] = tmp;
   }

   /* Convert the outputs to vectors for stores. */
   vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
   vec1 = NULL;

   if (stride > 4)
      vec1 = ac_build_gather_values(&ctx->ac, out + 4, stride - 4);

   /* Get the buffer. */
   buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING);

   /* Get the offset. */
   tf_base = ac_get_arg(&ctx->ac, ctx->args.tcs_factor_offset);
   byteoffset =
      LLVMBuildMul(ctx->ac.builder, rel_patch_id, LLVMConstInt(ctx->ac.i32, 4 * stride, 0), "");
   offset = 0;

   /* Store the dynamic HS control word (only the first patch writes it). */
   if (ctx->screen->info.gfx_level <= GFX8) {
      ac_build_ifcc(&ctx->ac,
                    LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, rel_patch_id, ctx->ac.i32_0, ""), 6504);
      ac_build_buffer_store_dword(&ctx->ac, buffer, LLVMConstInt(ctx->ac.i32, 0x80000000, 0),
                                  NULL, LLVMConstInt(ctx->ac.i32, offset, 0), tf_base, ac_glc);
      ac_build_endif(&ctx->ac, 6504);
      offset += 4;
   }

   /* Store the tessellation factors. */
   ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, NULL,
                               LLVMBuildAdd(ctx->ac.builder, byteoffset,
                                            LLVMConstInt(ctx->ac.i32, offset, 0), ""),
                               tf_base, ac_glc);
   offset += 16;
   if (vec1)
      ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, NULL,
                                  LLVMBuildAdd(ctx->ac.builder, byteoffset,
                                               LLVMConstInt(ctx->ac.i32, offset, 0), ""),
                                  tf_base, ac_glc);

   /* Store the tess factors into the offchip buffer if TES reads them. */
   if (shader->key.ge.part.tcs.epilog.tes_reads_tess_factors) {
      LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
      LLVMValueRef tf_inner_offset;
      unsigned param_outer, param_inner;

      buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
      base = ac_get_arg(&ctx->ac, ctx->args.tess_offchip_offset);

      param_outer = si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_OUTER);
      tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
                                                   LLVMConstInt(ctx->ac.i32, param_outer, 0));

      outer_vec = ac_build_gather_values(&ctx->ac, outer, outer_comps);

      ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec, NULL, tf_outer_offset,
                                  base, ac_glc);
      if (inner_comps) {
         param_inner = si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_INNER);
         tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
                                                      LLVMConstInt(ctx->ac.i32, param_inner, 0));

         inner_vec = ac_build_gather_values(&ctx->ac, inner, inner_comps);
         ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec, NULL,
                                     tf_inner_offset, base, ac_glc);
      }
   }

   ac_build_endif(&ctx->ac, 6503);
}
442 
443 /* This only writes the tessellation factor levels. */
si_llvm_tcs_build_end(struct si_shader_context * ctx)444 void si_llvm_tcs_build_end(struct si_shader_context *ctx)
445 {
446    LLVMBuilderRef builder = ctx->ac.builder;
447    LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
448 
449    rel_patch_id = si_get_rel_patch_id(ctx);
450    invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
451    tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
452 
453    if (ctx->screen->info.gfx_level >= GFX9 && !ctx->shader->is_monolithic) {
454       LLVMBasicBlockRef blocks[2] = {LLVMGetInsertBlock(builder), ctx->merged_wrap_if_entry_block};
455       LLVMValueRef values[2];
456 
457       ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
458 
459       values[0] = rel_patch_id;
460       values[1] = LLVMGetUndef(ctx->ac.i32);
461       rel_patch_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
462 
463       values[0] = tf_lds_offset;
464       values[1] = LLVMGetUndef(ctx->ac.i32);
465       tf_lds_offset = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
466 
467       values[0] = invocation_id;
468       values[1] = ctx->ac.i32_1; /* cause the epilog to skip threads */
469       invocation_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
470    }
471 
472    /* Return epilog parameters from this function. */
473    LLVMValueRef ret = ctx->return_value;
474    unsigned vgpr;
475 
476    if (ctx->screen->info.gfx_level >= GFX9) {
477       ret =
478          si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
479       ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
480       /* Tess offchip and tess factor offsets are at the beginning. */
481       ret = si_insert_input_ret(ctx, ret, ctx->args.tess_offchip_offset, 2);
482       ret = si_insert_input_ret(ctx, ret, ctx->args.tcs_factor_offset, 4);
483       vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1;
484    } else {
485       ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
486       ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, GFX6_SGPR_TCS_OUT_LAYOUT);
487       /* Tess offchip and tess factor offsets are after user SGPRs. */
488       ret = si_insert_input_ret(ctx, ret, ctx->args.tess_offchip_offset, GFX6_TCS_NUM_USER_SGPR);
489       ret = si_insert_input_ret(ctx, ret, ctx->args.tcs_factor_offset, GFX6_TCS_NUM_USER_SGPR + 1);
490       vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
491    }
492 
493    /* VGPRs */
494    rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id);
495    invocation_id = ac_to_float(&ctx->ac, invocation_id);
496    tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset);
497 
498    /* Leave a hole corresponding to the two input VGPRs. This ensures that
499     * the invocation_id output does not alias the tcs_rel_ids input,
500     * which saves a V_MOV on gfx9.
501     */
502    vgpr += 2;
503 
504    ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
505    ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
506 
507    struct si_shader_info *info = &ctx->shader->selector->info;
508    if (info->tessfactors_are_def_in_all_invocs) {
509       vgpr++; /* skip the tess factor LDS offset */
510 
511       /* get tess factor driver location */
512       int outer_loc = -1;
513       int inner_loc = -1;
514       for (int i = 0; i < info->num_outputs; i++) {
515          unsigned semantic = info->output_semantic[i];
516          if (semantic == VARYING_SLOT_TESS_LEVEL_OUTER)
517             outer_loc = i;
518          else if (semantic == VARYING_SLOT_TESS_LEVEL_INNER)
519             inner_loc = i;
520       }
521 
522       for (unsigned i = 0; i < 6; i++) {
523          int loc = i < 4 ? outer_loc : inner_loc;
524          LLVMValueRef value = loc < 0 ? LLVMGetUndef(ctx->ac.f32) :
525             LLVMBuildLoad2(builder, ctx->ac.f32, ctx->abi.outputs[loc * 4 + i % 4], "");
526          value = ac_to_float(&ctx->ac, value);
527          ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, "");
528       }
529    } else {
530       ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
531    }
532    ctx->return_value = ret;
533 }
534 
535 /* Pass TCS inputs from LS to TCS on GFX9. */
si_set_ls_return_value_for_tcs(struct si_shader_context * ctx)536 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
537 {
538    if (!ctx->shader->is_monolithic)
539       ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
540 
541    LLVMValueRef ret = ctx->return_value;
542 
543    ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
544    ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
545    ret = si_insert_input_ret(ctx, ret, ctx->args.tess_offchip_offset, 2);
546    ret = si_insert_input_ret(ctx, ret, ctx->args.merged_wave_info, 3);
547    ret = si_insert_input_ret(ctx, ret, ctx->args.tcs_factor_offset, 4);
548    if (ctx->screen->info.gfx_level <= GFX10_3)
549       ret = si_insert_input_ret(ctx, ret, ctx->args.scratch_offset, 5);
550 
551    ret = si_insert_input_ptr(ctx, ret, ctx->internal_bindings, 8 + SI_SGPR_INTERNAL_BINDINGS);
552    ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
553                              8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
554 
555    ret = si_insert_input_ret(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
556 
557    ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
558    ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_offsets, 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
559    ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
560 
561    unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
562    ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
563                               ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id)),
564                               vgpr++, "");
565    ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
566                               ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tcs_rel_ids)),
567                               vgpr++, "");
568    ctx->return_value = ret;
569 }
570 
si_llvm_ls_build_end(struct si_shader_context * ctx)571 void si_llvm_ls_build_end(struct si_shader_context *ctx)
572 {
573    struct si_shader *shader = ctx->shader;
574    struct si_shader_info *info = &shader->selector->info;
575    LLVMValueRef *addrs = ctx->abi.outputs;
576    unsigned ret_offset = 8 + GFX9_TCS_NUM_USER_SGPR + 2;
577 
578    if (shader->key.ge.opt.same_patch_vertices) {
579       for (unsigned i = 0; i < info->num_outputs; i++) {
580          unsigned semantic = info->output_semantic[i];
581          int param = si_shader_io_get_unique_index(semantic, false);
582 
583          for (unsigned chan = 0; chan < 4; chan++) {
584             if (!(info->output_usagemask[i] & (1 << chan)))
585                continue;
586 
587             LLVMValueRef value = LLVMBuildLoad2(ctx->ac.builder, ctx->ac.f32, addrs[4 * i + chan], "");
588 
589             ctx->return_value = LLVMBuildInsertValue(ctx->ac.builder, ctx->return_value,
590                                                      value, ret_offset + param * 4 + chan, "");
591          }
592       }
593    }
594 
595    if (ctx->screen->info.gfx_level >= GFX9)
596       si_set_ls_return_value_for_tcs(ctx);
597 }
598 
599 /**
600  * Compile the TCS epilog function. This writes tesselation factors to memory
601  * based on the output primitive type of the tesselator (determined by TES).
602  */
si_llvm_build_tcs_epilog(struct si_shader_context * ctx,union si_shader_part_key * key)603 void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_part_key *key)
604 {
605    memset(&ctx->args, 0, sizeof(ctx->args));
606 
607    if (ctx->screen->info.gfx_level >= GFX9) {
608       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
609       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
610       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
611       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* wave info */
612       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_factor_offset);
613       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
614       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
615       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
616       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
617       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
618       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
619       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
620       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
621       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
622       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
623       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
624       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
625       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
626       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
627    } else {
628       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
629       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
630       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
631       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
632       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
633       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
634       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
635       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
636       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
637       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_factor_offset);
638    }
639 
640    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
641    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
642    struct ac_arg rel_patch_id; /* patch index within the wave (REL_PATCH_ID) */
643    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &rel_patch_id);
644    struct ac_arg invocation_id; /* invocation ID within the patch */
645    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &invocation_id);
646    struct ac_arg
647       tcs_out_current_patch_data_offset; /* LDS offset where tess factors should be loaded from */
648    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tcs_out_current_patch_data_offset);
649 
650    struct ac_arg tess_factors[6];
651    for (unsigned i = 0; i < 6; i++)
652       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tess_factors[i]);
653 
654    /* Create the function. */
655    si_llvm_create_func(ctx, "tcs_epilog", NULL, 0, ctx->screen->info.gfx_level >= GFX7 ? 128 : 0);
656    ac_declare_lds_as_pointer(&ctx->ac);
657 
658    LLVMValueRef invoc0_tess_factors[6];
659    for (unsigned i = 0; i < 6; i++)
660       invoc0_tess_factors[i] = ac_get_arg(&ctx->ac, tess_factors[i]);
661 
662    si_write_tess_factors(ctx, key, ac_get_arg(&ctx->ac, rel_patch_id),
663                          ac_get_arg(&ctx->ac, invocation_id),
664                          ac_get_arg(&ctx->ac, tcs_out_current_patch_data_offset),
665                          invoc0_tess_factors, invoc0_tess_factors + 4);
666 
667    LLVMBuildRetVoid(ctx->ac.builder);
668 }
669 
si_llvm_init_tcs_callbacks(struct si_shader_context * ctx)670 void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx)
671 {
672    ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings;
673 }
674