• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * based in part on anv driver which is:
6  * Copyright © 2015 Intel Corporation
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a
9  * copy of this software and associated documentation files (the "Software"),
10  * to deal in the Software without restriction, including without limitation
11  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12  * and/or sell copies of the Software, and to permit persons to whom the
13  * Software is furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice (including the next
16  * paragraph) shall be included in all copies or substantial portions of the
17  * Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25  * IN THE SOFTWARE.
26  */
27 
28 #include "nir/nir.h"
29 #include "radv_debug.h"
30 #include "radv_llvm_helper.h"
31 #include "radv_private.h"
32 #include "radv_shader.h"
33 #include "radv_shader_args.h"
34 
35 #include "ac_binary.h"
36 #include "ac_nir.h"
37 #include "ac_llvm_build.h"
38 #include "ac_nir_to_llvm.h"
39 #include "ac_shader_abi.h"
40 #include "ac_shader_util.h"
41 #include "sid.h"
42 
/* Per-compilation state for translating one RADV NIR shader to LLVM IR.
 * Embeds the common ac_llvm_context / ac_shader_abi and caches LLVM values
 * (descriptor sets, ring descriptors, system values) used across callbacks. */
struct radv_shader_context {
   struct ac_llvm_context ac;
   const struct nir_shader *shader;
   struct ac_shader_abi abi; /* callback table shared with ac_nir_to_llvm */
   const struct radv_nir_compiler_options *options;
   const struct radv_shader_info *shader_info;
   const struct radv_shader_args *args;

   gl_shader_stage stage;

   unsigned max_workgroup_size;
   LLVMContextRef context;
   LLVMValueRef main_function; /* the LLVM entry point ("main") */

   /* Loaded pointer for each used descriptor set, indexed by set number. */
   LLVMValueRef descriptor_sets[MAX_SETS];

   /* Driver-internal ring descriptor table (indexed by RING_* constants,
    * e.g. RING_PS_SAMPLE_POSITIONS), viewed as an array of v4i32. */
   LLVMValueRef ring_offsets;

   LLVMValueRef vs_rel_patch_id;

   LLVMValueRef gs_wave_id; /* wave id used in GS EMIT/CUT sendmsg */

   /* Legacy (non-NGG) geometry/tessellation rings. */
   LLVMValueRef esgs_ring;
   LLVMValueRef gsvs_ring[4]; /* one ring per GS vertex stream */
   LLVMValueRef hs_ring_tess_offchip;
   LLVMValueRef hs_ring_tess_factor;

   /* Bitmask of output slots declared by the shader (set in
    * scan_shader_output_decl, tested when emitting vertices). */
   uint64_t output_mask;
};
72 
/* One gathered output slot, ready for export or streamout. */
struct radv_shader_output_values {
   LLVMValueRef values[4]; /* one value per component (x..w) */
   unsigned slot_name;
   unsigned slot_index;
   unsigned usage_mask; /* bitmask of components in use */
};
79 
80 static inline struct radv_shader_context *
radv_shader_context_from_abi(struct ac_shader_abi * abi)81 radv_shader_context_from_abi(struct ac_shader_abi *abi)
82 {
83    return container_of(abi, struct radv_shader_context, abi);
84 }
85 
86 static LLVMValueRef
create_llvm_function(struct ac_llvm_context * ctx,LLVMModuleRef module,LLVMBuilderRef builder,const struct ac_shader_args * args,enum ac_llvm_calling_convention convention,unsigned max_workgroup_size,const struct radv_nir_compiler_options * options)87 create_llvm_function(struct ac_llvm_context *ctx, LLVMModuleRef module, LLVMBuilderRef builder,
88                      const struct ac_shader_args *args, enum ac_llvm_calling_convention convention,
89                      unsigned max_workgroup_size, const struct radv_nir_compiler_options *options)
90 {
91    LLVMValueRef main_function = ac_build_main(args, ctx, convention, "main", ctx->voidt, module);
92 
93    if (options->address32_hi) {
94       ac_llvm_add_target_dep_function_attr(main_function, "amdgpu-32bit-address-high-bits",
95                                            options->address32_hi);
96    }
97 
98    ac_llvm_set_workgroup_size(main_function, max_workgroup_size);
99    ac_llvm_set_target_features(main_function, ctx);
100 
101    return main_function;
102 }
103 
104 static void
load_descriptor_sets(struct radv_shader_context * ctx)105 load_descriptor_sets(struct radv_shader_context *ctx)
106 {
107    const struct radv_userdata_locations *user_sgprs_locs = &ctx->shader_info->user_sgprs_locs;
108    uint32_t mask = ctx->shader_info->desc_set_used_mask;
109 
110    if (user_sgprs_locs->shader_data[AC_UD_INDIRECT_DESCRIPTOR_SETS].sgpr_idx != -1) {
111       LLVMValueRef desc_sets = ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[0]);
112       while (mask) {
113          int i = u_bit_scan(&mask);
114 
115          ctx->descriptor_sets[i] =
116             ac_build_load_to_sgpr(&ctx->ac, desc_sets, LLVMConstInt(ctx->ac.i32, i, false));
117          LLVMSetAlignment(ctx->descriptor_sets[i], 4);
118       }
119    } else {
120       while (mask) {
121          int i = u_bit_scan(&mask);
122 
123          ctx->descriptor_sets[i] = ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[i]);
124       }
125    }
126 }
127 
128 static enum ac_llvm_calling_convention
get_llvm_calling_convention(LLVMValueRef func,gl_shader_stage stage)129 get_llvm_calling_convention(LLVMValueRef func, gl_shader_stage stage)
130 {
131    switch (stage) {
132    case MESA_SHADER_VERTEX:
133    case MESA_SHADER_TESS_EVAL:
134       return AC_LLVM_AMDGPU_VS;
135       break;
136    case MESA_SHADER_GEOMETRY:
137       return AC_LLVM_AMDGPU_GS;
138       break;
139    case MESA_SHADER_TESS_CTRL:
140       return AC_LLVM_AMDGPU_HS;
141       break;
142    case MESA_SHADER_FRAGMENT:
143       return AC_LLVM_AMDGPU_PS;
144       break;
145    case MESA_SHADER_COMPUTE:
146       return AC_LLVM_AMDGPU_CS;
147       break;
148    default:
149       unreachable("Unhandle shader type");
150    }
151 }
152 
153 /* Returns whether the stage is a stage that can be directly before the GS */
154 static bool
is_pre_gs_stage(gl_shader_stage stage)155 is_pre_gs_stage(gl_shader_stage stage)
156 {
157    return stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL;
158 }
159 
/* Create the LLVM entry point for the shader and set up implicit state:
 * the ring descriptor pointer, the descriptor sets, and (when the stage
 * uses LDS for inter-stage data) the LDS base pointer. */
static void
create_function(struct radv_shader_context *ctx, gl_shader_stage stage, bool has_previous_stage)
{
   if (ctx->ac.gfx_level >= GFX10) {
      if (is_pre_gs_stage(stage) && ctx->shader_info->is_ngg) {
         /* On GFX10+, VS and TES are merged into GS for NGG. */
         stage = MESA_SHADER_GEOMETRY;
         has_previous_stage = true;
      }
   }

   /* NOTE(review): ctx->main_function has not been assigned yet when it is
    * passed to get_llvm_calling_convention() below; that function ignores
    * its first argument, so this is harmless, but worth cleaning up. */
   ctx->main_function =
      create_llvm_function(&ctx->ac, ctx->ac.module, ctx->ac.builder, &ctx->args->ac,
                           get_llvm_calling_convention(ctx->main_function, stage),
                           ctx->max_workgroup_size, ctx->options);

   /* The driver hands us the ring descriptor table through the implicit
    * buffer pointer; reinterpret it as an array of v4i32 descriptors. */
   ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr",
                                          LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST), NULL, 0,
                                          AC_FUNC_ATTR_READNONE);
   ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets,
                                        ac_array_in_const_addr_space(ctx->ac.v4i32), "");

   load_descriptor_sets(ctx);

   if (stage == MESA_SHADER_TESS_CTRL ||
       (stage == MESA_SHADER_VERTEX && ctx->shader_info->vs.as_ls) ||
       ctx->shader_info->is_ngg ||
       /* GFX9 has the ESGS ring buffer in LDS. */
       (stage == MESA_SHADER_GEOMETRY && has_previous_stage)) {
      ac_declare_lds_as_pointer(&ctx->ac);
   }
}
192 
/* Return the starting index of the sample-position table for the given
 * sample count within the RING_PS_SAMPLE_POSITIONS buffer. Sample counts
 * other than 2/4/8 (including 1) map to offset 0. */
static uint32_t
radv_get_sample_pos_offset(uint32_t num_samples)
{
   if (num_samples == 2)
      return 1;
   if (num_samples == 4)
      return 3;
   if (num_samples == 8)
      return 7;
   return 0;
}
213 
/* abi callback: return the sample position for "sample_id" as v2f32.
 * Positions live in the RING_PS_SAMPLE_POSITIONS driver ring; tables for
 * the different sample counts are stored back-to-back, so the id is
 * biased by radv_get_sample_pos_offset() for the bound sample count. */
static LLVMValueRef
load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

   LLVMValueRef result;
   LLVMValueRef index = LLVMConstInt(ctx->ac.i32, RING_PS_SAMPLE_POSITIONS, false);
   LLVMValueRef ptr = LLVMBuildGEP(ctx->ac.builder, ctx->ring_offsets, &index, 1, "");

   /* View the ring as an array of v2f32 (one x/y pair per sample). */
   ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ac_array_in_const_addr_space(ctx->ac.v2f32), "");

   uint32_t sample_pos_offset = radv_get_sample_pos_offset(ctx->options->key.ps.num_samples);

   /* Skip the tables for the smaller sample counts. */
   sample_id = LLVMBuildAdd(ctx->ac.builder, sample_id,
                            LLVMConstInt(ctx->ac.i32, sample_pos_offset, false), "");
   result = ac_build_load_invariant(&ctx->ac, ptr, sample_id);

   return result;
}
233 
/* abi callback for legacy (non-NGG) GS: store all outputs of the vertex
 * being emitted on "stream" into the GSVS ring, then send the EMIT message.
 * "vertexidx" is the index of this vertex within the primitive stream and
 * "addrs" holds the output allocas (4 components per slot). */
static void
visit_emit_vertex_with_counter(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef vertexidx,
                               LLVMValueRef *addrs)
{
   unsigned offset = 0; /* running count of components stored so far */
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

   for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
      unsigned output_usage_mask = ctx->shader_info->gs.output_usage_mask[i];
      uint8_t output_stream = ctx->shader_info->gs.output_streams[i];
      LLVMValueRef *out_ptr = &addrs[i * 4];
      int length = util_last_bit(output_usage_mask);

      /* Skip slots the shader doesn't write or that belong to other streams. */
      if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)
         continue;

      for (unsigned j = 0; j < length; j++) {
         if (!(output_usage_mask & (1 << j)))
            continue;

         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");
         /* Ring layout: component-major — each component occupies a run of
          * gs.vertices_out dwords, indexed by the vertex index within it. */
         LLVMValueRef voffset =
            LLVMConstInt(ctx->ac.i32, offset * ctx->shader->info.gs.vertices_out, false);

         offset++;

         voffset = LLVMBuildAdd(ctx->ac.builder, voffset, vertexidx, "");
         /* Convert the dword index to a byte offset. */
         voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), "");

         /* The ring stores raw 32-bit data; widen/bitcast accordingly. */
         out_val = ac_to_integer(&ctx->ac, out_val);
         out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");

         ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, NULL, voffset,
                                     ac_get_arg(&ctx->ac, ctx->args->ac.gs2vs_offset),
                                     ac_glc | ac_slc | ac_swizzled);
      }
   }

   /* Notify the hardware that a vertex was emitted on this stream. */
   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
                    ctx->gs_wave_id);
}
275 
276 static void
visit_end_primitive(struct ac_shader_abi * abi,unsigned stream)277 visit_end_primitive(struct ac_shader_abi *abi, unsigned stream)
278 {
279    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
280    ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
281                     ctx->gs_wave_id);
282 }
283 
/* abi callback: load the base vertex from its SGPR argument.
 * NOTE(review): "non_indexed_is_zero" is ignored — the SGPR is returned
 * unconditionally; presumably the driver zeroes base_vertex for
 * non-indexed draws — confirm against the draw-state setup. */
static LLVMValueRef
radv_load_base_vertex(struct ac_shader_abi *abi, bool non_indexed_is_zero)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
   return ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex);
}
290 
291 static LLVMValueRef
radv_load_rsrc(struct radv_shader_context * ctx,LLVMValueRef ptr,LLVMTypeRef type)292 radv_load_rsrc(struct radv_shader_context *ctx, LLVMValueRef ptr, LLVMTypeRef type)
293 {
294    if (ptr && LLVMTypeOf(ptr) == ctx->ac.i32) {
295       LLVMValueRef result;
296 
297       LLVMTypeRef ptr_type = LLVMPointerType(type, AC_ADDR_SPACE_CONST_32BIT);
298       ptr = LLVMBuildIntToPtr(ctx->ac.builder, ptr, ptr_type, "");
299       LLVMSetMetadata(ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
300 
301       result = LLVMBuildLoad(ctx->ac.builder, ptr, "");
302       LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);
303 
304       return result;
305    }
306 
307    return ptr;
308 }
309 
310 static LLVMValueRef
radv_load_ubo(struct ac_shader_abi * abi,LLVMValueRef buffer_ptr)311 radv_load_ubo(struct ac_shader_abi *abi, LLVMValueRef buffer_ptr)
312 {
313    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
314    return radv_load_rsrc(ctx, buffer_ptr, ctx->ac.v4i32);
315 }
316 
317 static LLVMValueRef
radv_load_ssbo(struct ac_shader_abi * abi,LLVMValueRef buffer_ptr,bool write,bool non_uniform)318 radv_load_ssbo(struct ac_shader_abi *abi, LLVMValueRef buffer_ptr, bool write, bool non_uniform)
319 {
320    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
321    return radv_load_rsrc(ctx, buffer_ptr, ctx->ac.v4i32);
322 }
323 
/* abi callback: load an image/sampler/buffer descriptor. "index" is the
 * 32-bit descriptor address produced by the descriptor-lowering pass
 * (or an already-loaded descriptor, which radv_load_rsrc passes through). */
static LLVMValueRef
radv_get_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set, unsigned base_index,
                      unsigned constant_index, LLVMValueRef index,
                      enum ac_descriptor_type desc_type, bool image, bool write, bool bindless)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

   /* FMASK descriptors are not used; callers handle the NULL result. */
   if (image && desc_type == AC_DESC_FMASK)
      return NULL;

   /* 3 plane formats always have same size and format for plane 1 & 2, so
    * use the tail from plane 1 so that we can store only the first 16 bytes
    * of the last plane. */
   if (desc_type == AC_DESC_PLANE_2 && index && LLVMTypeOf(index) == ctx->ac.i32) {
      /* Plane 1's descriptor sits 32 bytes before plane 2's address. */
      LLVMValueRef plane1_addr =
         LLVMBuildSub(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, 32, false), "");
      LLVMValueRef descriptor1 = radv_load_rsrc(ctx, plane1_addr, ctx->ac.v8i32);
      LLVMValueRef descriptor2 = radv_load_rsrc(ctx, index, ctx->ac.v4i32);

      /* Splice: first 4 dwords from plane 2, last 4 dwords from plane 1. */
      LLVMValueRef components[8];
      for (unsigned i = 0; i < 4; ++i)
         components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor2, i);

      for (unsigned i = 4; i < 8; ++i)
         components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor1, i);
      return ac_build_gather_values(&ctx->ac, components, 8);
   }

   /* Buffer and sampler descriptors are 4 dwords; images are 8. */
   bool v4 = desc_type == AC_DESC_BUFFER || desc_type == AC_DESC_SAMPLER;
   return radv_load_rsrc(ctx, index, v4 ? ctx->ac.v4i32 : ctx->ac.v8i32);
}
355 
/* Expand a fetched vertex attribute to a full 4-component vector, filling
 * the missing components with the format defaults (0,0,0,1). "is_float"
 * selects float vs integer defaults; the fills are converted to integer
 * so they gather cleanly with the integer-typed loads. */
static LLVMValueRef
radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx, LLVMValueRef value,
                                unsigned num_channels, bool is_float)
{
   LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0;
   LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1;
   LLVMValueRef chan[4];

   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
      unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));

      /* Already a full vec4: nothing to do. */
      if (num_channels == 4 && num_channels == vec_size)
         return value;

      num_channels = MIN2(num_channels, vec_size);

      for (unsigned i = 0; i < num_channels; i++)
         chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i);
   } else {
      assert(num_channels == 1);
      chan[0] = value;
   }

   /* Pad missing components: w defaults to 1, x/y/z default to 0. */
   for (unsigned i = num_channels; i < 4; i++) {
      chan[i] = i == 3 ? one : zero;
      chan[i] = ac_to_integer(&ctx->ac, chan[i]);
   }

   return ac_build_gather_values(&ctx->ac, chan, 4);
}
386 
/* Fetch one vertex attribute ("driver_location") from its vertex buffer and
 * return its 4 components in "out", converted toward "dest_type" (f16/i16
 * narrowing when requested). Handles instance-rate inputs with divisors,
 * per-attribute vs per-binding descriptors, unaligned offsets relative to
 * the stride, and the per-channel split path required on GFX6/GFX10+. */
static void
load_vs_input(struct radv_shader_context *ctx, unsigned driver_location, LLVMTypeRef dest_type,
              LLVMValueRef out[4])
{
   LLVMValueRef t_list_ptr = ac_get_arg(&ctx->ac, ctx->args->ac.vertex_buffers);
   LLVMValueRef t_offset;
   LLVMValueRef t_list;
   LLVMValueRef input;
   LLVMValueRef buffer_index;
   unsigned attrib_index = driver_location - VERT_ATTRIB_GENERIC0;
   /* The pipeline key packs data format (low nibble) and numeric format. */
   unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[attrib_index];
   unsigned data_format = attrib_format & 0x0f;
   unsigned num_format = (attrib_format >> 4) & 0x07;
   bool is_float =
      num_format != V_008F0C_BUF_NUM_FORMAT_UINT && num_format != V_008F0C_BUF_NUM_FORMAT_SINT;
   uint8_t input_usage_mask =
      ctx->shader_info->vs.input_usage_mask[driver_location];
   unsigned num_input_channels = util_last_bit(input_usage_mask);

   if (ctx->options->key.vs.instance_rate_inputs & (1u << attrib_index)) {
      /* Instance-rate attribute: index by instance id / divisor. */
      uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[attrib_index];

      if (divisor) {
         buffer_index = ctx->abi.instance_id;

         if (divisor != 1) {
            buffer_index = LLVMBuildUDiv(ctx->ac.builder, buffer_index,
                                         LLVMConstInt(ctx->ac.i32, divisor, 0), "");
         }
      } else {
         /* Divisor 0: every instance reads element 0 (plus start_instance). */
         buffer_index = ctx->ac.i32_0;
      }

      buffer_index = LLVMBuildAdd(
         ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.start_instance), buffer_index, "");
   } else {
      /* Vertex-rate attribute: index by vertex id + base vertex. */
      buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
                                  ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex), "");
   }

   const struct ac_data_format_info *vtx_info = ac_get_data_format_info(data_format);

   /* Adjust the number of channels to load based on the vertex attribute format. */
   unsigned num_channels = MIN2(num_input_channels, vtx_info->num_channels);
   unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[attrib_index];
   unsigned attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[attrib_index];
   unsigned attrib_stride = ctx->options->key.vs.vertex_attribute_strides[attrib_index];

   /* Compute the compacted index of this attribute's VB descriptor: count
    * how many used descriptors precede it in vb_desc_usage_mask. */
   unsigned desc_index =
      ctx->shader_info->vs.use_per_attribute_vb_descs ? attrib_index : attrib_binding;
   desc_index = util_bitcount(ctx->shader_info->vs.vb_desc_usage_mask &
                              u_bit_consecutive(0, desc_index));
   t_offset = LLVMConstInt(ctx->ac.i32, desc_index, false);
   t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);

   /* Always split typed vertex buffer loads on GFX6 and GFX10+ to avoid any alignment issues that
    * triggers memory violations and eventually a GPU hang. This can happen if the stride (static or
    * dynamic) is unaligned and also if the VBO offset is aligned to a scalar (eg. stride is 8 and
    * VBO offset is 2 for R16G16B16A16_SNORM).
    */
   if (ctx->ac.gfx_level == GFX6 || ctx->ac.gfx_level >= GFX10) {
      unsigned chan_format = vtx_info->chan_format;
      LLVMValueRef values[4];

      assert(ctx->ac.gfx_level == GFX6 || ctx->ac.gfx_level >= GFX10);

      /* Load each channel separately with the per-channel format. */
      for (unsigned chan = 0; chan < num_channels; chan++) {
         unsigned chan_offset = attrib_offset + chan * vtx_info->chan_byte_size;
         LLVMValueRef chan_index = buffer_index;

         /* When the channel's offset exceeds the stride, fold whole strides
          * into the element index to keep the constant offset in range. */
         if (attrib_stride != 0 && chan_offset > attrib_stride) {
            LLVMValueRef buffer_offset =
               LLVMConstInt(ctx->ac.i32, chan_offset / attrib_stride, false);

            chan_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, buffer_offset, "");

            chan_offset = chan_offset % attrib_stride;
         }

         values[chan] = ac_build_struct_tbuffer_load(
            &ctx->ac, t_list, chan_index, LLVMConstInt(ctx->ac.i32, chan_offset, false),
            ctx->ac.i32_0, 1, chan_format, num_format, 0, true);
      }

      input = ac_build_gather_values(&ctx->ac, values, num_channels);
   } else {
      /* Same stride-folding fixup for the single vectorized load. */
      if (attrib_stride != 0 && attrib_offset > attrib_stride) {
         LLVMValueRef buffer_offset =
            LLVMConstInt(ctx->ac.i32, attrib_offset / attrib_stride, false);

         buffer_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, buffer_offset, "");

         attrib_offset = attrib_offset % attrib_stride;
      }

      input = ac_build_struct_tbuffer_load(
         &ctx->ac, t_list, buffer_index, LLVMConstInt(ctx->ac.i32, attrib_offset, false),
         ctx->ac.i32_0, num_channels, data_format, num_format, 0, true);
   }

   /* Pad to 4 components with format defaults (0,0,0,1). */
   input = radv_fixup_vertex_input_fetches(ctx, input, num_channels, is_float);

   for (unsigned chan = 0; chan < 4; chan++) {
      LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
      out[chan] = LLVMBuildExtractElement(ctx->ac.builder, input, llvm_chan, "");
      /* Narrow float results when the destination is 16-bit. */
      if (dest_type == ctx->ac.i16 && is_float) {
         out[chan] = LLVMBuildBitCast(ctx->ac.builder, out[chan], ctx->ac.f32, "");
         out[chan] = LLVMBuildFPTrunc(ctx->ac.builder, out[chan], ctx->ac.f16, "");
      }
   }

   for (unsigned chan = 0; chan < 4; chan++) {
      out[chan] = ac_to_integer(&ctx->ac, out[chan]);
      /* Narrow integer results when the destination is 16-bit. */
      if (dest_type == ctx->ac.i16 && !is_float)
         out[chan] = LLVMBuildTrunc(ctx->ac.builder, out[chan], ctx->ac.i16, "");
   }
}
504 
505 static LLVMValueRef
radv_load_vs_inputs(struct ac_shader_abi * abi,unsigned driver_location,unsigned component,unsigned num_components,unsigned vertex_index,LLVMTypeRef type)506 radv_load_vs_inputs(struct ac_shader_abi *abi, unsigned driver_location, unsigned component,
507                     unsigned num_components, unsigned vertex_index, LLVMTypeRef type)
508 {
509    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
510    LLVMValueRef values[4];
511 
512    load_vs_input(ctx, driver_location, type, values);
513 
514    for (unsigned i = 0; i < 4; i++)
515       values[i] = LLVMBuildBitCast(ctx->ac.builder, values[i], type, "");
516 
517    return ac_build_varying_gather_values(&ctx->ac, values, num_components, component);
518 }
519 
/* Set up the persp/linear centroid interpolants for the fragment shader.
 * When the shader mixes center and centroid inputs, replace the centroid
 * value with the center one for fully-covered pixels (signaled by a
 * negative prim_mask), which lets the hardware skip the centroid
 * computation in the common case. */
static void
prepare_interp_optimize(struct radv_shader_context *ctx, struct nir_shader *nir)
{
   bool uses_center = false;
   bool uses_centroid = false;
   nir_foreach_shader_in_variable (variable, nir) {
      /* Only float inputs participate; sample-qualified inputs never use
       * center/centroid interpolation. */
      if (glsl_get_base_type(glsl_without_array(variable->type)) != GLSL_TYPE_FLOAT ||
          variable->data.sample)
         continue;

      if (variable->data.centroid)
         uses_centroid = true;
      else
         uses_center = true;
   }

   ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.persp_centroid);
   ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.linear_centroid);

   if (uses_center && uses_centroid) {
      /* prim_mask < 0 selects the center barycentrics instead. */
      LLVMValueRef sel =
         LLVMBuildICmp(ctx->ac.builder, LLVMIntSLT, ac_get_arg(&ctx->ac, ctx->args->ac.prim_mask),
                       ctx->ac.i32_0, "");
      ctx->abi.persp_centroid =
         LLVMBuildSelect(ctx->ac.builder, sel, ac_get_arg(&ctx->ac, ctx->args->ac.persp_center),
                         ctx->abi.persp_centroid, "");
      ctx->abi.linear_centroid =
         LLVMBuildSelect(ctx->ac.builder, sel, ac_get_arg(&ctx->ac, ctx->args->ac.linear_center),
                         ctx->abi.linear_centroid, "");
   }
}
551 
552 static void
scan_shader_output_decl(struct radv_shader_context * ctx,struct nir_variable * variable,struct nir_shader * shader,gl_shader_stage stage)553 scan_shader_output_decl(struct radv_shader_context *ctx, struct nir_variable *variable,
554                         struct nir_shader *shader, gl_shader_stage stage)
555 {
556    int idx = variable->data.driver_location;
557    unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
558    uint64_t mask_attribs;
559 
560    if (variable->data.compact) {
561       unsigned component_count = variable->data.location_frac + glsl_get_length(variable->type);
562       attrib_count = (component_count + 3) / 4;
563    }
564 
565    mask_attribs = ((1ull << attrib_count) - 1) << idx;
566 
567    ctx->output_mask |= mask_attribs;
568 }
569 
/* Initialize arguments for the shader export intrinsic */
/* Fills "args" for an export of "values" to "target". For fragment shader
 * MRT exports the color is packed according to the bound color buffer's
 * SPI_SHADER_*_FORMAT; for all other exports the values are passed through
 * as 32-bit floats (16-bit inputs are zero-extended into the low halves). */
static void
si_llvm_init_export_args(struct radv_shader_context *ctx, LLVMValueRef *values,
                         unsigned enabled_channels, unsigned target, struct ac_export_args *args)
{
   /* Specify the channels that are enabled. */
   args->enabled_channels = enabled_channels;

   /* Specify whether the EXEC mask represents the valid mask */
   args->valid_mask = 0;

   /* Specify whether this is the last export */
   args->done = 0;

   /* Specify the target we are exporting */
   args->target = target;

   args->compr = false;
   args->out[0] = LLVMGetUndef(ctx->ac.f32);
   args->out[1] = LLVMGetUndef(ctx->ac.f32);
   args->out[2] = LLVMGetUndef(ctx->ac.f32);
   args->out[3] = LLVMGetUndef(ctx->ac.f32);

   /* NULL values: leave the export disabled with undef outputs. */
   if (!values)
      return;

   bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2;
   if (ctx->stage == MESA_SHADER_FRAGMENT) {
      /* Per-MRT export format and int flags from the pipeline key. */
      unsigned index = target - V_008DFC_SQ_EXP_MRT;
      unsigned col_format = (ctx->options->key.ps.col_format >> (4 * index)) & 0xf;
      bool is_int8 = (ctx->options->key.ps.is_int8 >> index) & 1;
      bool is_int10 = (ctx->options->key.ps.is_int10 >> index) & 1;
      bool enable_mrt_output_nan_fixup = (ctx->options->key.ps.enable_mrt_output_nan_fixup >> index) & 1;

      /* Packing helpers selected by the format: packf for float/norm
       * formats, packi for pure-integer formats. */
      LLVMValueRef (*packf)(struct ac_llvm_context * ctx, LLVMValueRef args[2]) = NULL;
      LLVMValueRef (*packi)(struct ac_llvm_context * ctx, LLVMValueRef args[2], unsigned bits,
                            bool hi) = NULL;

      switch (col_format) {
      case V_028714_SPI_SHADER_ZERO:
         args->enabled_channels = 0; /* writemask */
         args->target = V_008DFC_SQ_EXP_NULL;
         break;

      case V_028714_SPI_SHADER_32_R:
         args->enabled_channels = 1;
         args->out[0] = values[0];
         break;

      case V_028714_SPI_SHADER_32_GR:
         args->enabled_channels = 0x3;
         args->out[0] = values[0];
         args->out[1] = values[1];
         break;

      case V_028714_SPI_SHADER_32_AR:
         /* GFX10+ packs A into the second channel; older chips use lane 3. */
         if (ctx->ac.gfx_level >= GFX10) {
            args->enabled_channels = 0x3;
            args->out[0] = values[0];
            args->out[1] = values[3];
         } else {
            args->enabled_channels = 0x9;
            args->out[0] = values[0];
            args->out[3] = values[3];
         }
         break;

      case V_028714_SPI_SHADER_FP16_ABGR:
         args->enabled_channels = 0xf;
         packf = ac_build_cvt_pkrtz_f16;
         /* The pack intrinsic takes f32 inputs; widen 16-bit values first. */
         if (is_16bit) {
            for (unsigned chan = 0; chan < 4; chan++)
               values[chan] = LLVMBuildFPExt(ctx->ac.builder, values[chan], ctx->ac.f32, "");
         }
         break;

      case V_028714_SPI_SHADER_UNORM16_ABGR:
         args->enabled_channels = 0xf;
         packf = ac_build_cvt_pknorm_u16;
         break;

      case V_028714_SPI_SHADER_SNORM16_ABGR:
         args->enabled_channels = 0xf;
         packf = ac_build_cvt_pknorm_i16;
         break;

      case V_028714_SPI_SHADER_UINT16_ABGR:
         args->enabled_channels = 0xf;
         packi = ac_build_cvt_pk_u16;
         /* Widen 16-bit integers to i32 for the pack intrinsic. */
         if (is_16bit) {
            for (unsigned chan = 0; chan < 4; chan++)
               values[chan] = LLVMBuildZExt(ctx->ac.builder, ac_to_integer(&ctx->ac, values[chan]),
                                            ctx->ac.i32, "");
         }
         break;

      case V_028714_SPI_SHADER_SINT16_ABGR:
         args->enabled_channels = 0xf;
         packi = ac_build_cvt_pk_i16;
         if (is_16bit) {
            for (unsigned chan = 0; chan < 4; chan++)
               values[chan] = LLVMBuildSExt(ctx->ac.builder, ac_to_integer(&ctx->ac, values[chan]),
                                            ctx->ac.i32, "");
         }
         break;

      default:
      case V_028714_SPI_SHADER_32_ABGR:
         memcpy(&args->out[0], values, sizeof(values[0]) * 4);
         break;
      }

      /* Replace NaN by zero (for 32-bit float formats) to fix game bugs if requested. */
      if (enable_mrt_output_nan_fixup && !is_16bit) {
         for (unsigned i = 0; i < 4; i++) {
            /* amdgcn.class with S_NAN|Q_NAN tests for any NaN. */
            LLVMValueRef class_args[2] = {values[i],
                                          LLVMConstInt(ctx->ac.i32, S_NAN | Q_NAN, false)};
            LLVMValueRef isnan = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f32", ctx->ac.i1,
                                                    class_args, 2, AC_FUNC_ATTR_READNONE);
            values[i] = LLVMBuildSelect(ctx->ac.builder, isnan, ctx->ac.f32_0, values[i], "");
         }
      }

      /* Pack f16 or norm_i16/u16. */
      if (packf) {
         for (unsigned chan = 0; chan < 2; chan++) {
            LLVMValueRef pack_args[2] = {values[2 * chan], values[2 * chan + 1]};
            LLVMValueRef packed;

            packed = packf(&ctx->ac, pack_args);
            args->out[chan] = ac_to_float(&ctx->ac, packed);
         }
         args->compr = 1; /* COMPR flag */
      }

      /* Pack i16/u16. */
      if (packi) {
         for (unsigned chan = 0; chan < 2; chan++) {
            LLVMValueRef pack_args[2] = {ac_to_integer(&ctx->ac, values[2 * chan]),
                                         ac_to_integer(&ctx->ac, values[2 * chan + 1])};
            LLVMValueRef packed;

            packed = packi(&ctx->ac, pack_args, is_int8 ? 8 : is_int10 ? 10 : 16, chan == 1);
            args->out[chan] = ac_to_float(&ctx->ac, packed);
         }
         args->compr = 1; /* COMPR flag */
      }
      return;
   }

   /* Non-FS exports: 16-bit values are zero-extended into i32 lanes. */
   if (is_16bit) {
      for (unsigned chan = 0; chan < 4; chan++) {
         values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i16, "");
         args->out[chan] = LLVMBuildZExt(ctx->ac.builder, values[chan], ctx->ac.i32, "");
      }
   } else
      memcpy(&args->out[0], values, sizeof(values[0]) * 4);

   /* The export intrinsic takes float operands. */
   for (unsigned i = 0; i < 4; ++i)
      args->out[i] = ac_to_float(&ctx->ac, args->out[i]);
}
731 
/* Export one PARAM slot ("index") with the given values and channel mask;
 * PARAM exports feed the attribute interpolants of the next stage. */
static void
radv_export_param(struct radv_shader_context *ctx, unsigned index, LLVMValueRef *values,
                  unsigned enabled_channels)
{
   struct ac_export_args args;

   si_llvm_init_export_args(ctx, values, enabled_channels, V_008DFC_SQ_EXP_PARAM + index, &args);
   ac_build_export(&ctx->ac, &args);
}
741 
742 static LLVMValueRef
radv_load_output(struct radv_shader_context * ctx,unsigned index,unsigned chan)743 radv_load_output(struct radv_shader_context *ctx, unsigned index, unsigned chan)
744 {
745    LLVMValueRef output = ctx->abi.outputs[ac_llvm_reg_index_soa(index, chan)];
746    return LLVMBuildLoad(ctx->ac.builder, output, "");
747 }
748 
749 static void
radv_emit_stream_output(struct radv_shader_context * ctx,LLVMValueRef const * so_buffers,LLVMValueRef const * so_write_offsets,const struct radv_stream_output * output,struct radv_shader_output_values * shader_out)750 radv_emit_stream_output(struct radv_shader_context *ctx, LLVMValueRef const *so_buffers,
751                         LLVMValueRef const *so_write_offsets,
752                         const struct radv_stream_output *output,
753                         struct radv_shader_output_values *shader_out)
754 {
755    unsigned num_comps = util_bitcount(output->component_mask);
756    unsigned buf = output->buffer;
757    unsigned offset = output->offset;
758    unsigned start;
759    LLVMValueRef out[4];
760 
761    assert(num_comps && num_comps <= 4);
762    if (!num_comps || num_comps > 4)
763       return;
764 
765    /* Get the first component. */
766    start = ffs(output->component_mask) - 1;
767 
768    /* Load the output as int. */
769    for (int i = 0; i < num_comps; i++) {
770       out[i] = ac_to_integer(&ctx->ac, shader_out->values[start + i]);
771    }
772 
773    /* Pack the output. */
774    LLVMValueRef vdata = NULL;
775 
776    switch (num_comps) {
777    case 1: /* as i32 */
778       vdata = out[0];
779       break;
780    case 2: /* as v2i32 */
781    case 3: /* as v3i32 */
782    case 4: /* as v4i32 */
783       vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
784       break;
785    }
786 
787    LLVMValueRef voffset = LLVMBuildAdd(ctx->ac.builder, so_write_offsets[buf],
788                                        LLVMConstInt(ctx->ac.i32, offset, 0), "");
789    ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf], vdata, NULL, voffset, ctx->ac.i32_0,
790                                ac_glc | ac_slc);
791 }
792 
793 static void
radv_emit_streamout(struct radv_shader_context * ctx,unsigned stream)794 radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream)
795 {
796    int i;
797 
798    /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
799    assert(ctx->args->ac.streamout_config.used);
800    LLVMValueRef so_vtx_count = ac_build_bfe(
801       &ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config),
802       LLVMConstInt(ctx->ac.i32, 16, false), LLVMConstInt(ctx->ac.i32, 7, false), false);
803 
804    LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
805 
806    /* can_emit = tid < so_vtx_count; */
807    LLVMValueRef can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, so_vtx_count, "");
808 
809    /* Emit the streamout code conditionally. This actually avoids
810     * out-of-bounds buffer access. The hw tells us via the SGPR
811     * (so_vtx_count) which threads are allowed to emit streamout data.
812     */
813    ac_build_ifcc(&ctx->ac, can_emit, 6501);
814    {
815       /* The buffer offset is computed as follows:
816        *   ByteOffset = streamout_offset[buffer_id]*4 +
817        *                (streamout_write_index + thread_id)*stride[buffer_id] +
818        *                attrib_offset
819        */
820       LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_write_index);
821 
822       /* Compute (streamout_write_index + thread_id). */
823       so_write_index = LLVMBuildAdd(ctx->ac.builder, so_write_index, tid, "");
824 
825       /* Load the descriptor and compute the write offset for each
826        * enabled buffer.
827        */
828       LLVMValueRef so_write_offset[4] = {0};
829       LLVMValueRef so_buffers[4] = {0};
830       LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers);
831 
832       for (i = 0; i < 4; i++) {
833          uint16_t stride = ctx->shader_info->so.strides[i];
834 
835          if (!stride)
836             continue;
837 
838          LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i, false);
839 
840          so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
841 
842          LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_offset[i]);
843 
844          so_offset =
845             LLVMBuildMul(ctx->ac.builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, false), "");
846 
847          so_write_offset[i] = ac_build_imad(
848             &ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, stride * 4, false), so_offset);
849       }
850 
851       /* Write streamout data. */
852       for (i = 0; i < ctx->shader_info->so.num_outputs; i++) {
853          struct radv_shader_output_values shader_out = {0};
854          const struct radv_stream_output *output = &ctx->shader_info->so.outputs[i];
855 
856          if (stream != output->stream)
857             continue;
858 
859          for (int j = 0; j < 4; j++) {
860             shader_out.values[j] = radv_load_output(ctx, output->location, j);
861          }
862 
863          radv_emit_stream_output(ctx, so_buffers, so_write_offset, output, &shader_out);
864       }
865    }
866    ac_build_endif(&ctx->ac, 6501);
867 }
868 
869 static void
radv_build_param_exports(struct radv_shader_context * ctx,struct radv_shader_output_values * outputs,unsigned noutput,const struct radv_vs_output_info * outinfo,bool export_clip_dists)870 radv_build_param_exports(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs,
871                          unsigned noutput, const struct radv_vs_output_info *outinfo,
872                          bool export_clip_dists)
873 {
874    for (unsigned i = 0; i < noutput; i++) {
875       unsigned slot_name = outputs[i].slot_name;
876       unsigned usage_mask = outputs[i].usage_mask;
877 
878       if (slot_name != VARYING_SLOT_LAYER && slot_name != VARYING_SLOT_PRIMITIVE_ID &&
879           slot_name != VARYING_SLOT_VIEWPORT && slot_name != VARYING_SLOT_CLIP_DIST0 &&
880           slot_name != VARYING_SLOT_CLIP_DIST1 && slot_name < VARYING_SLOT_VAR0)
881          continue;
882 
883       if ((slot_name == VARYING_SLOT_CLIP_DIST0 || slot_name == VARYING_SLOT_CLIP_DIST1) &&
884           !export_clip_dists)
885          continue;
886 
887       radv_export_param(ctx, outinfo->vs_output_param_offset[slot_name], outputs[i].values,
888                         usage_mask);
889    }
890 }
891 
892 /* Generate export instructions for hardware VS shader stage or NGG GS stage
893  * (position and parameter data only).
894  */
895 static void
radv_llvm_export_vs(struct radv_shader_context * ctx,struct radv_shader_output_values * outputs,unsigned noutput,const struct radv_vs_output_info * outinfo,bool export_clip_dists)896 radv_llvm_export_vs(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs,
897                     unsigned noutput, const struct radv_vs_output_info *outinfo,
898                     bool export_clip_dists)
899 {
900    LLVMValueRef psize_value = NULL, layer_value = NULL, viewport_value = NULL;
901    LLVMValueRef primitive_shading_rate = NULL;
902    struct ac_export_args pos_args[4] = {0};
903    unsigned pos_idx, index;
904    int i;
905 
906    /* Build position exports */
907    for (i = 0; i < noutput; i++) {
908       switch (outputs[i].slot_name) {
909       case VARYING_SLOT_POS:
910          si_llvm_init_export_args(ctx, outputs[i].values, 0xf, V_008DFC_SQ_EXP_POS, &pos_args[0]);
911          break;
912       case VARYING_SLOT_PSIZ:
913          psize_value = outputs[i].values[0];
914          break;
915       case VARYING_SLOT_LAYER:
916          layer_value = outputs[i].values[0];
917          break;
918       case VARYING_SLOT_VIEWPORT:
919          viewport_value = outputs[i].values[0];
920          break;
921       case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
922          primitive_shading_rate = outputs[i].values[0];
923          break;
924       case VARYING_SLOT_CLIP_DIST0:
925       case VARYING_SLOT_CLIP_DIST1:
926          index = 2 + outputs[i].slot_index;
927          si_llvm_init_export_args(ctx, outputs[i].values, 0xf, V_008DFC_SQ_EXP_POS + index,
928                                   &pos_args[index]);
929          break;
930       default:
931          break;
932       }
933    }
934 
935    /* We need to add the position output manually if it's missing. */
936    if (!pos_args[0].out[0]) {
937       pos_args[0].enabled_channels = 0xf; /* writemask */
938       pos_args[0].valid_mask = 0;         /* EXEC mask */
939       pos_args[0].done = 0;               /* last export? */
940       pos_args[0].target = V_008DFC_SQ_EXP_POS;
941       pos_args[0].compr = 0;              /* COMPR flag */
942       pos_args[0].out[0] = ctx->ac.f32_0; /* X */
943       pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
944       pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
945       pos_args[0].out[3] = ctx->ac.f32_1; /* W */
946    }
947 
948    if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_layer ||
949        outinfo->writes_viewport_index || outinfo->writes_primitive_shading_rate) {
950       pos_args[1].enabled_channels = ((outinfo->writes_pointsize == true ? 1 : 0) |
951                                       (outinfo->writes_primitive_shading_rate == true ? 2 : 0) |
952                                       (outinfo->writes_layer == true ? 4 : 0));
953       pos_args[1].valid_mask = 0;
954       pos_args[1].done = 0;
955       pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
956       pos_args[1].compr = 0;
957       pos_args[1].out[0] = ctx->ac.f32_0; /* X */
958       pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
959       pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
960       pos_args[1].out[3] = ctx->ac.f32_0; /* W */
961 
962       if (outinfo->writes_pointsize == true)
963          pos_args[1].out[0] = psize_value;
964       if (outinfo->writes_layer == true)
965          pos_args[1].out[2] = layer_value;
966       if (outinfo->writes_viewport_index == true) {
967          if (ctx->options->gfx_level >= GFX9) {
968             /* GFX9 has the layer in out.z[10:0] and the viewport
969              * index in out.z[19:16].
970              */
971             LLVMValueRef v = viewport_value;
972             v = ac_to_integer(&ctx->ac, v);
973             v = LLVMBuildShl(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 16, false), "");
974             v = LLVMBuildOr(ctx->ac.builder, v, ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
975 
976             pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
977             pos_args[1].enabled_channels |= 1 << 2;
978          } else {
979             pos_args[1].out[3] = viewport_value;
980             pos_args[1].enabled_channels |= 1 << 3;
981          }
982       }
983 
984       if (outinfo->writes_primitive_shading_rate) {
985          pos_args[1].out[1] = primitive_shading_rate;
986       }
987    }
988 
989    /* GFX10 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
990     * Setting valid_mask=1 prevents it and has no other effect.
991     */
992    if (ctx->ac.gfx_level == GFX10)
993       pos_args[0].valid_mask = 1;
994 
995    pos_idx = 0;
996    for (i = 0; i < 4; i++) {
997       if (!pos_args[i].out[0])
998          continue;
999 
1000       /* Specify the target we are exporting */
1001       pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
1002 
1003       if (pos_idx == outinfo->pos_exports)
1004          /* Specify that this is the last export */
1005          pos_args[i].done = 1;
1006 
1007       ac_build_export(&ctx->ac, &pos_args[i]);
1008    }
1009 
1010    /* Build parameter exports */
1011    radv_build_param_exports(ctx, outputs, noutput, outinfo, export_clip_dists);
1012 }
1013 
1014 static void
handle_vs_outputs_post(struct radv_shader_context * ctx,bool export_prim_id,bool export_clip_dists,const struct radv_vs_output_info * outinfo)1015 handle_vs_outputs_post(struct radv_shader_context *ctx, bool export_prim_id, bool export_clip_dists,
1016                        const struct radv_vs_output_info *outinfo)
1017 {
1018    struct radv_shader_output_values *outputs;
1019    unsigned noutput = 0;
1020 
1021    if (ctx->shader_info->so.num_outputs && !ctx->args->is_gs_copy_shader && ctx->stage != MESA_SHADER_GEOMETRY) {
1022       /* The GS copy shader emission already emits streamout. */
1023       radv_emit_streamout(ctx, 0);
1024    }
1025 
1026    /* Allocate a temporary array for the output values. */
1027    unsigned num_outputs = util_bitcount64(ctx->output_mask);
1028    outputs = malloc(num_outputs * sizeof(outputs[0]));
1029 
1030    for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
1031       if (!(ctx->output_mask & (1ull << i)))
1032          continue;
1033 
1034       outputs[noutput].slot_name = i;
1035       outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;
1036 
1037       if (ctx->stage == MESA_SHADER_VERTEX && !ctx->args->is_gs_copy_shader) {
1038          outputs[noutput].usage_mask = ctx->shader_info->vs.output_usage_mask[i];
1039       } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
1040          outputs[noutput].usage_mask = ctx->shader_info->tes.output_usage_mask[i];
1041       } else if (ctx->args->is_gs_copy_shader|| ctx->stage == MESA_SHADER_GEOMETRY) {
1042          outputs[noutput].usage_mask = ctx->shader_info->gs.output_usage_mask[i];
1043       }
1044 
1045       for (unsigned j = 0; j < 4; j++) {
1046          outputs[noutput].values[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j));
1047       }
1048 
1049       noutput++;
1050    }
1051 
1052    radv_llvm_export_vs(ctx, outputs, noutput, outinfo, export_clip_dists);
1053 
1054    free(outputs);
1055 }
1056 
1057 static bool
si_export_mrt_color(struct radv_shader_context * ctx,LLVMValueRef * color,unsigned index,struct ac_export_args * args)1058 si_export_mrt_color(struct radv_shader_context *ctx, LLVMValueRef *color, unsigned index,
1059                     struct ac_export_args *args)
1060 {
1061    /* Export */
1062    si_llvm_init_export_args(ctx, color, 0xf, V_008DFC_SQ_EXP_MRT + index, args);
1063    if (!args->enabled_channels)
1064       return false; /* unnecessary NULL export */
1065 
1066    return true;
1067 }
1068 
1069 static void
radv_export_mrt_z(struct radv_shader_context * ctx,LLVMValueRef depth,LLVMValueRef stencil,LLVMValueRef samplemask)1070 radv_export_mrt_z(struct radv_shader_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
1071                   LLVMValueRef samplemask)
1072 {
1073    struct ac_export_args args;
1074 
1075    ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, NULL, true, &args);
1076 
1077    ac_build_export(&ctx->ac, &args);
1078 }
1079 
1080 static void
handle_fs_outputs_post(struct radv_shader_context * ctx)1081 handle_fs_outputs_post(struct radv_shader_context *ctx)
1082 {
1083    unsigned index = 0;
1084    LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
1085    struct ac_export_args color_args[8];
1086 
1087    for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
1088       LLVMValueRef values[4];
1089 
1090       if (!(ctx->output_mask & (1ull << i)))
1091          continue;
1092 
1093       if (i < FRAG_RESULT_DATA0)
1094          continue;
1095 
1096       for (unsigned j = 0; j < 4; j++)
1097          values[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j));
1098 
1099       bool ret = si_export_mrt_color(ctx, values, i - FRAG_RESULT_DATA0, &color_args[index]);
1100       if (ret)
1101          index++;
1102    }
1103 
1104    /* Process depth, stencil, samplemask. */
1105    if (ctx->shader_info->ps.writes_z) {
1106       depth = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_DEPTH, 0));
1107    }
1108    if (ctx->shader_info->ps.writes_stencil) {
1109       stencil = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_STENCIL, 0));
1110    }
1111    if (ctx->shader_info->ps.writes_sample_mask) {
1112       samplemask = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_SAMPLE_MASK, 0));
1113    }
1114 
1115    /* Set the DONE bit on last non-null color export only if Z isn't
1116     * exported.
1117     */
1118    if (index > 0 && !ctx->shader_info->ps.writes_z &&
1119        !ctx->shader_info->ps.writes_stencil &&
1120        !ctx->shader_info->ps.writes_sample_mask) {
1121       unsigned last = index - 1;
1122 
1123       color_args[last].valid_mask = 1; /* whether the EXEC mask is valid */
1124       color_args[last].done = 1;       /* DONE bit */
1125    }
1126 
1127    /* Export PS outputs. */
1128    for (unsigned i = 0; i < index; i++)
1129       ac_build_export(&ctx->ac, &color_args[i]);
1130 
1131    if (depth || stencil || samplemask)
1132       radv_export_mrt_z(ctx, depth, stencil, samplemask);
1133    else if (!index)
1134       ac_build_export_null(&ctx->ac, true);
1135 }
1136 
1137 static void
emit_gs_epilogue(struct radv_shader_context * ctx)1138 emit_gs_epilogue(struct radv_shader_context *ctx)
1139 {
1140    if (ctx->ac.gfx_level >= GFX10)
1141       ac_build_waitcnt(&ctx->ac, AC_WAIT_VSTORE);
1142 
1143    ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, ctx->gs_wave_id);
1144 }
1145 
1146 static void
handle_shader_outputs_post(struct ac_shader_abi * abi)1147 handle_shader_outputs_post(struct ac_shader_abi *abi)
1148 {
1149    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1150 
1151    switch (ctx->stage) {
1152    case MESA_SHADER_VERTEX:
1153       if (ctx->shader_info->vs.as_ls)
1154          break; /* Lowered in NIR */
1155       else if (ctx->shader_info->vs.as_es)
1156          break; /* Lowered in NIR */
1157       else if (ctx->shader_info->is_ngg)
1158          break; /* Lowered in NIR */
1159       else
1160          handle_vs_outputs_post(ctx, ctx->shader_info->vs.outinfo.export_prim_id,
1161                                 ctx->shader_info->vs.outinfo.export_clip_dists,
1162                                 &ctx->shader_info->vs.outinfo);
1163       break;
1164    case MESA_SHADER_FRAGMENT:
1165       handle_fs_outputs_post(ctx);
1166       break;
1167    case MESA_SHADER_GEOMETRY:
1168       if (ctx->shader_info->is_ngg)
1169          break; /* Lowered in NIR */
1170       else
1171          emit_gs_epilogue(ctx);
1172       break;
1173    case MESA_SHADER_TESS_CTRL:
1174       break; /* Lowered in NIR */
1175    case MESA_SHADER_TESS_EVAL:
1176       if (ctx->shader_info->tes.as_es)
1177          break; /* Lowered in NIR */
1178       else if (ctx->shader_info->is_ngg)
1179          break; /* Lowered in NIR */
1180       else
1181          handle_vs_outputs_post(ctx, ctx->shader_info->tes.outinfo.export_prim_id,
1182                                 ctx->shader_info->tes.outinfo.export_clip_dists,
1183                                 &ctx->shader_info->tes.outinfo);
1184       break;
1185    default:
1186       break;
1187    }
1188 }
1189 
1190 static void
ac_llvm_finalize_module(struct radv_shader_context * ctx,LLVMPassManagerRef passmgr)1191 ac_llvm_finalize_module(struct radv_shader_context *ctx, LLVMPassManagerRef passmgr)
1192 {
1193    LLVMRunPassManager(passmgr, ctx->ac.module);
1194    LLVMDisposeBuilder(ctx->ac.builder);
1195 
1196    ac_llvm_context_dispose(&ctx->ac);
1197 }
1198 
1199 static void
radv_llvm_visit_export_vertex(struct ac_shader_abi * abi)1200 radv_llvm_visit_export_vertex(struct ac_shader_abi *abi)
1201 {
1202    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1203    const struct radv_vs_output_info *outinfo = ctx->stage == MESA_SHADER_TESS_EVAL
1204                                          ? &ctx->shader_info->tes.outinfo
1205                                          : &ctx->shader_info->vs.outinfo;
1206 
1207    handle_vs_outputs_post(ctx, false,
1208                           outinfo->export_clip_dists,
1209                           outinfo);
1210 }
1211 
/* Load or construct the ring-buffer descriptors this stage needs
 * (ESGS, GSVS, HS tess rings) from the ring_offsets descriptor table.
 */
static void
ac_setup_rings(struct radv_shader_context *ctx)
{
   if (ctx->options->gfx_level <= GFX8 &&
       (ctx->stage == MESA_SHADER_GEOMETRY ||
        (ctx->stage == MESA_SHADER_VERTEX && ctx->shader_info->vs.as_es) ||
        (ctx->stage == MESA_SHADER_TESS_EVAL && ctx->shader_info->tes.as_es))) {
      /* Pick the GS or the ES/VS view of the ESGS ring descriptor. */
      unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS : RING_ESGS_VS;
      LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, false);

      ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, offset);
   }

   if (ctx->args->is_gs_copy_shader) {
      /* The GS copy shader reads the GSVS ring through the VS slot. */
      ctx->gsvs_ring[0] = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
                                                LLVMConstInt(ctx->ac.i32, RING_GSVS_VS, false));
   }

   if (ctx->stage == MESA_SHADER_GEOMETRY) {
      /* The conceptual layout of the GSVS ring is
       *   v0c0 .. vLv0 v0c1 .. vLc1 ..
       * but the real memory layout is swizzled across
       * threads:
       *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
       *   t16v0c0 ..
       * Override the buffer descriptor accordingly.
       */
      LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
      uint64_t stream_offset = 0;
      unsigned num_records = ctx->ac.wave_size;
      LLVMValueRef base_ring;

      base_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
                                        LLVMConstInt(ctx->ac.i32, RING_GSVS_GS, false));

      /* Derive one patched descriptor per vertex stream that has outputs. */
      for (unsigned stream = 0; stream < 4; stream++) {
         unsigned num_components, stride;
         LLVMValueRef ring, tmp;

         num_components = ctx->shader_info->gs.num_stream_output_components[stream];

         if (!num_components)
            continue;

         stride = 4 * num_components * ctx->shader->info.gs.vertices_out;

         /* Limit on the stride field for <= GFX7. */
         assert(stride < (1 << 14));

         /* Add the accumulated per-stream byte offset to the 64-bit base
          * address held in dwords 0-1 of the descriptor.
          */
         ring = LLVMBuildBitCast(ctx->ac.builder, base_ring, v2i64, "");
         tmp = LLVMBuildExtractElement(ctx->ac.builder, ring, ctx->ac.i32_0, "");
         tmp = LLVMBuildAdd(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), "");
         ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp, ctx->ac.i32_0, "");

         stream_offset += stride * ctx->ac.wave_size;

         ring = LLVMBuildBitCast(ctx->ac.builder, ring, ctx->ac.v4i32, "");

         /* OR the stream's stride into dword 1 of the descriptor. */
         tmp = LLVMBuildExtractElement(ctx->ac.builder, ring, ctx->ac.i32_1, "");
         tmp = LLVMBuildOr(ctx->ac.builder, tmp,
                           LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride), false), "");
         ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp, ctx->ac.i32_1, "");

         /* Set num_records (dword 2) to the wave size. */
         ring = LLVMBuildInsertElement(ctx->ac.builder, ring,
                                       LLVMConstInt(ctx->ac.i32, num_records, false),
                                       LLVMConstInt(ctx->ac.i32, 2, false), "");

         ctx->gsvs_ring[stream] = ring;
      }
   }

   if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_TESS_EVAL) {
      ctx->hs_ring_tess_offchip = ac_build_load_to_sgpr(
         &ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_OFFCHIP, false));
      ctx->hs_ring_tess_factor = ac_build_load_to_sgpr(
         &ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_FACTOR, false));
   }
}
1290 
1291 /* Fixup the HW not emitting the TCS regs if there are no HS threads. */
1292 static void
ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context * ctx)1293 ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context *ctx)
1294 {
1295    LLVMValueRef count =
1296       ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 8, 8);
1297    LLVMValueRef hs_empty = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, count, ctx->ac.i32_0, "");
1298    ctx->abi.instance_id =
1299       LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id),
1300                       ctx->abi.instance_id, "");
1301    ctx->vs_rel_patch_id =
1302       LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids),
1303                       ctx->vs_rel_patch_id, "");
1304    ctx->abi.vertex_id =
1305       LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.tcs_patch_id),
1306                       ctx->abi.vertex_id, "");
1307 }
1308 
1309 static void
prepare_gs_input_vgprs(struct radv_shader_context * ctx,bool merged)1310 prepare_gs_input_vgprs(struct radv_shader_context *ctx, bool merged)
1311 {
1312    if (merged) {
1313       ctx->gs_wave_id =
1314          ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 16, 8);
1315    } else {
1316       ctx->gs_wave_id = ac_get_arg(&ctx->ac, ctx->args->ac.gs_wave_id);
1317    }
1318 }
1319 
1320 /* Ensure that the esgs ring is declared.
1321  *
1322  * We declare it with 64KB alignment as a hint that the
1323  * pointer value will always be 0.
1324  */
1325 static void
declare_esgs_ring(struct radv_shader_context * ctx)1326 declare_esgs_ring(struct radv_shader_context *ctx)
1327 {
1328    if (ctx->esgs_ring)
1329       return;
1330 
1331    assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));
1332 
1333    ctx->esgs_ring = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
1334                                                 "esgs_ring", AC_ADDR_SPACE_LDS);
1335    LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);
1336    LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
1337 }
1338 
radv_intrinsic_load(struct ac_shader_abi * abi,nir_intrinsic_op op)1339 static LLVMValueRef radv_intrinsic_load(struct ac_shader_abi *abi, nir_intrinsic_op op)
1340 {
1341    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
1342 
1343    switch (op) {
1344    case nir_intrinsic_load_base_vertex:
1345    case nir_intrinsic_load_first_vertex:
1346       return radv_load_base_vertex(abi, op == nir_intrinsic_load_base_vertex);
1347    case nir_intrinsic_load_ring_tess_factors_amd:
1348       return ctx->hs_ring_tess_factor;
1349    case nir_intrinsic_load_ring_tess_offchip_amd:
1350       return ctx->hs_ring_tess_offchip;
1351    case nir_intrinsic_load_ring_esgs_amd:
1352       return ctx->esgs_ring;
1353    default:
1354       return NULL;
1355    }
1356 }
1357 
1358 static LLVMModuleRef
ac_translate_nir_to_llvm(struct ac_llvm_compiler * ac_llvm,const struct radv_nir_compiler_options * options,const struct radv_shader_info * info,struct nir_shader * const * shaders,int shader_count,const struct radv_shader_args * args)1359 ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
1360                          const struct radv_nir_compiler_options *options,
1361                          const struct radv_shader_info *info,
1362                          struct nir_shader *const *shaders, int shader_count,
1363                          const struct radv_shader_args *args)
1364 {
1365    struct radv_shader_context ctx = {0};
1366    ctx.args = args;
1367    ctx.options = options;
1368    ctx.shader_info = info;
1369 
1370    enum ac_float_mode float_mode = AC_FLOAT_MODE_DEFAULT;
1371 
1372    if (shaders[0]->info.float_controls_execution_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) {
1373       float_mode = AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO;
1374    }
1375 
1376    ac_llvm_context_init(&ctx.ac, ac_llvm, options->gfx_level, options->family,
1377                         options->has_3d_cube_border_color_mipmap,
1378                         float_mode, info->wave_size, info->ballot_bit_size);
1379    ctx.context = ctx.ac.context;
1380 
1381    ctx.max_workgroup_size = info->workgroup_size;
1382 
1383    create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2);
1384 
1385    ctx.abi.intrinsic_load = radv_intrinsic_load;
1386    ctx.abi.load_ubo = radv_load_ubo;
1387    ctx.abi.load_ssbo = radv_load_ssbo;
1388    ctx.abi.load_sampler_desc = radv_get_sampler_desc;
1389    ctx.abi.clamp_shadow_reference = false;
1390    ctx.abi.robust_buffer_access = options->robust_buffer_access;
1391    ctx.abi.load_grid_size_from_user_sgpr = args->load_grid_size_from_user_sgpr;
1392 
1393    bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && info->is_ngg;
1394    if (shader_count >= 2 || is_ngg)
1395       ac_init_exec_full_mask(&ctx.ac);
1396 
1397    if (args->ac.vertex_id.used)
1398       ctx.abi.vertex_id = ac_get_arg(&ctx.ac, args->ac.vertex_id);
1399    if (args->ac.vs_rel_patch_id.used)
1400       ctx.vs_rel_patch_id = ac_get_arg(&ctx.ac, args->ac.vs_rel_patch_id);
1401    if (args->ac.instance_id.used)
1402       ctx.abi.instance_id = ac_get_arg(&ctx.ac, args->ac.instance_id);
1403 
1404    if (options->has_ls_vgpr_init_bug &&
1405        shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL)
1406       ac_nir_fixup_ls_hs_input_vgprs(&ctx);
1407 
1408    if (is_ngg) {
1409       ctx.abi.export_vertex = radv_llvm_visit_export_vertex;
1410 
1411       if (!info->is_ngg_passthrough)
1412          declare_esgs_ring(&ctx);
1413 
1414       if (ctx.stage == MESA_SHADER_GEOMETRY) {
1415          /* Scratch space used by NGG GS for repacking vertices at the end. */
1416          LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, 8);
1417          LLVMValueRef gs_ngg_scratch =
1418             LLVMAddGlobalInAddressSpace(ctx.ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
1419          LLVMSetInitializer(gs_ngg_scratch, LLVMGetUndef(ai32));
1420          LLVMSetLinkage(gs_ngg_scratch, LLVMExternalLinkage);
1421          LLVMSetAlignment(gs_ngg_scratch, 4);
1422 
1423          /* Vertex emit space used by NGG GS for storing all vertex attributes. */
1424          LLVMValueRef gs_ngg_emit =
1425             LLVMAddGlobalInAddressSpace(ctx.ac.module, LLVMArrayType(ctx.ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
1426          LLVMSetInitializer(gs_ngg_emit, LLVMGetUndef(ai32));
1427          LLVMSetLinkage(gs_ngg_emit, LLVMExternalLinkage);
1428          LLVMSetAlignment(gs_ngg_emit, 4);
1429       }
1430 
1431       /* GFX10 hang workaround - there needs to be an s_barrier before gs_alloc_req always */
1432       if (ctx.ac.gfx_level == GFX10 && shader_count == 1)
1433          ac_build_s_barrier(&ctx.ac, shaders[0]->info.stage);
1434    }
1435 
1436    for (int shader_idx = 0; shader_idx < shader_count; ++shader_idx) {
1437       ctx.stage = shaders[shader_idx]->info.stage;
1438       ctx.shader = shaders[shader_idx];
1439       ctx.output_mask = 0;
1440 
1441       if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY && !ctx.shader_info->is_ngg) {
1442          ctx.abi.emit_vertex_with_counter = visit_emit_vertex_with_counter;
1443          ctx.abi.emit_primitive = visit_end_primitive;
1444       } else if (shaders[shader_idx]->info.stage == MESA_SHADER_TESS_EVAL) {
1445       } else if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX) {
1446          ctx.abi.load_inputs = radv_load_vs_inputs;
1447       } else if (shaders[shader_idx]->info.stage == MESA_SHADER_FRAGMENT) {
1448          ctx.abi.load_sample_position = load_sample_position;
1449       }
1450 
1451       if (shader_idx && !(shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY && info->is_ngg)) {
1452          /* Execute a barrier before the second shader in
1453           * a merged shader.
1454           *
1455           * Execute the barrier inside the conditional block,
1456           * so that empty waves can jump directly to s_endpgm,
1457           * which will also signal the barrier.
1458           *
1459           * This is possible in gfx9, because an empty wave
1460           * for the second shader does not participate in
1461           * the epilogue. With NGG, empty waves may still
1462           * be required to export data (e.g. GS output vertices),
1463           * so we cannot let them exit early.
1464           *
1465           * If the shader is TCS and the TCS epilog is present
1466           * and contains a barrier, it will wait there and then
1467           * reach s_endpgm.
1468           */
1469          ac_build_waitcnt(&ctx.ac, AC_WAIT_LGKM);
1470          ac_build_s_barrier(&ctx.ac, shaders[shader_idx]->info.stage);
1471       }
1472 
1473       nir_foreach_shader_out_variable(variable, shaders[shader_idx]) scan_shader_output_decl(
1474          &ctx, variable, shaders[shader_idx], shaders[shader_idx]->info.stage);
1475 
1476       ac_setup_rings(&ctx);
1477 
1478       bool check_merged_wave_info = shader_count >= 2 && !(is_ngg && shader_idx == 1);
1479       LLVMBasicBlockRef merge_block = NULL;
1480 
1481       if (check_merged_wave_info) {
1482          LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
1483          LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
1484          merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
1485 
1486          LLVMValueRef count = ac_unpack_param(
1487             &ctx.ac, ac_get_arg(&ctx.ac, args->ac.merged_wave_info), 8 * shader_idx, 8);
1488          LLVMValueRef thread_id = ac_get_thread_id(&ctx.ac);
1489          LLVMValueRef cond = LLVMBuildICmp(ctx.ac.builder, LLVMIntULT, thread_id, count, "");
1490          LLVMBuildCondBr(ctx.ac.builder, cond, then_block, merge_block);
1491 
1492          LLVMPositionBuilderAtEnd(ctx.ac.builder, then_block);
1493       }
1494 
1495       if (shaders[shader_idx]->info.stage == MESA_SHADER_FRAGMENT)
1496          prepare_interp_optimize(&ctx, shaders[shader_idx]);
1497       else if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY && !info->is_ngg)
1498          prepare_gs_input_vgprs(&ctx, shader_count >= 2);
1499 
1500       ac_nir_translate(&ctx.ac, &ctx.abi, &args->ac, shaders[shader_idx]);
1501 
1502       if (!gl_shader_stage_is_compute(shaders[shader_idx]->info.stage))
1503          handle_shader_outputs_post(&ctx.abi);
1504 
1505       if (check_merged_wave_info) {
1506          LLVMBuildBr(ctx.ac.builder, merge_block);
1507          LLVMPositionBuilderAtEnd(ctx.ac.builder, merge_block);
1508       }
1509    }
1510 
1511    LLVMBuildRetVoid(ctx.ac.builder);
1512 
1513    if (options->dump_preoptir) {
1514       fprintf(stderr, "%s LLVM IR:\n\n",
1515               radv_get_shader_name(info, shaders[shader_count - 1]->info.stage));
1516       ac_dump_module(ctx.ac.module);
1517       fprintf(stderr, "\n");
1518    }
1519 
1520    ac_llvm_finalize_module(&ctx, ac_llvm->passmgr);
1521 
1522    return ctx.ac.module;
1523 }
1524 
1525 static void
ac_diagnostic_handler(LLVMDiagnosticInfoRef di,void * context)1526 ac_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
1527 {
1528    unsigned *retval = (unsigned *)context;
1529    LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
1530    char *description = LLVMGetDiagInfoDescription(di);
1531 
1532    if (severity == LLVMDSError) {
1533       *retval = 1;
1534       fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n", description);
1535    }
1536 
1537    LLVMDisposeMessage(description);
1538 }
1539 
1540 static unsigned
radv_llvm_compile(LLVMModuleRef M,char ** pelf_buffer,size_t * pelf_size,struct ac_llvm_compiler * ac_llvm)1541 radv_llvm_compile(LLVMModuleRef M, char **pelf_buffer, size_t *pelf_size,
1542                   struct ac_llvm_compiler *ac_llvm)
1543 {
1544    unsigned retval = 0;
1545    LLVMContextRef llvm_ctx;
1546 
1547    /* Setup Diagnostic Handler*/
1548    llvm_ctx = LLVMGetModuleContext(M);
1549 
1550    LLVMContextSetDiagnosticHandler(llvm_ctx, ac_diagnostic_handler, &retval);
1551 
1552    /* Compile IR*/
1553    if (!radv_compile_to_elf(ac_llvm, M, pelf_buffer, pelf_size))
1554       retval = 1;
1555    return retval;
1556 }
1557 
1558 static void
ac_compile_llvm_module(struct ac_llvm_compiler * ac_llvm,LLVMModuleRef llvm_module,struct radv_shader_binary ** rbinary,gl_shader_stage stage,const char * name,const struct radv_nir_compiler_options * options)1559 ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm, LLVMModuleRef llvm_module,
1560                        struct radv_shader_binary **rbinary, gl_shader_stage stage, const char *name,
1561                        const struct radv_nir_compiler_options *options)
1562 {
1563    char *elf_buffer = NULL;
1564    size_t elf_size = 0;
1565    char *llvm_ir_string = NULL;
1566 
1567    if (options->dump_shader) {
1568       fprintf(stderr, "%s LLVM IR:\n\n", name);
1569       ac_dump_module(llvm_module);
1570       fprintf(stderr, "\n");
1571    }
1572 
1573    if (options->record_ir) {
1574       char *llvm_ir = LLVMPrintModuleToString(llvm_module);
1575       llvm_ir_string = strdup(llvm_ir);
1576       LLVMDisposeMessage(llvm_ir);
1577    }
1578 
1579    int v = radv_llvm_compile(llvm_module, &elf_buffer, &elf_size, ac_llvm);
1580    if (v) {
1581       fprintf(stderr, "compile failed\n");
1582    }
1583 
1584    LLVMContextRef ctx = LLVMGetModuleContext(llvm_module);
1585    LLVMDisposeModule(llvm_module);
1586    LLVMContextDispose(ctx);
1587 
1588    size_t llvm_ir_size = llvm_ir_string ? strlen(llvm_ir_string) : 0;
1589    size_t alloc_size = sizeof(struct radv_shader_binary_rtld) + elf_size + llvm_ir_size + 1;
1590    struct radv_shader_binary_rtld *rbin = calloc(1, alloc_size);
1591    memcpy(rbin->data, elf_buffer, elf_size);
1592    if (llvm_ir_string)
1593       memcpy(rbin->data + elf_size, llvm_ir_string, llvm_ir_size + 1);
1594 
1595    rbin->base.type = RADV_BINARY_TYPE_RTLD;
1596    rbin->base.stage = stage;
1597    rbin->base.total_size = alloc_size;
1598    rbin->elf_size = elf_size;
1599    rbin->llvm_ir_size = llvm_ir_size;
1600    *rbinary = &rbin->base;
1601 
1602    free(llvm_ir_string);
1603    free(elf_buffer);
1604 }
1605 
1606 static void
radv_compile_nir_shader(struct ac_llvm_compiler * ac_llvm,const struct radv_nir_compiler_options * options,const struct radv_shader_info * info,struct radv_shader_binary ** rbinary,const struct radv_shader_args * args,struct nir_shader * const * nir,int nir_count)1607 radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm,
1608                         const struct radv_nir_compiler_options *options,
1609                         const struct radv_shader_info *info,
1610                         struct radv_shader_binary **rbinary,
1611                         const struct radv_shader_args *args, struct nir_shader *const *nir,
1612                         int nir_count)
1613 {
1614 
1615    LLVMModuleRef llvm_module;
1616 
1617    llvm_module = ac_translate_nir_to_llvm(ac_llvm, options, info, nir, nir_count, args);
1618 
1619    ac_compile_llvm_module(ac_llvm, llvm_module, rbinary, nir[nir_count - 1]->info.stage,
1620                           radv_get_shader_name(info, nir[nir_count - 1]->info.stage),
1621                           options);
1622 }
1623 
/* Emit the body of the GS copy shader: read the current vertex's attributes
 * back from the GSVS ring buffer (stream 0) and re-emit them as VS outputs,
 * handling streamout for all streams that have outputs. */
static void
ac_gs_copy_shader_emit(struct radv_shader_context *ctx)
{
   /* Per-vertex byte offset into the ring; each component occupies a dword,
    * so vertex_id is scaled by 4. */
   LLVMValueRef vtx_offset =
      LLVMBuildMul(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id),
                   LLVMConstInt(ctx->ac.i32, 4, false), "");
   LLVMValueRef stream_id;

   /* Fetch the vertex stream ID. */
   if (ctx->shader_info->so.num_outputs) {
      /* Stream ID is packed in streamout_config bits [25:24]. */
      stream_id =
         ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config), 24, 2);
   } else {
      /* Without streamout only stream 0 is ever processed. */
      stream_id = ctx->ac.i32_0;
   }

   /* Build a switch over the stream ID; each active stream gets its own
    * basic block, all of which branch to a common "end" block. */
   LLVMBasicBlockRef end_bb;
   LLVMValueRef switch_inst;

   end_bb = LLVMAppendBasicBlockInContext(ctx->ac.context, ctx->main_function, "end");
   switch_inst = LLVMBuildSwitch(ctx->ac.builder, stream_id, end_bb, 4);

   for (unsigned stream = 0; stream < 4; stream++) {
      unsigned num_components = ctx->shader_info->gs.num_stream_output_components[stream];
      LLVMBasicBlockRef bb;
      unsigned offset;

      /* Stream 0 is always emitted (it feeds the VS exports); higher streams
       * only matter when they have components and streamout is in use. */
      if (stream > 0 && !num_components)
         continue;

      if (stream > 0 && !ctx->shader_info->so.num_outputs)
         continue;

      bb = LLVMInsertBasicBlockInContext(ctx->ac.context, end_bb, "out");
      LLVMAddCase(switch_inst, LLVMConstInt(ctx->ac.i32, stream, 0), bb);
      LLVMPositionBuilderAtEnd(ctx->ac.builder, bb);

      /* 'offset' counts used components of this stream in ring layout order. */
      offset = 0;
      for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
         unsigned output_usage_mask = ctx->shader_info->gs.output_usage_mask[i];
         unsigned output_stream = ctx->shader_info->gs.output_streams[i];
         int length = util_last_bit(output_usage_mask);

         /* Skip outputs the GS never wrote or that belong to other streams. */
         if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)
            continue;

         for (unsigned j = 0; j < length; j++) {
            LLVMValueRef value, soffset;

            if (!(output_usage_mask & (1 << j)))
               continue;

            /* Each component slot spans vertices_out * 16 * 4 bytes in the
             * ring; the vertex index selects within the slot via vtx_offset. */
            soffset = LLVMConstInt(ctx->ac.i32, offset * ctx->shader->info.gs.vertices_out * 16 * 4,
                                   false);

            offset++;

            value = ac_build_buffer_load(&ctx->ac, ctx->gsvs_ring[0], 1, ctx->ac.i32_0, vtx_offset,
                                         soffset, ctx->ac.f32, ac_glc | ac_slc, true, false);

            /* 16-bit outputs were stored widened to 32 bits in the ring;
             * truncate back to the declared output type. */
            LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
            if (ac_get_type_size(type) == 2) {
               value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->ac.i32, "");
               value = LLVMBuildTrunc(ctx->ac.builder, value, ctx->ac.i16, "");
            }

            LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, value),
                           ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
         }
      }

      if (ctx->shader_info->so.num_outputs)
         radv_emit_streamout(ctx, stream);

      /* Only stream 0 feeds the fixed-function VS export path. */
      if (stream == 0) {
         handle_vs_outputs_post(ctx, false, ctx->shader_info->vs.outinfo.export_clip_dists,
                                &ctx->shader_info->vs.outinfo);
      }

      LLVMBuildBr(ctx->ac.builder, end_bb);
   }

   LLVMPositionBuilderAtEnd(ctx->ac.builder, end_bb);
}
1708 
1709 static void
radv_compile_gs_copy_shader(struct ac_llvm_compiler * ac_llvm,const struct radv_nir_compiler_options * options,const struct radv_shader_info * info,struct nir_shader * geom_shader,struct radv_shader_binary ** rbinary,const struct radv_shader_args * args)1710 radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm,
1711                             const struct radv_nir_compiler_options *options,
1712                             const struct radv_shader_info *info,
1713                             struct nir_shader *geom_shader,
1714                             struct radv_shader_binary **rbinary,
1715                             const struct radv_shader_args *args)
1716 {
1717    struct radv_shader_context ctx = {0};
1718    ctx.args = args;
1719    ctx.options = options;
1720    ctx.shader_info = info;
1721 
1722    assert(args->is_gs_copy_shader);
1723 
1724    ac_llvm_context_init(&ctx.ac, ac_llvm, options->gfx_level, options->family,
1725                         options->has_3d_cube_border_color_mipmap,
1726                         AC_FLOAT_MODE_DEFAULT, 64, 64);
1727    ctx.context = ctx.ac.context;
1728 
1729    ctx.stage = MESA_SHADER_VERTEX;
1730    ctx.shader = geom_shader;
1731 
1732    create_function(&ctx, MESA_SHADER_VERTEX, false);
1733 
1734    ac_setup_rings(&ctx);
1735 
1736    nir_foreach_shader_out_variable(variable, geom_shader)
1737    {
1738       scan_shader_output_decl(&ctx, variable, geom_shader, MESA_SHADER_VERTEX);
1739       ac_handle_shader_output_decl(&ctx.ac, &ctx.abi, geom_shader, variable, MESA_SHADER_VERTEX);
1740    }
1741 
1742    ac_gs_copy_shader_emit(&ctx);
1743 
1744    LLVMBuildRetVoid(ctx.ac.builder);
1745 
1746    ac_llvm_finalize_module(&ctx, ac_llvm->passmgr);
1747 
1748    ac_compile_llvm_module(ac_llvm, ctx.ac.module, rbinary, MESA_SHADER_VERTEX, "GS Copy Shader",
1749                           options);
1750    (*rbinary)->is_gs_copy_shader = true;
1751 }
1752 
1753 void
llvm_compile_shader(const struct radv_nir_compiler_options * options,const struct radv_shader_info * info,unsigned shader_count,struct nir_shader * const * shaders,struct radv_shader_binary ** binary,const struct radv_shader_args * args)1754 llvm_compile_shader(const struct radv_nir_compiler_options *options,
1755                     const struct radv_shader_info *info, unsigned shader_count,
1756                     struct nir_shader *const *shaders, struct radv_shader_binary **binary,
1757                     const struct radv_shader_args *args)
1758 {
1759    enum ac_target_machine_options tm_options = 0;
1760    struct ac_llvm_compiler ac_llvm;
1761 
1762    tm_options |= AC_TM_SUPPORTS_SPILL;
1763    if (options->check_ir)
1764       tm_options |= AC_TM_CHECK_IR;
1765 
1766    radv_init_llvm_compiler(&ac_llvm, options->family, tm_options, info->wave_size);
1767 
1768    if (args->is_gs_copy_shader) {
1769       radv_compile_gs_copy_shader(&ac_llvm, options, info, *shaders, binary, args);
1770    } else {
1771       radv_compile_nir_shader(&ac_llvm, options, info, binary, args, shaders, shader_count);
1772    }
1773 }
1774