/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir/nir.h"
#include "radv_debug.h"
#include "radv_llvm_helper.h"
#include "radv_private.h"
#include "radv_shader.h"
#include "radv_shader_args.h"

#include "ac_binary.h"
#include "ac_nir.h"
#include "ac_llvm_build.h"
#include "ac_nir_to_llvm.h"
#include "ac_shader_abi.h"
#include "ac_shader_util.h"
#include "sid.h"

struct radv_shader_context {
   struct ac_llvm_context ac;
   const struct nir_shader *shader;
   struct ac_shader_abi abi;
   const struct radv_nir_compiler_options *options;
   const struct radv_shader_info *shader_info;
   const struct radv_shader_args *args;

   gl_shader_stage stage;

   unsigned max_workgroup_size;
   LLVMContextRef context;
   LLVMValueRef main_function;

   LLVMValueRef descriptor_sets[MAX_SETS];

   LLVMValueRef ring_offsets;

   LLVMValueRef vs_rel_patch_id;

   LLVMValueRef gs_wave_id;

   LLVMValueRef esgs_ring;
   LLVMValueRef gsvs_ring[4];
   LLVMValueRef hs_ring_tess_offchip;
   LLVMValueRef hs_ring_tess_factor;

   uint64_t output_mask;
};

struct radv_shader_output_values {
   LLVMValueRef values[4];
   unsigned slot_name;
   unsigned slot_index;
   unsigned usage_mask;
};

static inline struct radv_shader_context *
radv_shader_context_from_abi(struct ac_shader_abi *abi)
{
   return container_of(abi, struct radv_shader_context, abi);
}

static LLVMValueRef
create_llvm_function(struct ac_llvm_context *ctx, LLVMModuleRef module, LLVMBuilderRef builder,
                     const struct ac_shader_args *args, enum ac_llvm_calling_convention convention,
                     unsigned max_workgroup_size, const struct radv_nir_compiler_options *options)
{
   LLVMValueRef main_function = ac_build_main(args, ctx, convention, "main", ctx->voidt, module);

   if (options->address32_hi) {
      ac_llvm_add_target_dep_function_attr(main_function, "amdgpu-32bit-address-high-bits",
                                           options->address32_hi);
   }

   ac_llvm_set_workgroup_size(main_function, max_workgroup_size);
   ac_llvm_set_target_features(main_function, ctx);

   return main_function;
}

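/* Bind the descriptor set pointers for this shader.
 *
 * Sets are either passed directly as user SGPRs, or, when not enough user
 * SGPRs are available, indirectly: a single user SGPR then points to an array
 * of set pointers that each used set is loaded from (the
 * AC_UD_INDIRECT_DESCRIPTOR_SETS case below).
 */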
static void
load_descriptor_sets(struct radv_shader_context *ctx)
{
   const struct radv_userdata_locations *user_sgprs_locs = &ctx->shader_info->user_sgprs_locs;
   uint32_t mask = ctx->shader_info->desc_set_used_mask;

   if (user_sgprs_locs->shader_data[AC_UD_INDIRECT_DESCRIPTOR_SETS].sgpr_idx != -1) {
      LLVMValueRef desc_sets = ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[0]);
      while (mask) {
         int i = u_bit_scan(&mask);

         ctx->descriptor_sets[i] =
            ac_build_load_to_sgpr(&ctx->ac, desc_sets, LLVMConstInt(ctx->ac.i32, i, false));
         LLVMSetAlignment(ctx->descriptor_sets[i], 4);
      }
   } else {
      while (mask) {
         int i = u_bit_scan(&mask);

         ctx->descriptor_sets[i] = ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[i]);
      }
   }
}

static enum ac_llvm_calling_convention
get_llvm_calling_convention(LLVMValueRef func, gl_shader_stage stage)
{
   switch (stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      return AC_LLVM_AMDGPU_VS;
   case MESA_SHADER_GEOMETRY:
      return AC_LLVM_AMDGPU_GS;
   case MESA_SHADER_TESS_CTRL:
      return AC_LLVM_AMDGPU_HS;
   case MESA_SHADER_FRAGMENT:
      return AC_LLVM_AMDGPU_PS;
   case MESA_SHADER_COMPUTE:
      return AC_LLVM_AMDGPU_CS;
   default:
      unreachable("Unhandled shader type");
   }
}

/* Returns whether the stage is a stage that can be directly before the GS */
static bool
is_pre_gs_stage(gl_shader_stage stage)
{
   return stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL;
}

static void
create_function(struct radv_shader_context *ctx, gl_shader_stage stage, bool has_previous_stage)
{
   if (ctx->ac.gfx_level >= GFX10) {
      if (is_pre_gs_stage(stage) && ctx->shader_info->is_ngg) {
         /* On GFX10+, VS and TES are merged into GS for NGG. */
         stage = MESA_SHADER_GEOMETRY;
         has_previous_stage = true;
      }
   }

   ctx->main_function =
      create_llvm_function(&ctx->ac, ctx->ac.module, ctx->ac.builder, &ctx->args->ac,
                           get_llvm_calling_convention(ctx->main_function, stage),
                           ctx->max_workgroup_size, ctx->options);

   ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr",
                                          LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST), NULL, 0,
                                          AC_FUNC_ATTR_READNONE);
   ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets,
                                        ac_array_in_const_addr_space(ctx->ac.v4i32), "");

   load_descriptor_sets(ctx);

   if (stage == MESA_SHADER_TESS_CTRL ||
       (stage == MESA_SHADER_VERTEX && ctx->shader_info->vs.as_ls) ||
       ctx->shader_info->is_ngg ||
       /* GFX9 has the ESGS ring buffer in LDS. */
       (stage == MESA_SHADER_GEOMETRY && has_previous_stage)) {
      ac_declare_lds_as_pointer(&ctx->ac);
   }
}

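/* Compute the offset of the first position for the given sample count in the
 * sample-positions ring. The positions are laid out consecutively by sample
 * count (1x at index 0, 2x at 1, 4x at 3, 8x at 7), which is why the offset
 * below is simply num_samples - 1.
 */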
static uint32_t
radv_get_sample_pos_offset(uint32_t num_samples)
{
   uint32_t sample_pos_offset = 0;

   switch (num_samples) {
   case 2:
      sample_pos_offset = 1;
      break;
   case 4:
      sample_pos_offset = 3;
      break;
   case 8:
      sample_pos_offset = 7;
      break;
   default:
      break;
   }
   return sample_pos_offset;
}

static LLVMValueRef
load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

   LLVMValueRef result;
   LLVMValueRef index = LLVMConstInt(ctx->ac.i32, RING_PS_SAMPLE_POSITIONS, false);
   LLVMValueRef ptr = LLVMBuildGEP(ctx->ac.builder, ctx->ring_offsets, &index, 1, "");

   ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ac_array_in_const_addr_space(ctx->ac.v2f32), "");

   uint32_t sample_pos_offset = radv_get_sample_pos_offset(ctx->options->key.ps.num_samples);

   sample_id = LLVMBuildAdd(ctx->ac.builder, sample_id,
                            LLVMConstInt(ctx->ac.i32, sample_pos_offset, false), "");
   result = ac_build_load_invariant(&ctx->ac, ptr, sample_id);

   return result;
}

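/* Write one vertex to the GSVS ring (legacy, non-NGG GS).
 *
 * The ring is addressed component-major: all emitted vertices of a given
 * output component are stored contiguously, so the byte offset of component
 * slot "offset" for vertex "vertexidx" is
 *
 *    (offset * gs.vertices_out + vertexidx) * 4, relative to gs2vs_offset.
 *
 * The GS copy shader later reads these values back and performs the actual
 * position/parameter exports.
 */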
static void
visit_emit_vertex_with_counter(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef vertexidx,
                               LLVMValueRef *addrs)
{
   unsigned offset = 0;
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

   for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
      unsigned output_usage_mask = ctx->shader_info->gs.output_usage_mask[i];
      uint8_t output_stream = ctx->shader_info->gs.output_streams[i];
      LLVMValueRef *out_ptr = &addrs[i * 4];
      int length = util_last_bit(output_usage_mask);

      if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)
         continue;

      for (unsigned j = 0; j < length; j++) {
         if (!(output_usage_mask & (1 << j)))
            continue;

         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");
         LLVMValueRef voffset =
            LLVMConstInt(ctx->ac.i32, offset * ctx->shader->info.gs.vertices_out, false);

         offset++;

         voffset = LLVMBuildAdd(ctx->ac.builder, voffset, vertexidx, "");
         voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), "");

         out_val = ac_to_integer(&ctx->ac, out_val);
         out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");

         ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, NULL, voffset,
                                     ac_get_arg(&ctx->ac, ctx->args->ac.gs2vs_offset),
                                     ac_glc | ac_slc | ac_swizzled);
      }
   }

   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
                    ctx->gs_wave_id);
}

static void
visit_end_primitive(struct ac_shader_abi *abi, unsigned stream)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
                    ctx->gs_wave_id);
}

static LLVMValueRef
radv_load_base_vertex(struct ac_shader_abi *abi, bool non_indexed_is_zero)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
   return ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex);
}

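/* Load a buffer/image resource descriptor.
 *
 * Descriptor "pointers" reach this code in one of two forms: an i32 offset
 * into the 32-bit constant address space, or an already-loaded descriptor
 * vector. In the former case, the offset is converted to a pointer and the
 * descriptor is loaded, marked uniform and invariant so LLVM can hoist and
 * CSE the load freely.
 */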
static LLVMValueRef
radv_load_rsrc(struct radv_shader_context *ctx, LLVMValueRef ptr, LLVMTypeRef type)
{
   if (ptr && LLVMTypeOf(ptr) == ctx->ac.i32) {
      LLVMValueRef result;

      LLVMTypeRef ptr_type = LLVMPointerType(type, AC_ADDR_SPACE_CONST_32BIT);
      ptr = LLVMBuildIntToPtr(ctx->ac.builder, ptr, ptr_type, "");
      LLVMSetMetadata(ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);

      result = LLVMBuildLoad(ctx->ac.builder, ptr, "");
      LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);

      return result;
   }

   return ptr;
}

static LLVMValueRef
radv_load_ubo(struct ac_shader_abi *abi, LLVMValueRef buffer_ptr)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
   return radv_load_rsrc(ctx, buffer_ptr, ctx->ac.v4i32);
}

static LLVMValueRef
radv_load_ssbo(struct ac_shader_abi *abi, LLVMValueRef buffer_ptr, bool write, bool non_uniform)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
   return radv_load_rsrc(ctx, buffer_ptr, ctx->ac.v4i32);
}

static LLVMValueRef
radv_get_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set, unsigned base_index,
                      unsigned constant_index, LLVMValueRef index,
                      enum ac_descriptor_type desc_type, bool image, bool write, bool bindless)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

   if (image && desc_type == AC_DESC_FMASK)
      return NULL;
   /* 3-plane formats always have the same size and format for planes 1 and 2,
    * so use the tail of the plane 1 descriptor; this lets us store only the
    * first 16 bytes of the last plane's descriptor. */
   if (desc_type == AC_DESC_PLANE_2 && index && LLVMTypeOf(index) == ctx->ac.i32) {
      LLVMValueRef plane1_addr =
         LLVMBuildSub(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, 32, false), "");
      LLVMValueRef descriptor1 = radv_load_rsrc(ctx, plane1_addr, ctx->ac.v8i32);
      LLVMValueRef descriptor2 = radv_load_rsrc(ctx, index, ctx->ac.v4i32);

      LLVMValueRef components[8];
      for (unsigned i = 0; i < 4; ++i)
         components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor2, i);

      for (unsigned i = 4; i < 8; ++i)
         components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor1, i);
      return ac_build_gather_values(&ctx->ac, components, 8);
   }

   bool v4 = desc_type == AC_DESC_BUFFER || desc_type == AC_DESC_SAMPLER;
   return radv_load_rsrc(ctx, index, v4 ? ctx->ac.v4i32 : ctx->ac.v8i32);
}

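/* Pad a vertex fetch result out to 4 channels.
 *
 * Missing components are filled with the standard (0, 0, 0, 1) defaults:
 * e.g. a 2-channel R32G32 fetch of (x, y) becomes (x, y, 0, 1). The result
 * is always a 4-component vector so later code can index it freely.
 */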
static LLVMValueRef
radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx, LLVMValueRef value,
                                unsigned num_channels, bool is_float)
{
   LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0;
   LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1;
   LLVMValueRef chan[4];

   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
      unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));

      if (num_channels == 4 && num_channels == vec_size)
         return value;

      num_channels = MIN2(num_channels, vec_size);

      for (unsigned i = 0; i < num_channels; i++)
         chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i);
   } else {
      assert(num_channels == 1);
      chan[0] = value;
   }

   for (unsigned i = num_channels; i < 4; i++) {
      chan[i] = i == 3 ? one : zero;
      chan[i] = ac_to_integer(&ctx->ac, chan[i]);
   }

   return ac_build_gather_values(&ctx->ac, chan, 4);
}

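/* Fetch a vertex attribute from its vertex buffer.
 *
 * The packed vertex_attribute_formats entry from the pipeline key is decoded
 * below as a small bitfield, matching the hardware buffer format enums:
 *
 *    data_format = attrib_format & 0x0f;        // V_008F0C_BUF_DATA_FORMAT_*
 *    num_format  = (attrib_format >> 4) & 0x07; // V_008F0C_BUF_NUM_FORMAT_*
 */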
static void
load_vs_input(struct radv_shader_context *ctx, unsigned driver_location, LLVMTypeRef dest_type,
              LLVMValueRef out[4])
{
   LLVMValueRef t_list_ptr = ac_get_arg(&ctx->ac, ctx->args->ac.vertex_buffers);
   LLVMValueRef t_offset;
   LLVMValueRef t_list;
   LLVMValueRef input;
   LLVMValueRef buffer_index;
   unsigned attrib_index = driver_location - VERT_ATTRIB_GENERIC0;
   unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[attrib_index];
   unsigned data_format = attrib_format & 0x0f;
   unsigned num_format = (attrib_format >> 4) & 0x07;
   bool is_float =
      num_format != V_008F0C_BUF_NUM_FORMAT_UINT && num_format != V_008F0C_BUF_NUM_FORMAT_SINT;
   uint8_t input_usage_mask = ctx->shader_info->vs.input_usage_mask[driver_location];
   unsigned num_input_channels = util_last_bit(input_usage_mask);

   if (ctx->options->key.vs.instance_rate_inputs & (1u << attrib_index)) {
      uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[attrib_index];

      if (divisor) {
         buffer_index = ctx->abi.instance_id;

         if (divisor != 1) {
            buffer_index = LLVMBuildUDiv(ctx->ac.builder, buffer_index,
                                         LLVMConstInt(ctx->ac.i32, divisor, 0), "");
         }
      } else {
         buffer_index = ctx->ac.i32_0;
      }

      buffer_index = LLVMBuildAdd(
         ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.start_instance), buffer_index, "");
   } else {
      buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
                                  ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex), "");
   }

   const struct ac_data_format_info *vtx_info = ac_get_data_format_info(data_format);

   /* Adjust the number of channels to load based on the vertex attribute format. */
   unsigned num_channels = MIN2(num_input_channels, vtx_info->num_channels);
   unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[attrib_index];
   unsigned attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[attrib_index];
   unsigned attrib_stride = ctx->options->key.vs.vertex_attribute_strides[attrib_index];

   unsigned desc_index =
      ctx->shader_info->vs.use_per_attribute_vb_descs ? attrib_index : attrib_binding;
   desc_index = util_bitcount(ctx->shader_info->vs.vb_desc_usage_mask &
                              u_bit_consecutive(0, desc_index));
   t_offset = LLVMConstInt(ctx->ac.i32, desc_index, false);
   t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);

   /* Always split typed vertex buffer loads on GFX6 and GFX10+ to avoid alignment issues that
    * trigger memory violations and eventually a GPU hang. This can happen if the stride (static
    * or dynamic) is unaligned, or if the VBO offset is only scalar-aligned (e.g. stride is 8 and
    * the VBO offset is 2 for R16G16B16A16_SNORM).
    */
   if (ctx->ac.gfx_level == GFX6 || ctx->ac.gfx_level >= GFX10) {
      unsigned chan_format = vtx_info->chan_format;
      LLVMValueRef values[4];

      assert(ctx->ac.gfx_level == GFX6 || ctx->ac.gfx_level >= GFX10);

      for (unsigned chan = 0; chan < num_channels; chan++) {
         unsigned chan_offset = attrib_offset + chan * vtx_info->chan_byte_size;
         LLVMValueRef chan_index = buffer_index;

         if (attrib_stride != 0 && chan_offset > attrib_stride) {
            LLVMValueRef buffer_offset =
               LLVMConstInt(ctx->ac.i32, chan_offset / attrib_stride, false);

            chan_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, buffer_offset, "");

            chan_offset = chan_offset % attrib_stride;
         }

         values[chan] = ac_build_struct_tbuffer_load(
            &ctx->ac, t_list, chan_index, LLVMConstInt(ctx->ac.i32, chan_offset, false),
            ctx->ac.i32_0, 1, chan_format, num_format, 0, true);
      }

      input = ac_build_gather_values(&ctx->ac, values, num_channels);
   } else {
      if (attrib_stride != 0 && attrib_offset > attrib_stride) {
         LLVMValueRef buffer_offset =
            LLVMConstInt(ctx->ac.i32, attrib_offset / attrib_stride, false);

         buffer_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, buffer_offset, "");

         attrib_offset = attrib_offset % attrib_stride;
      }

      input = ac_build_struct_tbuffer_load(
         &ctx->ac, t_list, buffer_index, LLVMConstInt(ctx->ac.i32, attrib_offset, false),
         ctx->ac.i32_0, num_channels, data_format, num_format, 0, true);
   }

   input = radv_fixup_vertex_input_fetches(ctx, input, num_channels, is_float);

   for (unsigned chan = 0; chan < 4; chan++) {
      LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
      out[chan] = LLVMBuildExtractElement(ctx->ac.builder, input, llvm_chan, "");
      if (dest_type == ctx->ac.i16 && is_float) {
         out[chan] = LLVMBuildBitCast(ctx->ac.builder, out[chan], ctx->ac.f32, "");
         out[chan] = LLVMBuildFPTrunc(ctx->ac.builder, out[chan], ctx->ac.f16, "");
      }
   }

   for (unsigned chan = 0; chan < 4; chan++) {
      out[chan] = ac_to_integer(&ctx->ac, out[chan]);
      if (dest_type == ctx->ac.i16 && !is_float)
         out[chan] = LLVMBuildTrunc(ctx->ac.builder, out[chan], ctx->ac.i16, "");
   }
}

static LLVMValueRef
radv_load_vs_inputs(struct ac_shader_abi *abi, unsigned driver_location, unsigned component,
                    unsigned num_components, unsigned vertex_index, LLVMTypeRef type)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
   LLVMValueRef values[4];

   load_vs_input(ctx, driver_location, type, values);

   for (unsigned i = 0; i < 4; i++)
      values[i] = LLVMBuildBitCast(ctx->ac.builder, values[i], type, "");

   return ac_build_varying_gather_values(&ctx->ac, values, num_components, component);
}

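/* Handle the "BC optimize" bit: the hardware may skip computing the centroid
 * barycentrics (e.g. when every quad in the wave is fully covered, centroid
 * equals center) and signals this by setting the sign bit (bit 31) of
 * PRIM_MASK. When a shader mixes center and centroid interpolation, select
 * the center VGPRs in that case so the centroid inputs stay valid.
 */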
static void
prepare_interp_optimize(struct radv_shader_context *ctx, struct nir_shader *nir)
{
   bool uses_center = false;
   bool uses_centroid = false;
   nir_foreach_shader_in_variable (variable, nir) {
      if (glsl_get_base_type(glsl_without_array(variable->type)) != GLSL_TYPE_FLOAT ||
          variable->data.sample)
         continue;

      if (variable->data.centroid)
         uses_centroid = true;
      else
         uses_center = true;
   }

   ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.persp_centroid);
   ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.linear_centroid);

   if (uses_center && uses_centroid) {
      LLVMValueRef sel =
         LLVMBuildICmp(ctx->ac.builder, LLVMIntSLT, ac_get_arg(&ctx->ac, ctx->args->ac.prim_mask),
                       ctx->ac.i32_0, "");
      ctx->abi.persp_centroid =
         LLVMBuildSelect(ctx->ac.builder, sel, ac_get_arg(&ctx->ac, ctx->args->ac.persp_center),
                         ctx->abi.persp_centroid, "");
      ctx->abi.linear_centroid =
         LLVMBuildSelect(ctx->ac.builder, sel, ac_get_arg(&ctx->ac, ctx->args->ac.linear_center),
                         ctx->abi.linear_centroid, "");
   }
}

static void
scan_shader_output_decl(struct radv_shader_context *ctx, struct nir_variable *variable,
                        struct nir_shader *shader, gl_shader_stage stage)
{
   int idx = variable->data.driver_location;
   unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
   uint64_t mask_attribs;

   if (variable->data.compact) {
      unsigned component_count = variable->data.location_frac + glsl_get_length(variable->type);
      attrib_count = (component_count + 3) / 4;
   }

   mask_attribs = ((1ull << attrib_count) - 1) << idx;

   ctx->output_mask |= mask_attribs;
}

/* Initialize arguments for the shader export intrinsic */
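/* For fragment MRT exports, the per-target col_format from the pipeline key
 * selects the packing; as an illustration (not exhaustive):
 *  - SPI_SHADER_32_ABGR exports the four 32-bit values unmodified,
 *  - SPI_SHADER_FP16_ABGR packs pairs with cvt_pkrtz (out[0] = pkrtz(v0, v1),
 *    out[1] = pkrtz(v2, v3)) and sets the COMPR bit,
 *  - SPI_SHADER_ZERO demotes the export to a null export.
 */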
static void
si_llvm_init_export_args(struct radv_shader_context *ctx, LLVMValueRef *values,
                         unsigned enabled_channels, unsigned target, struct ac_export_args *args)
{
   /* Specify the channels that are enabled. */
   args->enabled_channels = enabled_channels;

   /* Specify whether the EXEC mask represents the valid mask */
   args->valid_mask = 0;

   /* Specify whether this is the last export */
   args->done = 0;

   /* Specify the target we are exporting */
   args->target = target;

   args->compr = false;
   args->out[0] = LLVMGetUndef(ctx->ac.f32);
   args->out[1] = LLVMGetUndef(ctx->ac.f32);
   args->out[2] = LLVMGetUndef(ctx->ac.f32);
   args->out[3] = LLVMGetUndef(ctx->ac.f32);

   if (!values)
      return;

   bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2;
   if (ctx->stage == MESA_SHADER_FRAGMENT) {
      unsigned index = target - V_008DFC_SQ_EXP_MRT;
      unsigned col_format = (ctx->options->key.ps.col_format >> (4 * index)) & 0xf;
      bool is_int8 = (ctx->options->key.ps.is_int8 >> index) & 1;
      bool is_int10 = (ctx->options->key.ps.is_int10 >> index) & 1;
      bool enable_mrt_output_nan_fixup = (ctx->options->key.ps.enable_mrt_output_nan_fixup >> index) & 1;

      LLVMValueRef (*packf)(struct ac_llvm_context * ctx, LLVMValueRef args[2]) = NULL;
      LLVMValueRef (*packi)(struct ac_llvm_context * ctx, LLVMValueRef args[2], unsigned bits,
                            bool hi) = NULL;

      switch (col_format) {
      case V_028714_SPI_SHADER_ZERO:
         args->enabled_channels = 0; /* writemask */
         args->target = V_008DFC_SQ_EXP_NULL;
         break;

      case V_028714_SPI_SHADER_32_R:
         args->enabled_channels = 1;
         args->out[0] = values[0];
         break;

      case V_028714_SPI_SHADER_32_GR:
         args->enabled_channels = 0x3;
         args->out[0] = values[0];
         args->out[1] = values[1];
         break;

      case V_028714_SPI_SHADER_32_AR:
         if (ctx->ac.gfx_level >= GFX10) {
            args->enabled_channels = 0x3;
            args->out[0] = values[0];
            args->out[1] = values[3];
         } else {
            args->enabled_channels = 0x9;
            args->out[0] = values[0];
            args->out[3] = values[3];
         }
         break;

      case V_028714_SPI_SHADER_FP16_ABGR:
         args->enabled_channels = 0xf;
         packf = ac_build_cvt_pkrtz_f16;
         if (is_16bit) {
            for (unsigned chan = 0; chan < 4; chan++)
               values[chan] = LLVMBuildFPExt(ctx->ac.builder, values[chan], ctx->ac.f32, "");
         }
         break;

      case V_028714_SPI_SHADER_UNORM16_ABGR:
         args->enabled_channels = 0xf;
         packf = ac_build_cvt_pknorm_u16;
         break;

      case V_028714_SPI_SHADER_SNORM16_ABGR:
         args->enabled_channels = 0xf;
         packf = ac_build_cvt_pknorm_i16;
         break;

      case V_028714_SPI_SHADER_UINT16_ABGR:
         args->enabled_channels = 0xf;
         packi = ac_build_cvt_pk_u16;
         if (is_16bit) {
            for (unsigned chan = 0; chan < 4; chan++)
               values[chan] = LLVMBuildZExt(ctx->ac.builder, ac_to_integer(&ctx->ac, values[chan]),
                                            ctx->ac.i32, "");
         }
         break;

      case V_028714_SPI_SHADER_SINT16_ABGR:
         args->enabled_channels = 0xf;
         packi = ac_build_cvt_pk_i16;
         if (is_16bit) {
            for (unsigned chan = 0; chan < 4; chan++)
               values[chan] = LLVMBuildSExt(ctx->ac.builder, ac_to_integer(&ctx->ac, values[chan]),
                                            ctx->ac.i32, "");
         }
         break;

      default:
      case V_028714_SPI_SHADER_32_ABGR:
         memcpy(&args->out[0], values, sizeof(values[0]) * 4);
         break;
      }

      /* Replace NaN by zero (for 32-bit float formats) to fix game bugs if requested. */
      if (enable_mrt_output_nan_fixup && !is_16bit) {
         for (unsigned i = 0; i < 4; i++) {
            LLVMValueRef class_args[2] = {values[i],
                                          LLVMConstInt(ctx->ac.i32, S_NAN | Q_NAN, false)};
            LLVMValueRef isnan = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f32", ctx->ac.i1,
                                                    class_args, 2, AC_FUNC_ATTR_READNONE);
            values[i] = LLVMBuildSelect(ctx->ac.builder, isnan, ctx->ac.f32_0, values[i], "");
         }
      }

      /* Pack f16 or norm_i16/u16. */
      if (packf) {
         for (unsigned chan = 0; chan < 2; chan++) {
            LLVMValueRef pack_args[2] = {values[2 * chan], values[2 * chan + 1]};
            LLVMValueRef packed;

            packed = packf(&ctx->ac, pack_args);
            args->out[chan] = ac_to_float(&ctx->ac, packed);
         }
         args->compr = 1; /* COMPR flag */
      }

      /* Pack i16/u16. */
      if (packi) {
         for (unsigned chan = 0; chan < 2; chan++) {
            LLVMValueRef pack_args[2] = {ac_to_integer(&ctx->ac, values[2 * chan]),
                                         ac_to_integer(&ctx->ac, values[2 * chan + 1])};
            LLVMValueRef packed;

            packed = packi(&ctx->ac, pack_args, is_int8 ? 8 : is_int10 ? 10 : 16, chan == 1);
            args->out[chan] = ac_to_float(&ctx->ac, packed);
         }
         args->compr = 1; /* COMPR flag */
      }
      return;
   }

   if (is_16bit) {
      for (unsigned chan = 0; chan < 4; chan++) {
         values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i16, "");
         args->out[chan] = LLVMBuildZExt(ctx->ac.builder, values[chan], ctx->ac.i32, "");
      }
   } else
      memcpy(&args->out[0], values, sizeof(values[0]) * 4);

   for (unsigned i = 0; i < 4; ++i)
      args->out[i] = ac_to_float(&ctx->ac, args->out[i]);
}

static void
radv_export_param(struct radv_shader_context *ctx, unsigned index, LLVMValueRef *values,
                  unsigned enabled_channels)
{
   struct ac_export_args args;

   si_llvm_init_export_args(ctx, values, enabled_channels, V_008DFC_SQ_EXP_PARAM + index, &args);
   ac_build_export(&ctx->ac, &args);
}

static LLVMValueRef
radv_load_output(struct radv_shader_context *ctx, unsigned index, unsigned chan)
{
   LLVMValueRef output = ctx->abi.outputs[ac_llvm_reg_index_soa(index, chan)];
   return LLVMBuildLoad(ctx->ac.builder, output, "");
}

static void
radv_emit_stream_output(struct radv_shader_context *ctx, LLVMValueRef const *so_buffers,
                        LLVMValueRef const *so_write_offsets,
                        const struct radv_stream_output *output,
                        struct radv_shader_output_values *shader_out)
{
   unsigned num_comps = util_bitcount(output->component_mask);
   unsigned buf = output->buffer;
   unsigned offset = output->offset;
   unsigned start;
   LLVMValueRef out[4];

   assert(num_comps && num_comps <= 4);
   if (!num_comps || num_comps > 4)
      return;

   /* Get the first component. */
   start = ffs(output->component_mask) - 1;

   /* Load the output as int. */
   for (int i = 0; i < num_comps; i++) {
      out[i] = ac_to_integer(&ctx->ac, shader_out->values[start + i]);
   }

   /* Pack the output. */
   LLVMValueRef vdata = NULL;

   switch (num_comps) {
   case 1: /* as i32 */
      vdata = out[0];
      break;
   case 2: /* as v2i32 */
   case 3: /* as v3i32 */
   case 4: /* as v4i32 */
      vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
      break;
   }

   LLVMValueRef voffset = LLVMBuildAdd(ctx->ac.builder, so_write_offsets[buf],
                                       LLVMConstInt(ctx->ac.i32, offset, 0), "");
   ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf], vdata, NULL, voffset, ctx->ac.i32_0,
                               ac_glc | ac_slc);
}

static void
radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream)
{
   int i;

   /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
   assert(ctx->args->ac.streamout_config.used);
   LLVMValueRef so_vtx_count = ac_build_bfe(
      &ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config),
      LLVMConstInt(ctx->ac.i32, 16, false), LLVMConstInt(ctx->ac.i32, 7, false), false);

   LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

   /* can_emit = tid < so_vtx_count; */
   LLVMValueRef can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, so_vtx_count, "");

   /* Emit the streamout code conditionally. This actually avoids
    * out-of-bounds buffer access. The hw tells us via the SGPR
    * (so_vtx_count) which threads are allowed to emit streamout data.
    */
   ac_build_ifcc(&ctx->ac, can_emit, 6501);
   {
      /* The buffer offset is computed as follows:
       * ByteOffset = streamout_offset[buffer_id]*4 +
       *              (streamout_write_index + thread_id)*stride[buffer_id] +
       *              attrib_offset
       */
      LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_write_index);

      /* Compute (streamout_write_index + thread_id). */
      so_write_index = LLVMBuildAdd(ctx->ac.builder, so_write_index, tid, "");

      /* Load the descriptor and compute the write offset for each
       * enabled buffer.
       */
      LLVMValueRef so_write_offset[4] = {0};
      LLVMValueRef so_buffers[4] = {0};
      LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers);

      for (i = 0; i < 4; i++) {
         uint16_t stride = ctx->shader_info->so.strides[i];

         if (!stride)
            continue;

         LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i, false);

         so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);

         LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_offset[i]);

         so_offset =
            LLVMBuildMul(ctx->ac.builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, false), "");

         so_write_offset[i] = ac_build_imad(
            &ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, stride * 4, false), so_offset);
      }

      /* Write streamout data. */
      for (i = 0; i < ctx->shader_info->so.num_outputs; i++) {
         struct radv_shader_output_values shader_out = {0};
         const struct radv_stream_output *output = &ctx->shader_info->so.outputs[i];

         if (stream != output->stream)
            continue;

         for (int j = 0; j < 4; j++) {
            shader_out.values[j] = radv_load_output(ctx, output->location, j);
         }

         radv_emit_stream_output(ctx, so_buffers, so_write_offset, output, &shader_out);
      }
   }
   ac_build_endif(&ctx->ac, 6501);
}

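/* Emit parameter (attribute) exports for a hardware VS/NGG stage.
 *
 * Only varyings that the fragment shader can actually consume are exported
 * here; position-like slots are handled separately as position exports.
 * vs_output_param_offset maps each varying slot to its PARAM export index.
 */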
static void
radv_build_param_exports(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs,
                         unsigned noutput, const struct radv_vs_output_info *outinfo,
                         bool export_clip_dists)
{
   for (unsigned i = 0; i < noutput; i++) {
      unsigned slot_name = outputs[i].slot_name;
      unsigned usage_mask = outputs[i].usage_mask;

      if (slot_name != VARYING_SLOT_LAYER && slot_name != VARYING_SLOT_PRIMITIVE_ID &&
          slot_name != VARYING_SLOT_VIEWPORT && slot_name != VARYING_SLOT_CLIP_DIST0 &&
          slot_name != VARYING_SLOT_CLIP_DIST1 && slot_name < VARYING_SLOT_VAR0)
         continue;

      if ((slot_name == VARYING_SLOT_CLIP_DIST0 || slot_name == VARYING_SLOT_CLIP_DIST1) &&
          !export_clip_dists)
         continue;

      radv_export_param(ctx, outinfo->vs_output_param_offset[slot_name], outputs[i].values,
                        usage_mask);
   }
}

/* Generate export instructions for hardware VS shader stage or NGG GS stage
 * (position and parameter data only).
 */
static void
radv_llvm_export_vs(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs,
                    unsigned noutput, const struct radv_vs_output_info *outinfo,
                    bool export_clip_dists)
{
   LLVMValueRef psize_value = NULL, layer_value = NULL, viewport_value = NULL;
   LLVMValueRef primitive_shading_rate = NULL;
   struct ac_export_args pos_args[4] = {0};
   unsigned pos_idx, index;
   int i;

   /* Build position exports */
   for (i = 0; i < noutput; i++) {
      switch (outputs[i].slot_name) {
      case VARYING_SLOT_POS:
         si_llvm_init_export_args(ctx, outputs[i].values, 0xf, V_008DFC_SQ_EXP_POS, &pos_args[0]);
         break;
      case VARYING_SLOT_PSIZ:
         psize_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_LAYER:
         layer_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_VIEWPORT:
         viewport_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
         primitive_shading_rate = outputs[i].values[0];
         break;
      case VARYING_SLOT_CLIP_DIST0:
      case VARYING_SLOT_CLIP_DIST1:
         index = 2 + outputs[i].slot_index;
         si_llvm_init_export_args(ctx, outputs[i].values, 0xf, V_008DFC_SQ_EXP_POS + index,
                                  &pos_args[index]);
         break;
      default:
         break;
      }
   }

   /* We need to add the position output manually if it's missing. */
   if (!pos_args[0].out[0]) {
      pos_args[0].enabled_channels = 0xf; /* writemask */
      pos_args[0].valid_mask = 0;         /* EXEC mask */
      pos_args[0].done = 0;               /* last export? */
      pos_args[0].target = V_008DFC_SQ_EXP_POS;
      pos_args[0].compr = 0;              /* COMPR flag */
      pos_args[0].out[0] = ctx->ac.f32_0; /* X */
      pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
      pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
      pos_args[0].out[3] = ctx->ac.f32_1; /* W */
   }
   if (outinfo->writes_pointsize || outinfo->writes_layer ||
       outinfo->writes_viewport_index || outinfo->writes_primitive_shading_rate) {
      pos_args[1].enabled_channels = ((outinfo->writes_pointsize == true ? 1 : 0) |
                                      (outinfo->writes_primitive_shading_rate == true ? 2 : 0) |
                                      (outinfo->writes_layer == true ? 4 : 0));
      pos_args[1].valid_mask = 0;
      pos_args[1].done = 0;
      pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
      pos_args[1].compr = 0;
      pos_args[1].out[0] = ctx->ac.f32_0; /* X */
      pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
      pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
      pos_args[1].out[3] = ctx->ac.f32_0; /* W */

      if (outinfo->writes_pointsize == true)
         pos_args[1].out[0] = psize_value;
      if (outinfo->writes_layer == true)
         pos_args[1].out[2] = layer_value;
      if (outinfo->writes_viewport_index == true) {
         if (ctx->options->gfx_level >= GFX9) {
            /* GFX9 has the layer in out.z[10:0] and the viewport
             * index in out.z[19:16].
             */
            LLVMValueRef v = viewport_value;
            v = ac_to_integer(&ctx->ac, v);
            v = LLVMBuildShl(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 16, false), "");
            v = LLVMBuildOr(ctx->ac.builder, v, ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");

            pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
            pos_args[1].enabled_channels |= 1 << 2;
         } else {
            pos_args[1].out[3] = viewport_value;
            pos_args[1].enabled_channels |= 1 << 3;
         }
      }

      if (outinfo->writes_primitive_shading_rate) {
         pos_args[1].out[1] = primitive_shading_rate;
      }
   }

   /* GFX10 skips POS0 exports if EXEC=0 and DONE=0, causing a hang.
    * Setting valid_mask=1 prevents it and has no other effect.
    */
   if (ctx->ac.gfx_level == GFX10)
      pos_args[0].valid_mask = 1;

   pos_idx = 0;
   for (i = 0; i < 4; i++) {
      if (!pos_args[i].out[0])
         continue;

      /* Specify the target we are exporting */
      pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;

      if (pos_idx == outinfo->pos_exports)
         /* Specify that this is the last export */
         pos_args[i].done = 1;

      ac_build_export(&ctx->ac, &pos_args[i]);
   }

   /* Build parameter exports */
   radv_build_param_exports(ctx, outputs, noutput, outinfo, export_clip_dists);
}

static void
handle_vs_outputs_post(struct radv_shader_context *ctx, bool export_prim_id, bool export_clip_dists,
                       const struct radv_vs_output_info *outinfo)
{
   struct radv_shader_output_values *outputs;
   unsigned noutput = 0;

   if (ctx->shader_info->so.num_outputs && !ctx->args->is_gs_copy_shader &&
       ctx->stage != MESA_SHADER_GEOMETRY) {
      /* The GS copy shader emission already emits streamout. */
      radv_emit_streamout(ctx, 0);
   }

   /* Allocate a temporary array for the output values. */
   unsigned num_outputs = util_bitcount64(ctx->output_mask);
   outputs = malloc(num_outputs * sizeof(outputs[0]));

   for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
      if (!(ctx->output_mask & (1ull << i)))
         continue;

      outputs[noutput].slot_name = i;
      outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;

      if (ctx->stage == MESA_SHADER_VERTEX && !ctx->args->is_gs_copy_shader) {
         outputs[noutput].usage_mask = ctx->shader_info->vs.output_usage_mask[i];
      } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
         outputs[noutput].usage_mask = ctx->shader_info->tes.output_usage_mask[i];
      } else if (ctx->args->is_gs_copy_shader || ctx->stage == MESA_SHADER_GEOMETRY) {
         outputs[noutput].usage_mask = ctx->shader_info->gs.output_usage_mask[i];
      }

      for (unsigned j = 0; j < 4; j++) {
         outputs[noutput].values[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j));
      }

      noutput++;
   }

   radv_llvm_export_vs(ctx, outputs, noutput, outinfo, export_clip_dists);

   free(outputs);
}

static bool
si_export_mrt_color(struct radv_shader_context *ctx, LLVMValueRef *color, unsigned index,
                    struct ac_export_args *args)
{
   /* Export */
   si_llvm_init_export_args(ctx, color, 0xf, V_008DFC_SQ_EXP_MRT + index, args);
   if (!args->enabled_channels)
      return false; /* unnecessary NULL export */

   return true;
}

static void
radv_export_mrt_z(struct radv_shader_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
                  LLVMValueRef samplemask)
{
   struct ac_export_args args;

   ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, NULL, true, &args);

   ac_build_export(&ctx->ac, &args);
}

static void
handle_fs_outputs_post(struct radv_shader_context *ctx)
{
   unsigned index = 0;
   LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
   struct ac_export_args color_args[8];

   for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
      LLVMValueRef values[4];

      if (!(ctx->output_mask & (1ull << i)))
         continue;

      if (i < FRAG_RESULT_DATA0)
         continue;

      for (unsigned j = 0; j < 4; j++)
         values[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j));

      bool ret = si_export_mrt_color(ctx, values, i - FRAG_RESULT_DATA0, &color_args[index]);
      if (ret)
         index++;
   }

   /* Process depth, stencil, samplemask. */
   if (ctx->shader_info->ps.writes_z) {
      depth = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_DEPTH, 0));
   }
   if (ctx->shader_info->ps.writes_stencil) {
      stencil = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_STENCIL, 0));
   }
   if (ctx->shader_info->ps.writes_sample_mask) {
      samplemask = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_SAMPLE_MASK, 0));
   }

   /* Set the DONE bit on last non-null color export only if Z isn't
    * exported.
    */
   if (index > 0 && !ctx->shader_info->ps.writes_z &&
       !ctx->shader_info->ps.writes_stencil &&
       !ctx->shader_info->ps.writes_sample_mask) {
      unsigned last = index - 1;

      color_args[last].valid_mask = 1; /* whether the EXEC mask is valid */
      color_args[last].done = 1;       /* DONE bit */
   }

   /* Export PS outputs. */
   for (unsigned i = 0; i < index; i++)
      ac_build_export(&ctx->ac, &color_args[i]);

   if (depth || stencil || samplemask)
      radv_export_mrt_z(ctx, depth, stencil, samplemask);
   else if (!index)
      ac_build_export_null(&ctx->ac, true);
}

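/* Finish a legacy (non-NGG) GS: on GFX10+ first wait for all outstanding
 * vertex stores to the GSVS ring, then signal GS_DONE so the hardware knows
 * this GS wave has finished emitting.
 */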
static void
emit_gs_epilogue(struct radv_shader_context *ctx)
{
   if (ctx->ac.gfx_level >= GFX10)
      ac_build_waitcnt(&ctx->ac, AC_WAIT_VSTORE);

   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, ctx->gs_wave_id);
}

static void
handle_shader_outputs_post(struct ac_shader_abi *abi)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

   switch (ctx->stage) {
   case MESA_SHADER_VERTEX:
      if (ctx->shader_info->vs.as_ls)
         break; /* Lowered in NIR */
      else if (ctx->shader_info->vs.as_es)
         break; /* Lowered in NIR */
      else if (ctx->shader_info->is_ngg)
         break; /* Lowered in NIR */
      else
         handle_vs_outputs_post(ctx, ctx->shader_info->vs.outinfo.export_prim_id,
                                ctx->shader_info->vs.outinfo.export_clip_dists,
                                &ctx->shader_info->vs.outinfo);
      break;
   case MESA_SHADER_FRAGMENT:
      handle_fs_outputs_post(ctx);
      break;
   case MESA_SHADER_GEOMETRY:
      if (ctx->shader_info->is_ngg)
         break; /* Lowered in NIR */
      else
         emit_gs_epilogue(ctx);
      break;
   case MESA_SHADER_TESS_CTRL:
      break; /* Lowered in NIR */
   case MESA_SHADER_TESS_EVAL:
      if (ctx->shader_info->tes.as_es)
         break; /* Lowered in NIR */
      else if (ctx->shader_info->is_ngg)
         break; /* Lowered in NIR */
      else
         handle_vs_outputs_post(ctx, ctx->shader_info->tes.outinfo.export_prim_id,
                                ctx->shader_info->tes.outinfo.export_clip_dists,
                                &ctx->shader_info->tes.outinfo);
      break;
   default:
      break;
   }
}

static void
ac_llvm_finalize_module(struct radv_shader_context *ctx, LLVMPassManagerRef passmgr)
{
   LLVMRunPassManager(passmgr, ctx->ac.module);
   LLVMDisposeBuilder(ctx->ac.builder);

   ac_llvm_context_dispose(&ctx->ac);
}

static void
radv_llvm_visit_export_vertex(struct ac_shader_abi *abi)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
   const struct radv_vs_output_info *outinfo = ctx->stage == MESA_SHADER_TESS_EVAL
                                                  ? &ctx->shader_info->tes.outinfo
                                                  : &ctx->shader_info->vs.outinfo;

   handle_vs_outputs_post(ctx, false, outinfo->export_clip_dists, outinfo);
}

static void
ac_setup_rings(struct radv_shader_context *ctx)
{
   if (ctx->options->gfx_level <= GFX8 &&
       (ctx->stage == MESA_SHADER_GEOMETRY ||
        (ctx->stage == MESA_SHADER_VERTEX && ctx->shader_info->vs.as_es) ||
        (ctx->stage == MESA_SHADER_TESS_EVAL && ctx->shader_info->tes.as_es))) {
      unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS : RING_ESGS_VS;
      LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, false);

      ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, offset);
   }

   if (ctx->args->is_gs_copy_shader) {
      ctx->gsvs_ring[0] = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
                                                LLVMConstInt(ctx->ac.i32, RING_GSVS_VS, false));
   }

   if (ctx->stage == MESA_SHADER_GEOMETRY) {
      /* The conceptual layout of the GSVS ring is
       *   v0c0 .. vLc0 v0c1 .. vLc1 ..
       * but the real memory layout is swizzled across
       * threads:
       *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
       *   t16v0c0 ..
       * Override the buffer descriptor accordingly.
       */
      LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
      uint64_t stream_offset = 0;
      unsigned num_records = ctx->ac.wave_size;
      LLVMValueRef base_ring;

      base_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
                                        LLVMConstInt(ctx->ac.i32, RING_GSVS_GS, false));

      for (unsigned stream = 0; stream < 4; stream++) {
         unsigned num_components, stride;
         LLVMValueRef ring, tmp;

         num_components = ctx->shader_info->gs.num_stream_output_components[stream];

         if (!num_components)
            continue;

         stride = 4 * num_components * ctx->shader->info.gs.vertices_out;

         /* Limit on the stride field for <= GFX7. */
         assert(stride < (1 << 14));

         ring = LLVMBuildBitCast(ctx->ac.builder, base_ring, v2i64, "");
         tmp = LLVMBuildExtractElement(ctx->ac.builder, ring, ctx->ac.i32_0, "");
         tmp = LLVMBuildAdd(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), "");
         ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp, ctx->ac.i32_0, "");

         stream_offset += stride * ctx->ac.wave_size;

         ring = LLVMBuildBitCast(ctx->ac.builder, ring, ctx->ac.v4i32, "");

         tmp = LLVMBuildExtractElement(ctx->ac.builder, ring, ctx->ac.i32_1, "");
         tmp = LLVMBuildOr(ctx->ac.builder, tmp,
                           LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride), false), "");
         ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp, ctx->ac.i32_1, "");

         ring = LLVMBuildInsertElement(ctx->ac.builder, ring,
                                       LLVMConstInt(ctx->ac.i32, num_records, false),
                                       LLVMConstInt(ctx->ac.i32, 2, false), "");

         ctx->gsvs_ring[stream] = ring;
      }
   }

   if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_TESS_EVAL) {
      ctx->hs_ring_tess_offchip = ac_build_load_to_sgpr(
         &ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_OFFCHIP, false));
      ctx->hs_ring_tess_factor = ac_build_load_to_sgpr(
         &ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_FACTOR, false));
   }
}

/* Fix up the HW not emitting the TCS regs if there are no HS threads. */
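/*
 * This is the GFX9 LS VGPR init bug workaround: when merged_wave_info reports
 * zero HS waves (bits [15:8]), the hardware loads the LS input VGPRs into the
 * positions normally used by other arguments, so the real values are selected
 * from those slots instead.
 */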
static void
ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context *ctx)
{
   LLVMValueRef count =
      ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 8, 8);
   LLVMValueRef hs_empty = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, count, ctx->ac.i32_0, "");
   ctx->abi.instance_id =
      LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id),
                      ctx->abi.instance_id, "");
   ctx->vs_rel_patch_id =
      LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids),
                      ctx->vs_rel_patch_id, "");
   ctx->abi.vertex_id =
      LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.tcs_patch_id),
                      ctx->abi.vertex_id, "");
}

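/* Fetch the GS wave id: for merged ES-GS shaders it lives in bits [23:16] of
 * merged_wave_info, otherwise it is passed as a dedicated SGPR argument.
 */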
static void
prepare_gs_input_vgprs(struct radv_shader_context *ctx, bool merged)
{
   if (merged) {
      ctx->gs_wave_id =
         ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 16, 8);
   } else {
      ctx->gs_wave_id = ac_get_arg(&ctx->ac, ctx->args->ac.gs_wave_id);
   }
}

/* Ensure that the esgs ring is declared.
 *
 * We declare it with 64KB alignment as a hint that the
 * pointer value will always be 0.
 */
static void
declare_esgs_ring(struct radv_shader_context *ctx)
{
   if (ctx->esgs_ring)
      return;

   assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));

   ctx->esgs_ring = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
                                                "esgs_ring", AC_ADDR_SPACE_LDS);
   LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);
   LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
}

static LLVMValueRef
radv_intrinsic_load(struct ac_shader_abi *abi, nir_intrinsic_op op)
{
   struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);

   switch (op) {
   case nir_intrinsic_load_base_vertex:
   case nir_intrinsic_load_first_vertex:
      return radv_load_base_vertex(abi, op == nir_intrinsic_load_base_vertex);
   case nir_intrinsic_load_ring_tess_factors_amd:
      return ctx->hs_ring_tess_factor;
   case nir_intrinsic_load_ring_tess_offchip_amd:
      return ctx->hs_ring_tess_offchip;
   case nir_intrinsic_load_ring_esgs_amd:
      return ctx->esgs_ring;
   default:
      return NULL;
   }
}

static LLVMModuleRef
ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
                         const struct radv_nir_compiler_options *options,
                         const struct radv_shader_info *info,
                         struct nir_shader *const *shaders, int shader_count,
                         const struct radv_shader_args *args)
{
   struct radv_shader_context ctx = {0};
   ctx.args = args;
   ctx.options = options;
   ctx.shader_info = info;

   enum ac_float_mode float_mode = AC_FLOAT_MODE_DEFAULT;

   if (shaders[0]->info.float_controls_execution_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) {
      float_mode = AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO;
   }

   ac_llvm_context_init(&ctx.ac, ac_llvm, options->gfx_level, options->family,
                        options->has_3d_cube_border_color_mipmap,
                        float_mode, info->wave_size, info->ballot_bit_size);
   ctx.context = ctx.ac.context;

   ctx.max_workgroup_size = info->workgroup_size;

   create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2);

   ctx.abi.intrinsic_load = radv_intrinsic_load;
   ctx.abi.load_ubo = radv_load_ubo;
   ctx.abi.load_ssbo = radv_load_ssbo;
   ctx.abi.load_sampler_desc = radv_get_sampler_desc;
   ctx.abi.clamp_shadow_reference = false;
   ctx.abi.robust_buffer_access = options->robust_buffer_access;
   ctx.abi.load_grid_size_from_user_sgpr = args->load_grid_size_from_user_sgpr;

   bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && info->is_ngg;
   if (shader_count >= 2 || is_ngg)
      ac_init_exec_full_mask(&ctx.ac);

   if (args->ac.vertex_id.used)
      ctx.abi.vertex_id = ac_get_arg(&ctx.ac, args->ac.vertex_id);
   if (args->ac.vs_rel_patch_id.used)
      ctx.vs_rel_patch_id = ac_get_arg(&ctx.ac, args->ac.vs_rel_patch_id);
   if (args->ac.instance_id.used)
      ctx.abi.instance_id = ac_get_arg(&ctx.ac, args->ac.instance_id);

   if (options->has_ls_vgpr_init_bug &&
       shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL)
      ac_nir_fixup_ls_hs_input_vgprs(&ctx);

   if (is_ngg) {
      ctx.abi.export_vertex = radv_llvm_visit_export_vertex;

      if (!info->is_ngg_passthrough)
         declare_esgs_ring(&ctx);

      if (ctx.stage == MESA_SHADER_GEOMETRY) {
         /* Scratch space used by NGG GS for repacking vertices at the end. */
         LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, 8);
         LLVMValueRef gs_ngg_scratch =
            LLVMAddGlobalInAddressSpace(ctx.ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
         LLVMSetInitializer(gs_ngg_scratch, LLVMGetUndef(ai32));
         LLVMSetLinkage(gs_ngg_scratch, LLVMExternalLinkage);
         LLVMSetAlignment(gs_ngg_scratch, 4);

         /* Vertex emit space used by NGG GS for storing all vertex attributes. */
         LLVMValueRef gs_ngg_emit = LLVMAddGlobalInAddressSpace(
            ctx.ac.module, LLVMArrayType(ctx.ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
         LLVMSetLinkage(gs_ngg_emit, LLVMExternalLinkage);
         LLVMSetAlignment(gs_ngg_emit, 4);
      }

      /* GFX10 hang workaround - there needs to be an s_barrier before gs_alloc_req always */
      if (ctx.ac.gfx_level == GFX10 && shader_count == 1)
         ac_build_s_barrier(&ctx.ac, shaders[0]->info.stage);
   }

   for (int shader_idx = 0; shader_idx < shader_count; ++shader_idx) {
      ctx.stage = shaders[shader_idx]->info.stage;
      ctx.shader = shaders[shader_idx];
      ctx.output_mask = 0;

      if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY && !ctx.shader_info->is_ngg) {
         ctx.abi.emit_vertex_with_counter = visit_emit_vertex_with_counter;
         ctx.abi.emit_primitive = visit_end_primitive;
      } else if (shaders[shader_idx]->info.stage == MESA_SHADER_TESS_EVAL) {
      } else if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX) {
         ctx.abi.load_inputs = radv_load_vs_inputs;
      } else if (shaders[shader_idx]->info.stage == MESA_SHADER_FRAGMENT) {
         ctx.abi.load_sample_position = load_sample_position;
      }

      if (shader_idx && !(shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY && info->is_ngg)) {
         /* Execute a barrier before the second shader in
          * a merged shader.
          *
          * Execute the barrier inside the conditional block,
          * so that empty waves can jump directly to s_endpgm,
          * which will also signal the barrier.
          *
          * This is possible in gfx9, because an empty wave
          * for the second shader does not participate in
          * the epilogue. With NGG, empty waves may still
          * be required to export data (e.g. GS output vertices),
          * so we cannot let them exit early.
          *
          * If the shader is TCS and the TCS epilog is present
          * and contains a barrier, it will wait there and then
          * reach s_endpgm.
          */
         ac_build_waitcnt(&ctx.ac, AC_WAIT_LGKM);
         ac_build_s_barrier(&ctx.ac, shaders[shader_idx]->info.stage);
      }

      nir_foreach_shader_out_variable(variable, shaders[shader_idx]) scan_shader_output_decl(
         &ctx, variable, shaders[shader_idx], shaders[shader_idx]->info.stage);

      ac_setup_rings(&ctx);

1478 bool check_merged_wave_info = shader_count >= 2 && !(is_ngg && shader_idx == 1);
1479 LLVMBasicBlockRef merge_block = NULL;
1480
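      /* merged_wave_info holds one byte per merged stage with that stage's
       * thread count; threads at or beyond this stage's count branch
       * straight to the merge block and skip the stage's body. */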
      if (check_merged_wave_info) {
         LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
         LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
         merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");

         LLVMValueRef count = ac_unpack_param(
            &ctx.ac, ac_get_arg(&ctx.ac, args->ac.merged_wave_info), 8 * shader_idx, 8);
         LLVMValueRef thread_id = ac_get_thread_id(&ctx.ac);
         LLVMValueRef cond = LLVMBuildICmp(ctx.ac.builder, LLVMIntULT, thread_id, count, "");
         LLVMBuildCondBr(ctx.ac.builder, cond, then_block, merge_block);

         LLVMPositionBuilderAtEnd(ctx.ac.builder, then_block);
      }

      if (shaders[shader_idx]->info.stage == MESA_SHADER_FRAGMENT)
         prepare_interp_optimize(&ctx, shaders[shader_idx]);
      else if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY && !info->is_ngg)
         prepare_gs_input_vgprs(&ctx, shader_count >= 2);

      ac_nir_translate(&ctx.ac, &ctx.abi, &args->ac, shaders[shader_idx]);

      if (!gl_shader_stage_is_compute(shaders[shader_idx]->info.stage))
         handle_shader_outputs_post(&ctx.abi);

      if (check_merged_wave_info) {
         LLVMBuildBr(ctx.ac.builder, merge_block);
         LLVMPositionBuilderAtEnd(ctx.ac.builder, merge_block);
      }
   }

   LLVMBuildRetVoid(ctx.ac.builder);

   if (options->dump_preoptir) {
      fprintf(stderr, "%s LLVM IR:\n\n",
              radv_get_shader_name(info, shaders[shader_count - 1]->info.stage));
      ac_dump_module(ctx.ac.module);
      fprintf(stderr, "\n");
   }

   ac_llvm_finalize_module(&ctx, ac_llvm->passmgr);

   return ctx.ac.module;
}

static void
ac_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
{
   unsigned *retval = (unsigned *)context;
   LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
   char *description = LLVMGetDiagInfoDescription(di);

   if (severity == LLVMDSError) {
      *retval = 1;
      fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n", description);
   }

   LLVMDisposeMessage(description);
}

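/* Returns non-zero on failure. LLVM errors are reported through the
 * diagnostic handler installed on the module's context, and a failed
 * radv_compile_to_elf is folded into the same return value. */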
static unsigned
radv_llvm_compile(LLVMModuleRef M, char **pelf_buffer, size_t *pelf_size,
                  struct ac_llvm_compiler *ac_llvm)
{
   unsigned retval = 0;
   LLVMContextRef llvm_ctx;

   /* Set up the diagnostic handler. */
   llvm_ctx = LLVMGetModuleContext(M);

   LLVMContextSetDiagnosticHandler(llvm_ctx, ac_diagnostic_handler, &retval);

   /* Compile the IR. */
   if (!radv_compile_to_elf(ac_llvm, M, pelf_buffer, pelf_size))
      retval = 1;
   return retval;
}

static void
ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm, LLVMModuleRef llvm_module,
                       struct radv_shader_binary **rbinary, gl_shader_stage stage, const char *name,
                       const struct radv_nir_compiler_options *options)
{
   char *elf_buffer = NULL;
   size_t elf_size = 0;
   char *llvm_ir_string = NULL;

   if (options->dump_shader) {
      fprintf(stderr, "%s LLVM IR:\n\n", name);
      ac_dump_module(llvm_module);
      fprintf(stderr, "\n");
   }

   if (options->record_ir) {
      char *llvm_ir = LLVMPrintModuleToString(llvm_module);
      llvm_ir_string = strdup(llvm_ir);
      LLVMDisposeMessage(llvm_ir);
   }

   int v = radv_llvm_compile(llvm_module, &elf_buffer, &elf_size, ac_llvm);
   if (v) {
      fprintf(stderr, "compile failed\n");
   }

   LLVMContextRef ctx = LLVMGetModuleContext(llvm_module);
   LLVMDisposeModule(llvm_module);
   LLVMContextDispose(ctx);

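   /* The RTLD binary layout is the ELF image followed by the NUL-terminated
    * LLVM IR string (when recorded). */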
   size_t llvm_ir_size = llvm_ir_string ? strlen(llvm_ir_string) : 0;
   size_t alloc_size = sizeof(struct radv_shader_binary_rtld) + elf_size + llvm_ir_size + 1;
   struct radv_shader_binary_rtld *rbin = calloc(1, alloc_size);
   memcpy(rbin->data, elf_buffer, elf_size);
   if (llvm_ir_string)
      memcpy(rbin->data + elf_size, llvm_ir_string, llvm_ir_size + 1);

   rbin->base.type = RADV_BINARY_TYPE_RTLD;
   rbin->base.stage = stage;
   rbin->base.total_size = alloc_size;
   rbin->elf_size = elf_size;
   rbin->llvm_ir_size = llvm_ir_size;
   *rbinary = &rbin->base;

   free(llvm_ir_string);
   free(elf_buffer);
}

static void
radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm,
                        const struct radv_nir_compiler_options *options,
                        const struct radv_shader_info *info,
                        struct radv_shader_binary **rbinary,
                        const struct radv_shader_args *args, struct nir_shader *const *nir,
                        int nir_count)
{
   LLVMModuleRef llvm_module;

   llvm_module = ac_translate_nir_to_llvm(ac_llvm, options, info, nir, nir_count, args);

   ac_compile_llvm_module(ac_llvm, llvm_module, rbinary, nir[nir_count - 1]->info.stage,
                          radv_get_shader_name(info, nir[nir_count - 1]->info.stage),
                          options);
}

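/* The GS copy shader runs on the hardware VS stage: it fetches the selected
 * vertex stream back from the GSVS ring and performs the exports (and
 * streamout) on behalf of a legacy, non-NGG geometry shader. */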
static void
ac_gs_copy_shader_emit(struct radv_shader_context *ctx)
{
   LLVMValueRef vtx_offset =
      LLVMBuildMul(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id),
                   LLVMConstInt(ctx->ac.i32, 4, false), "");
   LLVMValueRef stream_id;

   /* Fetch the vertex stream ID. */
   if (ctx->shader_info->so.num_outputs) {
      stream_id =
         ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config), 24, 2);
   } else {
      stream_id = ctx->ac.i32_0;
   }

   LLVMBasicBlockRef end_bb;
   LLVMValueRef switch_inst;

   end_bb = LLVMAppendBasicBlockInContext(ctx->ac.context, ctx->main_function, "end");
   switch_inst = LLVMBuildSwitch(ctx->ac.builder, stream_id, end_bb, 4);

   for (unsigned stream = 0; stream < 4; stream++) {
      unsigned num_components = ctx->shader_info->gs.num_stream_output_components[stream];
      LLVMBasicBlockRef bb;
      unsigned offset;

      if (stream > 0 && !num_components)
         continue;

      if (stream > 0 && !ctx->shader_info->so.num_outputs)
         continue;

      bb = LLVMInsertBasicBlockInContext(ctx->ac.context, end_bb, "out");
      LLVMAddCase(switch_inst, LLVMConstInt(ctx->ac.i32, stream, 0), bb);
      LLVMPositionBuilderAtEnd(ctx->ac.builder, bb);

      offset = 0;
      for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
         unsigned output_usage_mask = ctx->shader_info->gs.output_usage_mask[i];
         unsigned output_stream = ctx->shader_info->gs.output_streams[i];
         int length = util_last_bit(output_usage_mask);

         if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)
            continue;

         for (unsigned j = 0; j < length; j++) {
            LLVMValueRef value, soffset;

            if (!(output_usage_mask & (1 << j)))
               continue;

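            /* Each enabled scalar component occupies a contiguous region of
             * gs.vertices_out * 16 * 4 bytes in the GSVS ring (see the
             * soffset computation below); `offset` counts the components
             * emitted so far. */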
            soffset = LLVMConstInt(ctx->ac.i32, offset * ctx->shader->info.gs.vertices_out * 16 * 4,
                                   false);

            offset++;

            value = ac_build_buffer_load(&ctx->ac, ctx->gsvs_ring[0], 1, ctx->ac.i32_0, vtx_offset,
                                         soffset, ctx->ac.f32, ac_glc | ac_slc, true, false);

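            /* The ring stores everything as 32-bit values; 16-bit outputs
             * need a bitcast and truncation after the load. */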
            LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
            if (ac_get_type_size(type) == 2) {
               value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->ac.i32, "");
               value = LLVMBuildTrunc(ctx->ac.builder, value, ctx->ac.i16, "");
            }

            LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, value),
                           ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
         }
      }

      if (ctx->shader_info->so.num_outputs)
         radv_emit_streamout(ctx, stream);

      if (stream == 0) {
         handle_vs_outputs_post(ctx, false, ctx->shader_info->vs.outinfo.export_clip_dists,
                                &ctx->shader_info->vs.outinfo);
      }

      LLVMBuildBr(ctx->ac.builder, end_bb);
   }

   LLVMPositionBuilderAtEnd(ctx->ac.builder, end_bb);
}

static void
radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm,
                            const struct radv_nir_compiler_options *options,
                            const struct radv_shader_info *info,
                            struct nir_shader *geom_shader,
                            struct radv_shader_binary **rbinary,
                            const struct radv_shader_args *args)
{
   struct radv_shader_context ctx = {0};
   ctx.args = args;
   ctx.options = options;
   ctx.shader_info = info;

   assert(args->is_gs_copy_shader);

   ac_llvm_context_init(&ctx.ac, ac_llvm, options->gfx_level, options->family,
                        options->has_3d_cube_border_color_mipmap,
                        AC_FLOAT_MODE_DEFAULT, 64, 64);
   ctx.context = ctx.ac.context;

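   /* The copy shader is compiled as if it were a vertex shader, since it
    * executes on the hardware VS stage. */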
   ctx.stage = MESA_SHADER_VERTEX;
   ctx.shader = geom_shader;

   create_function(&ctx, MESA_SHADER_VERTEX, false);

   ac_setup_rings(&ctx);

   nir_foreach_shader_out_variable(variable, geom_shader)
   {
      scan_shader_output_decl(&ctx, variable, geom_shader, MESA_SHADER_VERTEX);
      ac_handle_shader_output_decl(&ctx.ac, &ctx.abi, geom_shader, variable, MESA_SHADER_VERTEX);
   }

   ac_gs_copy_shader_emit(&ctx);

   LLVMBuildRetVoid(ctx.ac.builder);

   ac_llvm_finalize_module(&ctx, ac_llvm->passmgr);

   ac_compile_llvm_module(ac_llvm, ctx.ac.module, rbinary, MESA_SHADER_VERTEX, "GS Copy Shader",
                          options);
   (*rbinary)->is_gs_copy_shader = true;
}

void
llvm_compile_shader(const struct radv_nir_compiler_options *options,
                    const struct radv_shader_info *info, unsigned shader_count,
                    struct nir_shader *const *shaders, struct radv_shader_binary **binary,
                    const struct radv_shader_args *args)
{
   enum ac_target_machine_options tm_options = 0;
   struct ac_llvm_compiler ac_llvm;

   tm_options |= AC_TM_SUPPORTS_SPILL;
   if (options->check_ir)
      tm_options |= AC_TM_CHECK_IR;

   radv_init_llvm_compiler(&ac_llvm, options->family, tm_options, info->wave_size);

   if (args->is_gs_copy_shader) {
      radv_compile_gs_copy_shader(&ac_llvm, options, info, *shaders, binary, args);
   } else {
      radv_compile_nir_shader(&ac_llvm, options, info, binary, args, shaders, shader_count);
   }
}
