/*
 * Copyright 2016 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "ac_nir.h"
#include "ac_nir_to_llvm.h"
#include "ac_rtld.h"
#include "si_pipe.h"
#include "si_shader_internal.h"
#include "si_shader_llvm.h"
#include "sid.h"
#include "util/u_memory.h"
#include "util/u_prim.h"

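/* State shared with si_diagnostic_handler: the debug callback to report to and
 * a return value that becomes non-zero when LLVM emits an error.
 */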
struct si_llvm_diagnostics {
   struct util_debug_callback *debug;
   unsigned retval;
};

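/* Diagnostic handler registered on the LLVM context; it forwards warnings and
 * errors to the util_debug_callback and records errors in diag->retval.
 */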
static void si_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
{
   struct si_llvm_diagnostics *diag = (struct si_llvm_diagnostics *)context;
   LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
   const char *severity_str = NULL;

   switch (severity) {
   case LLVMDSError:
      severity_str = "error";
      break;
   case LLVMDSWarning:
      severity_str = "warning";
      break;
   case LLVMDSRemark:
   case LLVMDSNote:
   default:
      return;
   }

   char *description = LLVMGetDiagInfoDescription(di);

   util_debug_message(diag->debug, SHADER_INFO, "LLVM diagnostic (%s): %s", severity_str,
                      description);

   if (severity == LLVMDSError) {
      diag->retval = 1;
      fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n", description);
   }

   LLVMDisposeMessage(description);
}

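/* Compile the LLVM module to an ELF binary: dump/record the IR if requested,
 * run the compiler passes (or the less-optimized pass set), and read the
 * shader config registers back from the binary via ac_rtld.
 */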
bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary,
                     struct ac_shader_config *conf, struct ac_llvm_compiler *compiler,
                     struct ac_llvm_context *ac, struct util_debug_callback *debug,
                     gl_shader_stage stage, const char *name, bool less_optimized)
{
   unsigned count = p_atomic_inc_return(&sscreen->num_compilations);

   if (si_can_dump_shader(sscreen, stage, SI_DUMP_LLVM_IR)) {
      fprintf(stderr, "radeonsi: Compiling shader %d\n", count);

      fprintf(stderr, "%s LLVM IR:\n\n", name);
      ac_dump_module(ac->module);
      fprintf(stderr, "\n");
   }

   if (sscreen->record_llvm_ir) {
      char *ir = LLVMPrintModuleToString(ac->module);
      binary->llvm_ir_string = strdup(ir);
      LLVMDisposeMessage(ir);
   }

   if (!si_replace_shader(count, binary)) {
      struct ac_compiler_passes *passes = compiler->passes;

      if (less_optimized && compiler->low_opt_passes)
         passes = compiler->low_opt_passes;

      struct si_llvm_diagnostics diag = {debug};
      LLVMContextSetDiagnosticHandler(ac->context, si_diagnostic_handler, &diag);

      if (!ac_compile_module_to_elf(passes, ac->module, (char **)&binary->code_buffer,
                                    &binary->code_size))
         diag.retval = 1;

      if (diag.retval != 0) {
         util_debug_message(debug, SHADER_INFO, "LLVM compilation failed");
         return false;
      }

      binary->type = SI_SHADER_BINARY_ELF;
   }

   struct ac_rtld_binary rtld;
   if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){
                        .info = &sscreen->info,
                        .shader_type = stage,
                        .wave_size = ac->wave_size,
                        .num_parts = 1,
                        .elf_ptrs = &binary->code_buffer,
                        .elf_sizes = &binary->code_size}))
      return false;

   bool ok = ac_rtld_read_config(&sscreen->info, &rtld, conf);
   ac_rtld_close(&rtld);
   return ok;
}

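/* Initialize the per-compile shader context and its embedded ac_llvm_context. */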
void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen,
                          struct ac_llvm_compiler *compiler, unsigned wave_size,
                          bool exports_color_null, bool exports_mrtz,
                          enum ac_float_mode float_mode)
{
   memset(ctx, 0, sizeof(*ctx));
   ctx->screen = sscreen;
   ctx->compiler = compiler;

   ac_llvm_context_init(&ctx->ac, compiler, &sscreen->info, float_mode,
                        wave_size, 64, exports_color_null, exports_mrtz);
}

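/* Create the shader entry point: build the return struct type, pick the AMDGPU
 * calling convention of the hardware stage that will actually run (merged
 * stages map to HS/GS on GFX9+), and set common function attributes.
 */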
void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types,
                         unsigned num_return_elems, unsigned max_workgroup_size)
{
   LLVMTypeRef ret_type;
   enum ac_llvm_calling_convention call_conv;

   if (num_return_elems)
      ret_type = LLVMStructTypeInContext(ctx->ac.context, return_types, num_return_elems, true);
   else
      ret_type = ctx->ac.voidt;

   gl_shader_stage real_stage = ctx->stage;

   /* LS is merged into HS (TCS), and ES is merged into GS. */
   if (ctx->screen->info.gfx_level >= GFX9 && ctx->stage <= MESA_SHADER_GEOMETRY) {
      if (ctx->shader->key.ge.as_ls)
         real_stage = MESA_SHADER_TESS_CTRL;
      else if (ctx->shader->key.ge.as_es || ctx->shader->key.ge.as_ngg)
         real_stage = MESA_SHADER_GEOMETRY;
   }

   switch (real_stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      call_conv = AC_LLVM_AMDGPU_VS;
      break;
   case MESA_SHADER_TESS_CTRL:
      call_conv = AC_LLVM_AMDGPU_HS;
      break;
   case MESA_SHADER_GEOMETRY:
      call_conv = AC_LLVM_AMDGPU_GS;
      break;
   case MESA_SHADER_FRAGMENT:
      call_conv = AC_LLVM_AMDGPU_PS;
      break;
   case MESA_SHADER_COMPUTE:
      call_conv = AC_LLVM_AMDGPU_CS;
      break;
   default:
      unreachable("Unhandled shader type");
   }

   /* Set up the function. */
   ctx->return_type = ret_type;
   ctx->main_fn = ac_build_main(&ctx->args->ac, &ctx->ac, call_conv, name, ret_type, ctx->ac.module);
   ctx->return_value = LLVMGetUndef(ctx->return_type);

   if (ctx->screen->info.address32_hi) {
      ac_llvm_add_target_dep_function_attr(ctx->main_fn.value, "amdgpu-32bit-address-high-bits",
                                           ctx->screen->info.address32_hi);
   }

   if (ctx->stage <= MESA_SHADER_GEOMETRY && ctx->shader->key.ge.as_ngg &&
       si_shader_uses_streamout(ctx->shader))
      ac_llvm_add_target_dep_function_attr(ctx->main_fn.value, "amdgpu-gds-size", 256);

   ac_llvm_set_workgroup_size(ctx->main_fn.value, max_workgroup_size);
   ac_llvm_set_target_features(ctx->main_fn.value, &ctx->ac, false);
}

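/* Build the "main" function: returned SGPRs are declared as i32 and returned
 * VGPRs as f32, followed by stage-specific setup of ABI inputs and LDS.
 */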
void si_llvm_create_main_func(struct si_shader_context *ctx)
{
   struct si_shader *shader = ctx->shader;
   LLVMTypeRef returns[AC_MAX_ARGS];
   unsigned i;

   for (i = 0; i < ctx->args->ac.num_sgprs_returned; i++)
      returns[i] = ctx->ac.i32; /* SGPR */
   for (; i < ctx->args->ac.return_count; i++)
      returns[i] = ctx->ac.f32; /* VGPR */

   si_llvm_create_func(ctx, "main", returns, ctx->args->ac.return_count,
                       si_get_max_workgroup_size(shader));

   /* Reserve register locations for VGPR inputs the PS prolog may need. */
   if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
      ac_llvm_add_target_dep_function_attr(
         ctx->main_fn.value, "InitialPSInputAddr", SI_SPI_PS_INPUT_ADDR_FOR_PROLOG);
   }

   if (ctx->stage <= MESA_SHADER_GEOMETRY &&
       (shader->key.ge.as_ls || ctx->stage == MESA_SHADER_TESS_CTRL)) {
      /* The LS-HS size is not known until draw time, so we append it
       * at the end of whatever LDS use there may be in the rest of
       * the shader (currently none, unless LLVM decides to do its
       * own LDS-based lowering).
       */
      ctx->ac.lds = (struct ac_llvm_pointer) {
         .value = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
                                              "__lds_end", AC_ADDR_SPACE_LDS),
         .pointee_type = LLVMArrayType(ctx->ac.i32, 0)
      };
      LLVMSetAlignment(ctx->ac.lds.value, 256);
   }

   if (ctx->stage == MESA_SHADER_VERTEX) {
      ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id);
      ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args->ac.instance_id);
      if (ctx->args->ac.vs_rel_patch_id.used)
         ctx->abi.vs_rel_patch_id = ac_get_arg(&ctx->ac, ctx->args->ac.vs_rel_patch_id);

      /* Apply the LS-HS input VGPR hardware bug workaround. */
      if (shader->key.ge.as_ls && ctx->screen->info.has_ls_vgpr_init_bug)
         ac_fixup_ls_hs_input_vgprs(&ctx->ac, &ctx->abi, &ctx->args->ac);
   }
}

void si_llvm_optimize_module(struct si_shader_context *ctx)
{
   /* Dump LLVM IR before any optimization passes. */
   if (si_can_dump_shader(ctx->screen, ctx->stage, SI_DUMP_INIT_LLVM_IR))
      ac_dump_module(ctx->ac.module);

   /* Run the pass manager. */
   LLVMRunPassManager(ctx->compiler->passmgr, ctx->ac.module);
   LLVMDisposeBuilder(ctx->ac.builder);
}

void si_llvm_dispose(struct si_shader_context *ctx)
{
   LLVMDisposeModule(ctx->ac.module);
   LLVMContextDispose(ctx->ac.context);
   ac_llvm_context_dispose(&ctx->ac);
}

/**
 * Load a dword from a constant buffer.
 */
LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource,
                                  LLVMValueRef offset)
{
   return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, ctx->ac.f32,
                               0, true, true);
}

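/* Emit the function return; void-typed values become "ret void". */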
void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
{
   if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
      LLVMBuildRetVoid(ctx->ac.builder);
   else
      LLVMBuildRet(ctx->ac.builder, ret);
}

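/* The si_insert_input_* helpers below insert an input argument into the return
 * value struct, converting it to float or casting a pointer to i32 as needed.
 */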
LLVMValueRef si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
                                 struct ac_arg param, unsigned return_index)
{
   return LLVMBuildInsertValue(ctx->ac.builder, ret, ac_get_arg(&ctx->ac, param), return_index, "");
}

LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
                                       struct ac_arg param, unsigned return_index)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef p = ac_get_arg(&ctx->ac, param);

   return LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, p), return_index, "");
}

LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret,
                                 struct ac_arg param, unsigned return_index)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef ptr = ac_get_arg(&ctx->ac, param);
   ptr = LLVMBuildPtrToInt(builder, ptr, ctx->ac.i32, "");
   return LLVMBuildInsertValue(builder, ret, ptr, return_index, "");
}

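/* Load a v4i32 descriptor from the given slot of the internal bindings list,
 * whose address arrives as an i32 SGPR input in shader prologs.
 */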
LLVMValueRef si_prolog_get_internal_binding_slot(struct si_shader_context *ctx, unsigned slot)
{
   LLVMValueRef list = LLVMBuildIntToPtr(
      ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->internal_bindings),
      ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
   LLVMValueRef index = LLVMConstInt(ctx->ac.i32, slot, 0);

   return ac_build_load_to_sgpr(&ctx->ac,
                                (struct ac_llvm_pointer) { .t = ctx->ac.v4i32, .v = list },
                                index);
}

/* Ensure that the esgs ring is declared.
 *
 * We declare it with 64KB alignment as a hint that the
 * pointer value will always be 0.
 */
static void si_llvm_declare_lds_esgs_ring(struct si_shader_context *ctx)
{
   if (ctx->ac.lds.value)
      return;

   assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));

   LLVMValueRef esgs_ring =
      LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
                                  "esgs_ring", AC_ADDR_SPACE_LDS);
   LLVMSetLinkage(esgs_ring, LLVMExternalLinkage);
   LLVMSetAlignment(esgs_ring, 64 * 1024);

   ctx->ac.lds.value = esgs_ring;
   ctx->ac.lds.pointee_type = ctx->ac.i32;
}

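/* Set EXEC to the thread count stored in a bitfield of an SGPR argument
 * (starting at "bitoffset") via llvm.amdgcn.init.exec.from.input.
 */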
static void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param,
                                    unsigned bitoffset)
{
   LLVMValueRef args[] = {
      ac_get_arg(&ctx->ac, param),
      LLVMConstInt(ctx->ac.i32, bitoffset, 0),
   };
   ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.init.exec.from.input", ctx->ac.voidt, args, 2, 0);
}

/**
 * Get the value of a shader input parameter and extract a bitfield.
 */
static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx, LLVMValueRef value,
                                      unsigned rshift, unsigned bitwidth)
{
   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
      value = ac_to_integer(&ctx->ac, value);

   if (rshift)
      value = LLVMBuildLShr(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, rshift, 0), "");

   if (rshift + bitwidth < 32) {
      unsigned mask = (1 << bitwidth) - 1;
      value = LLVMBuildAnd(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, mask, 0), "");
   }

   return value;
}

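/* Same as unpack_llvm_param, but on an ABI argument. For example, rshift=8 and
 * bitwidth=5 return (param >> 8) & 0x1f.
 */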
LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift,
                             unsigned bitwidth)
{
   LLVMValueRef value = ac_get_arg(&ctx->ac, param);

   return unpack_llvm_param(ctx, value, rshift, bitwidth);
}

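/* Declare the LDS block used by a compute shader, sized from the NIR-declared
 * shared memory.
 */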
static void si_llvm_declare_compute_memory(struct si_shader_context *ctx)
{
   struct si_shader_selector *sel = ctx->shader->selector;
   unsigned lds_size = sel->info.base.shared_size;

   LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);
   LLVMValueRef var;

   assert(!ctx->ac.lds.value);

   LLVMTypeRef type = LLVMArrayType(ctx->ac.i8, lds_size);
   var = LLVMAddGlobalInAddressSpace(ctx->ac.module, type,
                                     "compute_lds", AC_ADDR_SPACE_LDS);
   LLVMSetAlignment(var, 64 * 1024);

   ctx->ac.lds = (struct ac_llvm_pointer) {
      .value = LLVMBuildBitCast(ctx->ac.builder, var, i8p, ""),
      .pointee_type = type,
   };
}

/**
 * Given two parts (LS/HS or ES/GS) of a merged shader, build a wrapper function that
 * runs them in sequence to form a monolithic shader.
 */
static void si_build_wrapper_function(struct si_shader_context *ctx,
                                      struct ac_llvm_pointer parts[2],
                                      bool same_thread_count)
{
   LLVMBuilderRef builder = ctx->ac.builder;

   for (unsigned i = 0; i < 2; ++i) {
      ac_add_function_attr(ctx->ac.context, parts[i].value, -1, "alwaysinline");
      LLVMSetLinkage(parts[i].value, LLVMPrivateLinkage);
   }

   si_llvm_create_func(ctx, "wrapper", NULL, 0, si_get_max_workgroup_size(ctx->shader));

   if (same_thread_count) {
      si_init_exec_from_input(ctx, ctx->args->ac.merged_wave_info, 0);
   } else {
      ac_init_exec_full_mask(&ctx->ac);

      LLVMValueRef count = ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info);
      count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), "");

      LLVMValueRef ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, "");
      ac_build_ifcc(&ctx->ac, ena, 6506);
   }

   LLVMValueRef params[AC_MAX_ARGS];
   unsigned num_params = LLVMCountParams(ctx->main_fn.value);
   LLVMGetParams(ctx->main_fn.value, params);

   /* The wrapper function has the same parameters as the first part. */
   LLVMValueRef ret =
      ac_build_call(&ctx->ac, parts[0].pointee_type, parts[0].value, params, num_params);

   if (same_thread_count) {
      LLVMTypeRef type = LLVMTypeOf(ret);
      assert(LLVMGetTypeKind(type) == LLVMStructTypeKind);

      /* The output of the first part is the input of the second part. */
      num_params = LLVMCountStructElementTypes(type);
      assert(num_params == LLVMCountParams(parts[1].value));

      for (unsigned i = 0; i < num_params; i++) {
         params[i] = LLVMBuildExtractValue(builder, ret, i, "");

         /* Convert the return value to the same type as the next shader's input param. */
         LLVMTypeRef ret_type = LLVMTypeOf(params[i]);
         LLVMTypeRef param_type = LLVMTypeOf(LLVMGetParam(parts[1].value, i));
         assert(ac_get_type_size(ret_type) == 4);
         assert(ac_get_type_size(param_type) == 4);

         if (ret_type != param_type) {
            if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
               assert(LLVMGetPointerAddressSpace(param_type) == AC_ADDR_SPACE_CONST_32BIT);
               assert(ret_type == ctx->ac.i32);

               params[i] = LLVMBuildIntToPtr(builder, params[i], param_type, "");
            } else {
               params[i] = LLVMBuildBitCast(builder, params[i], param_type, "");
            }
         }
      }
   } else {
      ac_build_endif(&ctx->ac, 6506);

      if (ctx->stage == MESA_SHADER_TESS_CTRL) {
         LLVMValueRef count = ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info);
         count = LLVMBuildLShr(builder, count, LLVMConstInt(ctx->ac.i32, 8, 0), "");
         count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), "");

         LLVMValueRef ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, "");
         ac_build_ifcc(&ctx->ac, ena, 6507);
      }

      /* The second half of the merged shader should use the inputs from the
       * top-level (wrapper) function, not the return value from the last call.
       *
       * That's because the last call was executed conditionally, so we can't
       * consume it in the main block.
       */

      /* The second part's params are the same as the preceding params of the first part. */
      num_params = LLVMCountParams(parts[1].value);
   }

   ac_build_call(&ctx->ac, parts[1].pointee_type, parts[1].value, params, num_params);

   /* Close the conditional wrapping the second shader. */
   if (ctx->stage == MESA_SHADER_TESS_CTRL && !same_thread_count)
      ac_build_endif(&ctx->ac, 6507);

   LLVMBuildRetVoid(builder);
}

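/* ABI callback for radeonsi-specific NIR intrinsics that the common NIR->LLVM
 * translator leaves to the driver; returns NULL for unhandled intrinsics.
 */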
static LLVMValueRef si_llvm_load_intrinsic(struct ac_shader_abi *abi, nir_intrinsic_instr *intrin)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   switch (intrin->intrinsic) {
   case nir_intrinsic_load_tess_rel_patch_id_amd:
      return si_get_rel_patch_id(ctx);

   case nir_intrinsic_load_lds_ngg_scratch_base_amd:
      return LLVMBuildPtrToInt(ctx->ac.builder, ctx->gs_ngg_scratch.value, ctx->ac.i32, "");

   case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd:
      return LLVMBuildPtrToInt(ctx->ac.builder, ctx->gs_ngg_emit, ctx->ac.i32, "");

   default:
      return NULL;
   }
}

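/* ABI callback: turn an i32 index into a descriptor load from the combined
 * samplers-and-images list; the per-type offsets below address subranges of a
 * 16-dword slot.
 */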
static LLVMValueRef si_llvm_load_sampler_desc(struct ac_shader_abi *abi, LLVMValueRef index,
                                              enum ac_descriptor_type desc_type)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   LLVMBuilderRef builder = ctx->ac.builder;

   if (index && LLVMTypeOf(index) == ctx->ac.i32) {
      bool is_vec4 = false;

      switch (desc_type) {
      case AC_DESC_IMAGE:
         /* The image is at [0:7]. */
         index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, 2, 0), "");
         break;
      case AC_DESC_BUFFER:
         /* The buffer is in [4:7]. */
         index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), ctx->ac.i32_1);
         is_vec4 = true;
         break;
      case AC_DESC_FMASK:
         /* The FMASK is at [8:15]. */
         assert(ctx->screen->info.gfx_level < GFX11);
         index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1);
         break;
      case AC_DESC_SAMPLER:
         /* The sampler state is at [12:15]. */
         index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0),
                               LLVMConstInt(ctx->ac.i32, 3, 0));
         is_vec4 = true;
         break;
      default:
         unreachable("invalid desc");
      }

      struct ac_llvm_pointer list = {
         .value = ac_get_arg(&ctx->ac, ctx->args->samplers_and_images),
         .pointee_type = is_vec4 ? ctx->ac.v4i32 : ctx->ac.v8i32,
      };

      return ac_build_load_to_sgpr(&ctx->ac, list, index);
   }

   return index;
}

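/* Translate one NIR shader to LLVM IR in ctx: create the main function,
 * declare stage-specific LDS symbols, set up ABI callbacks and merged-shader
 * EXEC handling, run ac_nir_translate, and emit the stage-specific ending.
 */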
static bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shader,
                                  struct nir_shader *nir, bool free_nir)
{
   struct si_shader_selector *sel = shader->selector;
   const struct si_shader_info *info = &sel->info;

   ctx->shader = shader;
   ctx->stage = shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : sel->stage;

   ctx->num_const_buffers = info->base.num_ubos;
   ctx->num_shader_buffers = info->base.num_ssbos;

   ctx->num_samplers = BITSET_LAST_BIT(info->base.textures_used);
   ctx->num_images = info->base.num_images;

   ctx->abi.intrinsic_load = si_llvm_load_intrinsic;
   ctx->abi.load_sampler_desc = si_llvm_load_sampler_desc;

   si_llvm_create_main_func(ctx);

   switch (ctx->stage) {
   case MESA_SHADER_VERTEX:
      break;

   case MESA_SHADER_TESS_CTRL:
      si_llvm_init_tcs_callbacks(ctx);
      break;

   case MESA_SHADER_GEOMETRY:
      if (ctx->shader->key.ge.as_ngg) {
         LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
         ctx->gs_ngg_scratch = (struct ac_llvm_pointer) {
            .value = LLVMAddGlobalInAddressSpace(ctx->ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS),
            .pointee_type = ai32
         };
         LLVMSetInitializer(ctx->gs_ngg_scratch.value, LLVMGetUndef(ai32));
         LLVMSetAlignment(ctx->gs_ngg_scratch.value, 8);

         ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(
            ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
         LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
         LLVMSetAlignment(ctx->gs_ngg_emit, 4);
      }
      break;

   case MESA_SHADER_FRAGMENT: {
      ctx->abi.kill_ps_if_inf_interp =
         ctx->screen->options.no_infinite_interp &&
         (ctx->shader->selector->info.uses_persp_center ||
          ctx->shader->selector->info.uses_persp_centroid ||
          ctx->shader->selector->info.uses_persp_sample);
      break;
   }

   case MESA_SHADER_COMPUTE:
      if (ctx->shader->selector->info.base.shared_size)
         si_llvm_declare_compute_memory(ctx);
      break;

   default:
      break;
   }

   bool is_merged_esgs_stage =
      ctx->screen->info.gfx_level >= GFX9 && ctx->stage <= MESA_SHADER_GEOMETRY &&
      (ctx->shader->key.ge.as_es || ctx->stage == MESA_SHADER_GEOMETRY);

   bool is_nogs_ngg_stage =
      (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
      shader->key.ge.as_ngg && !shader->key.ge.as_es;

   /* Declare the ESGS ring as an explicit LDS symbol.
    * For NGG VS/TES, declare it unconditionally for streamout and vertex compaction.
    * Whether space is actually allocated is determined during linking / PM4 creation.
    */
   if (is_merged_esgs_stage || is_nogs_ngg_stage)
      si_llvm_declare_lds_esgs_ring(ctx);

   /* This is really only needed when streamout and/or vertex
    * compaction is enabled.
    */
   if (is_nogs_ngg_stage &&
       (si_shader_uses_streamout(shader) || shader->key.ge.opt.ngg_culling)) {
      LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
      ctx->gs_ngg_scratch = (struct ac_llvm_pointer) {
         .value = LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch",
                                              AC_ADDR_SPACE_LDS),
         .pointee_type = asi32
      };
      LLVMSetInitializer(ctx->gs_ngg_scratch.value, LLVMGetUndef(asi32));
      LLVMSetAlignment(ctx->gs_ngg_scratch.value, 8);
   }

   /* For merged shaders (VS-TCS, VS-GS, TES-GS): */
   if (ctx->screen->info.gfx_level >= GFX9 && si_is_merged_shader(shader)) {
      /* Set EXEC = ~0 before the first shader. For monolithic shaders, the wrapper
       * function does this.
       */
      if (ctx->stage == MESA_SHADER_TESS_EVAL) {
         /* TES has only 1 shader part, therefore it doesn't use the wrapper function. */
         if (!shader->is_monolithic || !shader->key.ge.as_es)
            ac_init_exec_full_mask(&ctx->ac);
      } else if (ctx->stage == MESA_SHADER_VERTEX) {
         if (shader->is_monolithic) {
            /* Only a monolithic VS with TCS/GS present has the wrapper function. */
            if (!shader->key.ge.as_ls && !shader->key.ge.as_es)
               ac_init_exec_full_mask(&ctx->ac);
         } else {
            ac_init_exec_full_mask(&ctx->ac);
         }
      }

      /* NGG VS and NGG TES: NIR NGG lowering sends gs_alloc_req at the beginning when culling
       * is disabled, but GFX10 may hang if not all waves are launched before gs_alloc_req.
       * We work around this HW bug by inserting a barrier before gs_alloc_req.
       */
      if (ctx->screen->info.gfx_level == GFX10 &&
          (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
          shader->key.ge.as_ngg && !shader->key.ge.as_es && !shader->key.ge.opt.ngg_culling)
         ac_build_s_barrier(&ctx->ac, ctx->stage);

      LLVMValueRef thread_enabled = NULL;

      if ((ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) ||
          (ctx->stage == MESA_SHADER_TESS_CTRL && !shader->is_monolithic)) {
         /* Wrap both shaders in an if statement according to the number of enabled threads
          * there. For monolithic TCS, the if statement is inserted by the wrapper function,
          * not here. For NGG GS, the if statement is inserted by NIR lowering.
          */
         thread_enabled = si_is_gs_thread(ctx); /* 2nd shader: thread enabled bool */
      } else if ((shader->key.ge.as_ls || shader->key.ge.as_es) && !shader->is_monolithic) {
         /* For monolithic LS (VS before TCS) and ES (VS before GS and TES before GS),
          * the if statement is inserted by the wrapper function.
          */
         thread_enabled = si_is_es_thread(ctx); /* 1st shader: thread enabled bool */
      }

      if (thread_enabled) {
         ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
         ctx->merged_wrap_if_label = 11500;
         ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
      }

      /* Execute a barrier before the second shader in
       * a merged shader.
       *
       * Execute the barrier inside the conditional block,
       * so that empty waves can jump directly to s_endpgm,
       * which will also signal the barrier.
       *
       * This is possible in gfx9, because an empty wave for the second shader does not insert
       * any ending. With NGG, empty waves may still be required to export data (e.g. GS output
       * vertices), so we cannot let them exit early.
       *
       * If the shader is TCS and the TCS epilog is present
       * and contains a barrier, it will wait there and then
       * reach s_endpgm.
       */
      if (ctx->stage == MESA_SHADER_TESS_CTRL) {
         /* We need the barrier only if TCS inputs are read from LDS. */
         if (!shader->key.ge.opt.same_patch_vertices ||
             shader->selector->info.base.inputs_read &
                ~shader->selector->info.tcs_vgpr_only_inputs) {
            ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);

            /* If both input and output patches are wholly in one wave, we don't need a barrier.
             * That's true when both VS and TCS have the same number of patch vertices and
             * the wave size is a multiple of the number of patch vertices.
             */
            if (!shader->key.ge.opt.same_patch_vertices ||
                ctx->ac.wave_size % sel->info.base.tess.tcs_vertices_out != 0)
               ac_build_s_barrier(&ctx->ac, ctx->stage);
         }
      } else if (ctx->stage == MESA_SHADER_GEOMETRY) {
         ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
         ac_build_s_barrier(&ctx->ac, ctx->stage);
      }
   }

   ctx->abi.clamp_shadow_reference = true;
   ctx->abi.robust_buffer_access = true;
   ctx->abi.load_grid_size_from_user_sgpr = true;
   ctx->abi.clamp_div_by_zero = ctx->screen->options.clamp_div_by_zero ||
                                info->options & SI_PROFILE_CLAMP_DIV_BY_ZERO;
   ctx->abi.disable_aniso_single_level = true;

   bool ls_need_output =
      ctx->stage == MESA_SHADER_VERTEX && shader->key.ge.as_ls &&
      shader->key.ge.opt.same_patch_vertices;

   bool tcs_need_output =
      ctx->stage == MESA_SHADER_TESS_CTRL && info->tessfactors_are_def_in_all_invocs;

   bool ps_need_output = ctx->stage == MESA_SHADER_FRAGMENT;

   if (ls_need_output || tcs_need_output || ps_need_output) {
      for (unsigned i = 0; i < info->num_outputs; i++) {
         LLVMTypeRef type = ctx->ac.f32;

         /* Only FS uses unpacked f16. Other stages pack 16-bit outputs into the low and
          * high bits of f32.
          */
         if (nir->info.stage == MESA_SHADER_FRAGMENT &&
             nir_alu_type_get_type_size(ctx->shader->selector->info.output_type[i]) == 16)
            type = ctx->ac.f16;

         for (unsigned j = 0; j < 4; j++) {
            ctx->abi.outputs[i * 4 + j] = ac_build_alloca_undef(&ctx->ac, type, "");
            ctx->abi.is_16bit[i * 4 + j] = type == ctx->ac.f16;
         }
      }
   }

   if (!ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args->ac, nir))
      return false;

   switch (ctx->stage) {
   case MESA_SHADER_VERTEX:
      if (shader->key.ge.as_ls)
         si_llvm_ls_build_end(ctx);
      else if (shader->key.ge.as_es)
         si_llvm_es_build_end(ctx);
      break;

   case MESA_SHADER_TESS_CTRL:
      if (!shader->is_monolithic)
         si_llvm_tcs_build_end(ctx);
      break;

   case MESA_SHADER_TESS_EVAL:
      if (ctx->shader->key.ge.as_es)
         si_llvm_es_build_end(ctx);
      break;

   case MESA_SHADER_GEOMETRY:
      if (!ctx->shader->key.ge.as_ngg)
         si_llvm_gs_build_end(ctx);
      break;

   case MESA_SHADER_FRAGMENT:
      if (!shader->is_monolithic)
         si_llvm_ps_build_end(ctx);
      break;

   default:
      break;
   }

   si_llvm_build_ret(ctx, ctx->return_value);

   if (free_nir)
      ralloc_free(nir);
   return true;
}

static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
                                    struct si_shader_selector *sel)
{
   if (!compiler->low_opt_passes)
      return false;

   /* Assume a slow CPU. */
   assert(!sel->screen->info.has_dedicated_vram && sel->screen->info.gfx_level <= GFX8);

   /* For a crazy dEQP test containing 2597 memory opcodes, mostly
    * buffer stores.
    */
   return sel->stage == MESA_SHADER_COMPUTE && sel->info.num_memory_stores > 1000;
}

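/* Compile a shader from NIR to an ELF binary. For monolithic merged stages on
 * GFX9+, also translate the preceding LS/ES part and join both parts with
 * si_build_wrapper_function.
 */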
bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                            struct si_shader *shader, struct si_shader_args *args,
                            struct util_debug_callback *debug, struct nir_shader *nir)
{
   struct si_shader_selector *sel = shader->selector;
   struct si_shader_context ctx;
   enum ac_float_mode float_mode = nir->info.stage == MESA_SHADER_KERNEL ?
                                      AC_FLOAT_MODE_DEFAULT : AC_FLOAT_MODE_DEFAULT_OPENGL;
   bool exports_color_null = false;
   bool exports_mrtz = false;

   if (sel->stage == MESA_SHADER_FRAGMENT) {
      exports_color_null = sel->info.colors_written;
      exports_mrtz = sel->info.writes_z || sel->info.writes_stencil || shader->ps.writes_samplemask;
      if (!exports_mrtz && !exports_color_null)
         exports_color_null = si_shader_uses_discard(shader) || sscreen->info.gfx_level < GFX10;
   }

   si_llvm_context_init(&ctx, sscreen, compiler, shader->wave_size, exports_color_null, exports_mrtz,
                        float_mode);
   ctx.args = args;

   if (!si_llvm_translate_nir(&ctx, shader, nir, false)) {
      si_llvm_dispose(&ctx);
      return false;
   }

   /* For merged shader stages. */
   if (shader->is_monolithic && sscreen->info.gfx_level >= GFX9 &&
       (sel->stage == MESA_SHADER_TESS_CTRL || sel->stage == MESA_SHADER_GEOMETRY)) {
      /* LS or ES shader. */
      struct si_shader prev_shader = {};

      bool free_nir;
      nir = si_get_prev_stage_nir_shader(shader, &prev_shader, ctx.args, &free_nir);

      struct ac_llvm_pointer parts[2];
      parts[1] = ctx.main_fn;

      if (!si_llvm_translate_nir(&ctx, &prev_shader, nir, free_nir)) {
         si_llvm_dispose(&ctx);
         return false;
      }

      parts[0] = ctx.main_fn;

      /* Reset the shader context. */
      ctx.shader = shader;
      ctx.stage = sel->stage;

      bool same_thread_count = shader->key.ge.opt.same_patch_vertices;
      si_build_wrapper_function(&ctx, parts, same_thread_count);
   }

   si_llvm_optimize_module(&ctx);

   /* Make sure the input is a pointer and not an integer followed by inttoptr. */
   assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn.value, 0))) == LLVMPointerTypeKind);

   /* Compile to bytecode. */
   if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug,
                        sel->stage, si_get_shader_name(shader),
                        si_should_optimize_less(compiler, shader->selector))) {
      si_llvm_dispose(&ctx);
      fprintf(stderr, "LLVM failed to compile shader\n");
      return false;
   }

   si_llvm_dispose(&ctx);
   return true;
}

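/* Build and compile a standalone shader part (TCS epilog, or PS prolog/epilog)
 * that is later linked with the main shader.
 */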
bool si_llvm_build_shader_part(struct si_screen *sscreen, gl_shader_stage stage,
                               bool prolog, struct ac_llvm_compiler *compiler,
                               struct util_debug_callback *debug, const char *name,
                               struct si_shader_part *result)
{
   union si_shader_part_key *key = &result->key;

   struct si_shader_selector sel = {};
   sel.screen = sscreen;

   struct si_shader shader = {};
   shader.selector = &sel;
   bool wave32 = false;
   bool exports_color_null = false;
   bool exports_mrtz = false;

   switch (stage) {
   case MESA_SHADER_TESS_CTRL:
      assert(!prolog);
      shader.key.ge.part.tcs.epilog = key->tcs_epilog.states;
      wave32 = key->tcs_epilog.wave32;
      break;
   case MESA_SHADER_FRAGMENT:
      if (prolog) {
         shader.key.ps.part.prolog = key->ps_prolog.states;
         wave32 = key->ps_prolog.wave32;
         exports_color_null = key->ps_prolog.states.poly_stipple;
      } else {
         shader.key.ps.part.epilog = key->ps_epilog.states;
         wave32 = key->ps_epilog.wave32;
         exports_color_null = key->ps_epilog.colors_written;
         exports_mrtz = key->ps_epilog.writes_z || key->ps_epilog.writes_stencil ||
                        key->ps_epilog.writes_samplemask;
         if (!exports_mrtz && !exports_color_null)
            exports_color_null = key->ps_epilog.uses_discard || sscreen->info.gfx_level < GFX10;
      }
      break;
   default:
      unreachable("bad shader part");
   }

   struct si_shader_context ctx;
   si_llvm_context_init(&ctx, sscreen, compiler, wave32 ? 32 : 64, exports_color_null, exports_mrtz,
                        AC_FLOAT_MODE_DEFAULT_OPENGL);

   ctx.shader = &shader;
   ctx.stage = stage;

   struct si_shader_args args;
   ctx.args = &args;

   void (*build)(struct si_shader_context *, union si_shader_part_key *);

   switch (stage) {
   case MESA_SHADER_TESS_CTRL:
      build = si_llvm_build_tcs_epilog;
      break;
   case MESA_SHADER_FRAGMENT:
      build = prolog ? si_llvm_build_ps_prolog : si_llvm_build_ps_epilog;
      break;
   default:
      unreachable("bad shader part");
   }

   build(&ctx, key);

   /* Compile. */
   si_llvm_optimize_module(&ctx);

   bool ret = si_compile_llvm(sscreen, &result->binary, &result->config, compiler,
                              &ctx.ac, debug, ctx.stage, name, false);

   si_llvm_dispose(&ctx);
   return ret;
}