1 /*
2 * Copyright 2014 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
7 #include "ac_llvm_build.h"
8 #include "ac_gpu_info.h"
9 #include "ac_nir.h"
10 #include "ac_llvm_util.h"
11 #include "ac_shader_util.h"
12 #include "c11/threads.h"
13 #include "shader_enums.h"
14 #include "sid.h"
15 #include "util/bitscan.h"
16 #include "util/macros.h"
17 #include "util/u_atomic.h"
18 #include "util/u_math.h"
19 #include <llvm-c/Core.h>
20 #include <llvm/Config/llvm-config.h>
21
22 #include <assert.h>
23 #include <stdio.h>
24
25 #define AC_LLVM_INITIAL_CF_DEPTH 4
26
27 /* Data for if/else/endif and bgnloop/endloop control flow structures.
28 */
29 struct ac_llvm_flow {
30 /* Loop exit or next part of if/else/endif. */
31 LLVMBasicBlockRef next_block;
32 LLVMBasicBlockRef loop_entry_block;
33 };
34
35 /* Initialize module-independent parts of the context.
36 *
37 * The caller is responsible for initializing ctx::module and ctx::builder.
38 */
void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler,
                          const struct radeon_info *info, enum ac_float_mode float_mode,
                          unsigned wave_size, unsigned ballot_mask_bits, bool exports_color_null,
                          bool exports_mrtz)
{
   /* The LLVMContext must be created first: the module, builder, types,
    * constants and metadata kinds below all hang off of it.
    */
   ctx->context = LLVMContextCreate();

   /* Cache caller-provided configuration on the context. */
   ctx->info = info;
   ctx->gfx_level = info->gfx_level;
   ctx->wave_size = wave_size;
   ctx->ballot_mask_bits = ballot_mask_bits;
   ctx->float_mode = float_mode;
   ctx->exports_color_null = exports_color_null;
   ctx->exports_mrtz = exports_mrtz;
   ctx->module = ac_create_module(compiler->tm, ctx->context);
   ctx->builder = ac_create_builder(ctx->context, float_mode);

   /* Cached type handles, so builders don't re-query LLVM for them. */
   ctx->voidt = LLVMVoidTypeInContext(ctx->context);
   ctx->i1 = LLVMInt1TypeInContext(ctx->context);
   ctx->i8 = LLVMInt8TypeInContext(ctx->context);
   ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
   ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
   ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
   ctx->i128 = LLVMIntTypeInContext(ctx->context, 128);
   ctx->intptr = ctx->i32;
   ctx->f16 = LLVMHalfTypeInContext(ctx->context);
   ctx->f32 = LLVMFloatTypeInContext(ctx->context);
   ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
   ctx->v4i8 = LLVMVectorType(ctx->i8, 4);
   ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
   ctx->v4i16 = LLVMVectorType(ctx->i16, 4);
   ctx->v2f16 = LLVMVectorType(ctx->f16, 2);
   ctx->v4f16 = LLVMVectorType(ctx->f16, 4);
   ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
   ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
   ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
   ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
   ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
   ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
   ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
   /* Mask types sized by the wave and by the ballot width respectively. */
   ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
   ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);

   /* Frequently used constants (depend on the type handles above). */
   ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
   ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
   ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
   ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
   ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
   ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
   ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
   ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
   ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false);
   ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false);
   ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
   ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
   ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
   ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
   ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
   ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);

   ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
   ctx->i1true = LLVMConstInt(ctx->i1, 1, false);

   /* Metadata kind IDs; the lengths are the strlen of each name. */
   ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5);
   ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14);
   ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);
   ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6);

   ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);

   /* fpmath node with a 3.0 accuracy bound, attached to fast-math ops. */
   LLVMValueRef three = LLVMConstReal(ctx->f32, 3);
   ctx->three_md = LLVMMDNodeInContext(ctx->context, &three, 1);

   /* Control-flow stack; freed in ac_llvm_context_dispose().
    * NOTE(review): the calloc result is not checked — OOM would crash on
    * first use of ctx->flow.
    */
   ctx->flow = calloc(1, sizeof(*ctx->flow));

   /* INT32_MAX marks "no ring-offsets argument". */
   ctx->ring_offsets_index = INT32_MAX;
}
116
ac_llvm_context_dispose(struct ac_llvm_context * ctx)117 void ac_llvm_context_dispose(struct ac_llvm_context *ctx)
118 {
119 free(ctx->flow->stack);
120 free(ctx->flow);
121 ctx->flow = NULL;
122
123 LLVMDisposeBuilder(ctx->builder);
124 }
125
/* Return the number of vector lanes of \p value, or 1 for a scalar. */
int ac_get_llvm_num_components(LLVMValueRef value)
{
   LLVMTypeRef type = LLVMTypeOf(value);

   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
      return LLVMGetVectorSize(type);

   return 1;
}
133
/* Extract lane \p index from a vector. Scalars pass through unchanged,
 * in which case only index 0 is legal.
 */
LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index)
{
   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind)
      return LLVMBuildExtractElement(ac->builder, value,
                                     LLVMConstInt(ac->i32, index, false), "");

   assert(index == 0);
   return value;
}
143
/* Return the bit width of \p type's (element) type. Vectors are reduced to
 * their element type first; LDS pointers count as 32 bits.
 */
int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
{
   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
      type = LLVMGetElementType(type);

   switch (LLVMGetTypeKind(type)) {
   case LLVMIntegerTypeKind:
      return LLVMGetIntTypeWidth(type);
   case LLVMPointerTypeKind:
      if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_LDS)
         return 32;
      break;
   default:
      break;
   }

   if (type == ctx->f16)
      return 16;
   if (type == ctx->f32)
      return 32;
   if (type == ctx->f64)
      return 64;

   unreachable("Unhandled type kind in get_elem_bits");
}
166
/* Return the size of \p type in bytes. Vectors and arrays recurse into
 * their element type.
 */
unsigned ac_get_type_size(LLVMTypeRef type)
{
   const LLVMTypeKind kind = LLVMGetTypeKind(type);

   if (kind == LLVMIntegerTypeKind)
      return LLVMGetIntTypeWidth(type) / 8;
   if (kind == LLVMHalfTypeKind)
      return 2;
   if (kind == LLVMFloatTypeKind)
      return 4;
   if (kind == LLVMDoubleTypeKind)
      return 8;
   if (kind == LLVMPointerTypeKind) {
      /* 32-bit constant pointers are the only 4-byte pointers. */
      return LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT ? 4 : 8;
   }
   if (kind == LLVMVectorTypeKind)
      return LLVMGetVectorSize(type) * ac_get_type_size(LLVMGetElementType(type));
   if (kind == LLVMArrayTypeKind)
      return LLVMGetArrayLength(type) * ac_get_type_size(LLVMGetElementType(type));

   assert(0);
   return 0;
}
193
/* Map a scalar type to the integer type of the same bit width. */
static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i1 || t == ctx->i8)
      return t;
   if (t == ctx->f16 || t == ctx->i16)
      return ctx->i16;
   if (t == ctx->f32 || t == ctx->i32)
      return ctx->i32;
   if (t == ctx->f64 || t == ctx->i64)
      return ctx->i64;

   unreachable("Unhandled integer size");
}
209
/* Map any type to the integer type of equal bit width. Pointers map to
 * i32 or i64 depending on the address-space pointer size.
 */
LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   const LLVMTypeKind kind = LLVMGetTypeKind(t);

   if (kind == LLVMVectorTypeKind) {
      LLVMTypeRef elem = to_integer_type_scalar(ctx, LLVMGetElementType(t));
      return LLVMVectorType(elem, LLVMGetVectorSize(t));
   }

   if (kind == LLVMPointerTypeKind) {
      switch (LLVMGetPointerAddressSpace(t)) {
      case AC_ADDR_SPACE_GLOBAL:
      case AC_ADDR_SPACE_CONST:
         return ctx->i64;
      case AC_ADDR_SPACE_CONST_32BIT:
      case AC_ADDR_SPACE_LDS:
         return ctx->i32;
      default:
         unreachable("unhandled address space");
      }
   }

   return to_integer_type_scalar(ctx, t);
}
230
/* Reinterpret \p v as an integer of equal width. Pointers are converted
 * with ptrtoint, everything else with a bitcast.
 */
LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef src_type = LLVMTypeOf(v);
   LLVMTypeRef dst_type = ac_to_integer_type(ctx, src_type);

   if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
      return LLVMBuildPtrToInt(ctx->builder, v, dst_type, "");

   return LLVMBuildBitCast(ctx->builder, v, dst_type, "");
}
239
/* Like ac_to_integer(), but pointers pass through untouched. */
LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   const bool is_pointer = LLVMGetTypeKind(LLVMTypeOf(v)) == LLVMPointerTypeKind;

   return is_pointer ? v : ac_to_integer(ctx, v);
}
247
/* Map a scalar type to the float type of the same bit width.
 * i8 has no float counterpart and passes through unchanged.
 */
static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i8)
      return ctx->i8;
   if (t == ctx->i16 || t == ctx->f16)
      return ctx->f16;
   if (t == ctx->i32 || t == ctx->f32)
      return ctx->f32;
   if (t == ctx->i64 || t == ctx->f64)
      return ctx->f64;

   unreachable("Unhandled float size");
}
261
/* Map a scalar or vector type to the float type of equal bit width. */
LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (LLVMGetTypeKind(t) != LLVMVectorTypeKind)
      return to_float_type_scalar(ctx, t);

   return LLVMVectorType(to_float_type_scalar(ctx, LLVMGetElementType(t)),
                         LLVMGetVectorSize(t));
}
270
/* Bitcast \p v to the float type of equal bit width. */
LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef ftype = ac_to_float_type(ctx, LLVMTypeOf(v));

   return LLVMBuildBitCast(ctx->builder, v, ftype, "");
}
276
/* Build a call to the intrinsic \p name, declaring it in the module on
 * first use. The function type is derived from \p return_type and the
 * types of \p params; \p attrib_mask is a mask of AC_ATTR_* flags that
 * become call-site metadata/attributes.
 */
LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
                                LLVMTypeRef return_type, LLVMValueRef *params, unsigned param_count,
                                unsigned attrib_mask)
{
   LLVMValueRef call;

   /* Fixed-size scratch array; intrinsics never take more than 32 args. */
   LLVMTypeRef param_types[32];
   assert(param_count <= 32);
   for (unsigned i = 0; i < param_count; ++i) {
      assert(params[i]);
      param_types[i] = LLVMTypeOf(params[i]);
   }

   LLVMTypeRef function_type = LLVMFunctionType(return_type, param_types, param_count, 0);
   LLVMValueRef function = LLVMGetNamedFunction(ctx->module, name);

   if (!function) {
      /* First call of this intrinsic in the module: add a declaration. */
      function = LLVMAddFunction(ctx->module, name, function_type);

      LLVMSetFunctionCallConv(function, LLVMCCallConv);
      LLVMSetLinkage(function, LLVMExternalLinkage);
   }

   call = LLVMBuildCall2(ctx->builder, function_type, function, params, param_count, "");

   if (attrib_mask & AC_ATTR_INVARIANT_LOAD)
      LLVMSetMetadata(call, ctx->invariant_load_md_kind, ctx->empty_md);

   if (attrib_mask & AC_ATTR_CONVERGENT)
      LLVMAddCallSiteAttribute(call, -1, ac_get_llvm_attribute(ctx->context, "convergent"));

   /* Index -1 attaches the attribute to the call site itself. */
   LLVMAddCallSiteAttribute(call, -1, ac_get_llvm_attribute(ctx->context, "nounwind"));
   return call;
}
311
312 /**
313 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
314 * intrinsic names).
315 */
void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
{
   LLVMTypeRef elem_type = type;

   if (LLVMGetTypeKind(type) == LLVMStructTypeKind) {
      /* Structs encode as "sl_" + member encodings + trailing "s". */
      unsigned count = LLVMCountStructElementTypes(type);
      int ret = snprintf(buf, bufsize, "sl_");
      buf += ret;
      bufsize -= ret;

      LLVMTypeRef *elems = alloca(count * sizeof(LLVMTypeRef));
      LLVMGetStructElementTypes(type, elems);

      for (unsigned i = 0; i < count; i++) {
         /* Recurse per member, then advance past what the callee wrote. */
         ac_build_type_name_for_intr(elems[i], buf, bufsize);
         ret = strlen(buf);
         buf += ret;
         bufsize -= ret;
      }

      snprintf(buf, bufsize, "s");
      return;
   }

   /* Worst non-struct case, e.g. "v16i32" plus terminator. */
   assert(bufsize >= 8);
   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
      /* Vectors prepend "v<N>", then fall through to the element type. */
      int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type));
      if (ret < 0) {
         char *type_name = LLVMPrintTypeToString(type);
         fprintf(stderr, "Error building type name for: %s\n", type_name);
         LLVMDisposeMessage(type_name);
         return;
      }
      elem_type = LLVMGetElementType(type);
      buf += ret;
      bufsize -= ret;
   }
   switch (LLVMGetTypeKind(elem_type)) {
   default:
      /* Unknown element kinds leave buf as-is. */
      break;
   case LLVMIntegerTypeKind:
      snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
      break;
   case LLVMHalfTypeKind:
      snprintf(buf, bufsize, "f16");
      break;
   case LLVMFloatTypeKind:
      snprintf(buf, bufsize, "f32");
      break;
   case LLVMDoubleTypeKind:
      snprintf(buf, bufsize, "f64");
      break;
   }
}
370
371 /**
372 * Helper function that builds an LLVM IR PHI node and immediately adds
373 * incoming edges.
374 */
/* Build a PHI node of \p type and attach all incoming (value, block)
 * edges in one go.
 */
LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming,
                          LLVMValueRef *values, LLVMBasicBlockRef *blocks)
{
   LLVMValueRef result = LLVMBuildPhi(ctx->builder, type, "");

   LLVMAddIncoming(result, values, blocks, count_incoming);
   return result;
}
382
/* Emit a workgroup barrier (s_barrier), except where it is a no-op. */
void ac_build_s_barrier(struct ac_llvm_context *ctx, gl_shader_stage stage)
{
   /* GFX6 only: s_barrier isn’t needed in TCS because an entire patch always
    * fits into a single wave due to a bug workaround disallowing multi-wave
    * HS workgroups.
    */
   const bool skip = ctx->gfx_level == GFX6 && stage == MESA_SHADER_TESS_CTRL;

   if (!skip)
      ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, 0);
}
393
394 /* Prevent optimizations (at least of memory accesses) across the current
395 * point in the program by emitting empty inline assembly that is marked as
396 * having side effects.
397 *
398 * Optionally, a value can be passed through the inline assembly to prevent
399 * LLVM from hoisting calls to ReadNone functions.
400 */
void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pgpr, bool sgpr)
{
   /* Monotonic counter embedded in the asm text; presumably this keeps each
    * barrier string unique so LLVM cannot merge/CSE the calls — TODO confirm.
    */
   static int counter = 0;

   LLVMBuilderRef builder = ctx->builder;
   char code[16];
   /* Tie the output to input 0, in an SGPR ("s") or VGPR ("v"). */
   const char *constraint = sgpr ? "=s,0" : "=v,0";

   snprintf(code, sizeof(code), "; %d", (int)p_atomic_inc_return(&counter));

   if (!pgpr) {
      /* Pure barrier: empty side-effecting inline asm, no operands. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
      LLVMBuildCall2(builder, ftype, inlineasm, NULL, 0, "");
   } else {
      LLVMTypeRef old_type = LLVMTypeOf(*pgpr);

      /* i1 can't be a register operand; widen to i32 and truncate back. */
      if (old_type == ctx->i1)
         *pgpr = LLVMBuildZExt(builder, *pgpr, ctx->i32, "");

      /* v3i16 is padded to vec4 for the asm operand and re-narrowed after;
       * presumably 48-bit values can't be tied to a register — TODO confirm.
       */
      if (old_type == LLVMVectorType(ctx->i16, 3))
         *pgpr = ac_build_expand_to_vec4(ctx, *pgpr, 4);

      LLVMTypeRef type = LLVMTypeOf(*pgpr);
      LLVMTypeRef ftype = LLVMFunctionType(type, &type, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      /* Pass the value through the asm so its uses can't be hoisted. */
      *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");

      if (old_type == ctx->i1)
         *pgpr = LLVMBuildTrunc(builder, *pgpr, old_type, "");

      if (old_type == LLVMVectorType(ctx->i16, 3))
         *pgpr = ac_extract_components(ctx, *pgpr, 0, 3);
   }
}
437
ac_build_shader_clock(struct ac_llvm_context * ctx,mesa_scope scope)438 LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, mesa_scope scope)
439 {
440 if (ctx->gfx_level >= GFX11 && scope == SCOPE_DEVICE) {
441 const char *name = "llvm.amdgcn.s.sendmsg.rtn.i64";
442 LLVMValueRef arg = LLVMConstInt(ctx->i32, 0x83 /* realtime */, 0);
443 LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, &arg, 1, 0);
444 return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
445 }
446
447 const char *subgroup = "llvm.readcyclecounter";
448 const char *name = scope == SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : subgroup;
449
450 LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0);
451 return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
452 }
453
/* Return a wave-sized mask with one bit per lane where \p value != 0.
 * Implemented as amdgcn.icmp(value, 0, NE) across the wave.
 */
LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *name;

   /* The icmp intrinsic operates on i32; widen booleans first. */
   if (LLVMTypeOf(value) == ctx->i1)
      value = LLVMBuildZExt(ctx->builder, value, ctx->i32, "");

   if (ctx->wave_size == 64)
      name = "llvm.amdgcn.icmp.i64.i32";
   else
      name = "llvm.amdgcn.icmp.i32.i32";

   LLVMValueRef args[3] = {value, ctx->i32_0, LLVMConstInt(ctx->i32, LLVMIntNE, 0)};

   /* We currently have no other way to prevent LLVM from lifting the icmp
    * calls to a dominating basic block.
    */
   ac_build_optimization_barrier(ctx, &args[0], false);

   args[0] = ac_to_integer(ctx, args[0]);

   return ac_build_intrinsic(ctx, name, ctx->iN_wavemask, args, 3, 0);
}
477
/* True iff every active lane votes yes: the vote mask must equal the
 * active-lane mask.
 */
LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef active = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef votes = ac_build_ballot(ctx, value);

   return LLVMBuildICmp(ctx->builder, LLVMIntEQ, votes, active, "");
}
484
/* True iff at least one active lane votes yes (non-zero vote mask). */
LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef votes = ac_build_ballot(ctx, value);
   LLVMValueRef none = LLVMConstInt(ctx->iN_wavemask, 0, 0);

   return LLVMBuildICmp(ctx->builder, LLVMIntNE, votes, none, "");
}
491
/* Gather values[component .. component+value_count) into a vector.
 * A single value is returned as a scalar.
 */
LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                            unsigned value_count, unsigned component)
{
   if (!value_count)
      unreachable("value_count is 0");
   if (value_count == 1)
      return values[component];

   /* Seed with undef of the right vector type, then fill in each lane. */
   LLVMValueRef vec =
      LLVMGetUndef(LLVMVectorType(LLVMTypeOf(values[component]), value_count));

   for (unsigned lane = 0; lane < value_count; lane++) {
      LLVMValueRef index = LLVMConstInt(ctx->i32, lane, false);
      vec = LLVMBuildInsertElement(ctx->builder, vec, values[component + lane], index, "");
   }
   return vec;
}
512
/* Gather value_count values (strided by value_stride elements) into a
 * vector. A single value is returned as-is unless always_vector is set.
 */
LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                             unsigned value_count, unsigned value_stride,
                                             bool always_vector)
{
   LLVMBuilderRef builder = ctx->builder;

   if (!value_count)
      unreachable("value_count is 0");
   if (value_count == 1 && !always_vector)
      return values[0];

   /* Seed with undef of the right vector type, then fill in each lane. */
   LLVMValueRef vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(values[0]), value_count));

   for (unsigned lane = 0; lane < value_count; lane++) {
      LLVMValueRef index = LLVMConstInt(ctx->i32, lane, false);
      vec = LLVMBuildInsertElement(builder, vec, values[lane * value_stride], index, "");
   }
   return vec;
}
536
/* Gather contiguous values into a vector (scalar pass-through allowed). */
LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                    unsigned value_count)
{
   return ac_build_gather_values_extended(ctx, values, value_count,
                                          /*value_stride=*/1, /*always_vector=*/false);
}
542
/* Concatenate the components of \p a and \p b into one vector.
 * A NULL \p a returns \p b unchanged.
 */
LLVMValueRef ac_build_concat(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   if (!a)
      return b;

   const unsigned na = ac_get_llvm_num_components(a);
   const unsigned nb = ac_get_llvm_num_components(b);
   LLVMValueRef *elems = alloca((na + nb) * sizeof(LLVMValueRef));
   unsigned count = 0;

   for (unsigned i = 0; i < na; i++)
      elems[count++] = ac_llvm_extract_elem(ctx, a, i);
   for (unsigned i = 0; i < nb; i++)
      elems[count++] = ac_llvm_extract_elem(ctx, b, i);

   return ac_build_gather_values(ctx, elems, count);
}
559
560 /* Expand a scalar or vector to <dst_channels x type> by filling the remaining
561 * channels with undef. Extract at most src_channels components from the input.
562 */
LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,
                             unsigned src_channels, unsigned dst_channels)
{
   LLVMTypeRef elemtype;
   LLVMValueRef *const chan = alloca(dst_channels * sizeof(LLVMValueRef));

   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
      unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));

      /* Already the requested shape — nothing to do. */
      if (src_channels == dst_channels && vec_size == dst_channels)
         return value;

      /* Never read past the end of the input vector. */
      src_channels = MIN2(src_channels, vec_size);

      for (unsigned i = 0; i < src_channels; i++)
         chan[i] = ac_llvm_extract_elem(ctx, value, i);

      elemtype = LLVMGetElementType(LLVMTypeOf(value));
   } else {
      /* Scalar input contributes at most one channel. */
      if (src_channels) {
         assert(src_channels == 1);
         chan[0] = value;
      }
      elemtype = LLVMTypeOf(value);
   }

   /* Pad the remaining channels with undef. */
   for (unsigned i = src_channels; i < dst_channels; i++)
      chan[i] = LLVMGetUndef(elemtype);

   return ac_build_gather_values(ctx, chan, dst_channels);
}
594
595 /* Extract components [start, start + channels) from a vector.
596 */
/* Extract components [start, start + channels) of \p value into a new
 * vector (or scalar, for a single channel).
 */
LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start,
                                   unsigned channels)
{
   LLVMValueRef *const comps = alloca(channels * sizeof(LLVMValueRef));

   for (unsigned c = 0; c < channels; c++)
      comps[c] = ac_llvm_extract_elem(ctx, value, start + c);

   return ac_build_gather_values(ctx, comps, channels);
}
607
608 /* Expand a scalar or vector to <4 x type> by filling the remaining channels
609 * with undef. Extract at most num_channels components from the input.
610 */
/* Convenience wrapper: expand \p value to a 4-channel vector, taking at
 * most \p num_channels components from the input and padding with undef.
 */
LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value,
                                     unsigned num_channels)
{
   return ac_build_expand(ctx, value, num_channels, /*dst_channels=*/4);
}
616
/* Build num / den as num * rcp(den), picking the rcp intrinsic that
 * matches the operand width.
 */
LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den)
{
   const char *name;

   switch (ac_get_type_size(LLVMTypeOf(den))) {
   case 2:
      name = "llvm.amdgcn.rcp.f16";
      break;
   case 4:
      name = "llvm.amdgcn.rcp.f32";
      break;
   default:
      name = "llvm.amdgcn.rcp.f64";
      break;
   }

   LLVMValueRef rcp = ac_build_intrinsic(ctx, name, LLVMTypeOf(den), &den, 1, 0);

   return LLVMBuildFMul(ctx->builder, num, rcp, "");
}
634
/* Interpolate a 32-bit FS input at barycentric coordinates (i, j).
 * llvm_chan selects the component and attr_number the attribute; params
 * carries the interpolation parameter data set up by the caller
 * (NOTE(review): its exact meaning differs pre/post GFX11 — see the
 * intrinsic argument lists below).
 */
LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                LLVMValueRef j)
{
   LLVMValueRef args[5];

   if (ctx->gfx_level >= GFX11) {
      /* GFX11+: load the per-primitive parameter from LDS, then do the
       * two-stage in-register interpolation (p10 then p2).
       */
      LLVMValueRef p;
      LLVMValueRef p10;

      args[0] = llvm_chan;
      args[1] = attr_number;
      args[2] = params;

      p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
                             ctx->f32, args, 3, 0);

      args[0] = p;
      args[1] = i;
      args[2] = p;

      p10 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p10",
                               ctx->f32, args, 3, 0);

      args[0] = p;
      args[1] = j;
      args[2] = p10;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p2",
                                ctx->f32, args, 3, 0);

   } else {
      /* Pre-GFX11: classic v_interp_p1/p2 pair. */
      LLVMValueRef p1;

      args[0] = i;
      args[1] = llvm_chan;
      args[2] = attr_number;
      args[3] = params;

      p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
                              ctx->f32, args, 4, 0);

      args[0] = p1;
      args[1] = j;
      args[2] = llvm_chan;
      args[3] = attr_number;
      args[4] = params;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
                                ctx->f32, args, 5, 0);
   }
}
687
/* f16 variant of ac_build_fs_interp(). high_16bits is passed as an i1 to
 * the .f16 interp intrinsics and selects which half of the 32-bit
 * attribute slot is interpolated.
 */
LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                    LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                    LLVMValueRef j, bool high_16bits)
{
   LLVMValueRef args[6];

   if (ctx->gfx_level >= GFX11) {
      /* GFX11+: LDS parameter load, then two-stage f16 in-register interp. */
      LLVMValueRef p;
      LLVMValueRef p10;

      args[0] = llvm_chan;
      args[1] = attr_number;
      args[2] = params;

      p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
                             ctx->f32, args, 3, 0);

      args[0] = p;
      args[1] = i;
      args[2] = p;
      args[3] = high_16bits ? ctx->i1true : ctx->i1false;

      p10 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p10.f16",
                               ctx->f32, args, 4, 0);

      args[0] = p;
      args[1] = j;
      args[2] = p10;
      args[3] = high_16bits ? ctx->i1true : ctx->i1false;

      /* Final stage returns the interpolated value as f16. */
      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p2.f16",
                                ctx->f16, args, 4, 0);

   } else {
      /* Pre-GFX11: v_interp_p1ll/p2 f16 pair. */
      LLVMValueRef p1;

      args[0] = i;
      args[1] = llvm_chan;
      args[2] = attr_number;
      args[3] = high_16bits ? ctx->i1true : ctx->i1false;
      args[4] = params;

      p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", ctx->f32, args, 5,
                              0);

      args[0] = p1;
      args[1] = j;
      args[2] = llvm_chan;
      args[3] = attr_number;
      args[4] = high_16bits ? ctx->i1true : ctx->i1false;
      args[5] = params;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", ctx->f16, args, 6,
                                0);
   }
}
744
/* Read an attribute without interpolation ("flat"-style mov).
 * \p parameter selects which vertex's value to read (feeds the intrinsic's
 * selector pre-GFX11 and the quad-swizzle lane on GFX11+).
 */
LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, unsigned parameter,
                                    LLVMValueRef llvm_chan, LLVMValueRef attr_number,
                                    LLVMValueRef params)
{
   LLVMValueRef args[4];

   if (ctx->gfx_level >= GFX11) {
      LLVMValueRef p;

      args[0] = llvm_chan;
      args[1] = attr_number;
      args[2] = params;

      p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
                             ctx->f32, args, 3, 0);
      /* Broadcast quad lane \p parameter to all lanes; the wqm wrappers
       * keep the value defined in whole-quad mode on both sides of the
       * swizzle.
       */
      p = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32, &p, 1, 0);
      p = ac_build_quad_swizzle(ctx, p, parameter, parameter, parameter, parameter, true);
      return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32, &p, 1, 0);
   } else {
      /* (parameter + 2) % 3 remaps to the interp_mov selector encoding. */
      args[0] = LLVMConstInt(ctx->i32, (parameter + 2) % 3, 0);
      args[1] = llvm_chan;
      args[2] = attr_number;
      args[3] = params;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", ctx->f32, args, 4, 0);
   }
}
772
/* GEP to &ptr[0][index], i.e. index into the array that \p ptr points at. */
LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index)
{
   LLVMValueRef indices[] = {ctx->i32_0, index};

   return LLVMBuildGEP2(ctx->builder, ptr.t, ptr.v, indices, ARRAY_SIZE(indices), "");
}
782
/* Store \p value to &ptr[0][index]. */
void ac_build_indexed_store(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index,
                            LLVMValueRef value)
{
   LLVMValueRef addr = ac_build_gep0(ctx, ptr, index);

   LLVMBuildStore(ctx->builder, value, addr);
}
788
789 /**
790 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
791 * It's equivalent to doing a load from &base_ptr[index].
792 *
793 * \param base_ptr Where the array starts.
794 * \param index The element index into the array.
795 * \param uniform Whether the base_ptr and index can be assumed to be
796 * dynamically uniform (i.e. load to an SGPR)
797 * \param invariant Whether the load is invariant (no other opcodes affect it)
798 * \param no_unsigned_wraparound
799 * For all possible re-associations and re-distributions of an expression
800 * "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
801 * without inbounds in base_ptr), this parameter is true if "addr + offset"
802 * does not result in an unsigned integer wraparound. This is used for
803 * optimal code generation of 32-bit pointer arithmetic.
804 *
805 * For example, a 32-bit immediate offset that causes a 32-bit unsigned
806 * integer wraparound can't be an imm offset in s_load_dword, because
807 * the instruction performs "addr + offset" in 64 bits.
808 *
809 * Expected usage for bindless textures by chaining GEPs:
810 * // possible unsigned wraparound, don't use InBounds:
811 * ptr1 = LLVMBuildGEP(base_ptr, index);
812 * image = load(ptr1); // becomes "s_load ptr1, 0"
813 *
814 * ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
815 * sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
816 */
static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMTypeRef type,
                                         LLVMValueRef base_ptr, LLVMValueRef index,
                                         bool uniform, bool invariant, bool no_unsigned_wraparound)
{
   LLVMValueRef pointer, result;

   /* InBounds is only legal when the caller guarantees no unsigned
    * wraparound; it enables immediate offsets in scalar loads (see the
    * function comment above).
    */
   if (no_unsigned_wraparound &&
       LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
      pointer = LLVMBuildInBoundsGEP2(ctx->builder, type, base_ptr, &index, 1, "");
   else
      pointer = LLVMBuildGEP2(ctx->builder, type, base_ptr, &index, 1, "");

   /* amdgpu.uniform hints the backend to use a scalar (SGPR) load. */
   if (uniform)
      LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
   result = LLVMBuildLoad2(ctx->builder, type, pointer, "");
   if (invariant)
      LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
   /* Alignment is hardcoded to 4 bytes for all loads built here. */
   LLVMSetAlignment(result, 4);
   return result;
}
837
/* Indexed load marked invariant (non-uniform, wraparound possible). */
LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr,
                                     LLVMValueRef index)
{
   return ac_build_load_custom(ctx, ptr.t, ptr.v, index,
                               /*uniform=*/false, /*invariant=*/true,
                               /*no_unsigned_wraparound=*/false);
}
843
844 /* This assumes that there is no unsigned integer wraparound during the address
845 * computation, excluding all GEPs within base_ptr. */
/* Uniform, invariant indexed load hinted to land in an SGPR. */
LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr,
                                   LLVMValueRef index)
{
   return ac_build_load_custom(ctx, ptr.t, ptr.v, index,
                               /*uniform=*/true, /*invariant=*/true,
                               /*no_unsigned_wraparound=*/true);
}
851
/* Translate GL access qualifiers into the HW cache-policy bit value for
 * the current gfx level.
 */
static unsigned get_cache_flags(struct ac_llvm_context *ctx, enum gl_access_qualifier access)
{
   return ac_get_hw_cache_flags(ctx->gfx_level, access).value;
}
856
/* Emit a raw/struct buffer store intrinsic.
 *
 * \param rsrc        buffer resource descriptor (bitcast to v4i32)
 * \param data        value to store; its LLVM type selects the intrinsic overload
 * \param vindex      if non-NULL, use "struct" (indexed) addressing, else "raw"
 * \param voffset     per-lane byte offset (NULL means 0)
 * \param soffset     scalar byte offset (NULL means 0)
 * \param use_format  emit the ".format" variant, which converts data according to
 *                    the buffer format in the descriptor
 */
static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef data, LLVMValueRef vindex,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         enum gl_access_qualifier access, bool use_format)
{
   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = data;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   /* Only the "struct" (indexed) intrinsics take a vindex operand.
    * (Fixed: the old "vindex ? vindex : ctx->i32_0" here was dead — vindex is
    * known non-NULL inside this branch; now matches ac_build_buffer_load_common.) */
   if (vindex)
      args[idx++] = vindex;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_STORE), 0);
   const char *indexing_kind = vindex ? "struct" : "raw";
   char name[256], type_name[8];

   ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", indexing_kind, type_name);
   }

   ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, 0);
}
885
/* Store "data" through the buffer's format-conversion path (MUBUF *.format.*).
 * No scalar offset is used. */
void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
                                  LLVMValueRef vindex, LLVMValueRef voffset, enum gl_access_qualifier access)
{
   ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset,
                                /*soffset=*/NULL, access, /*use_format=*/true);
}
891
892 /* buffer_store_dword(,x2,x3,x4) <- the suffix is selected by the type of vdata. */
ac_build_buffer_store_dword(struct ac_llvm_context * ctx,LLVMValueRef rsrc,LLVMValueRef vdata,LLVMValueRef vindex,LLVMValueRef voffset,LLVMValueRef soffset,enum gl_access_qualifier access)893 void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
894 LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
895 enum gl_access_qualifier access)
896 {
897 unsigned num_channels = ac_get_llvm_num_components(vdata);
898
899 /* Split 3 channel stores if unsupported. */
900 if (num_channels == 3 && !ac_has_vec3_support(ctx->gfx_level, false)) {
901 LLVMValueRef v[3], v01, voffset2;
902
903 for (int i = 0; i < 3; i++) {
904 v[i] = LLVMBuildExtractElement(ctx->builder, vdata, LLVMConstInt(ctx->i32, i, 0), "");
905 }
906 v01 = ac_build_gather_values(ctx, v, 2);
907
908 voffset2 = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0,
909 LLVMConstInt(ctx->i32, 8, 0), "");
910
911 ac_build_buffer_store_dword(ctx, rsrc, v01, vindex, voffset, soffset, access);
912 ac_build_buffer_store_dword(ctx, rsrc, v[2], vindex, voffset2, soffset, access);
913 return;
914 }
915
916 ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), vindex, voffset, soffset,
917 access, false);
918 }
919
/* Emit a raw/struct buffer load intrinsic and return up to num_channels values.
 *
 * \param vindex        if non-NULL, use "struct" (indexed) addressing, else "raw"
 * \param voffset       per-lane byte offset (NULL means 0)
 * \param soffset       scalar byte offset (NULL means 0)
 * \param channel_type  element type of the result (e.g. f32, f16, i16)
 * \param can_speculate mark the load invariant so LLVM may hoist/speculate it
 * \param use_format    emit the ".format" variant (format conversion by HW)
 */
static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                                LLVMValueRef vindex, LLVMValueRef voffset,
                                                LLVMValueRef soffset, unsigned num_channels,
                                                LLVMTypeRef channel_type, enum gl_access_qualifier access,
                                                bool can_speculate, bool use_format)
{
   LLVMValueRef args[5];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   /* Only the "struct" (indexed) intrinsics take a vindex operand. */
   if (vindex)
      args[idx++] = vindex;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD), 0);
   /* Widen 3-channel loads to 4 when the target has no vec3 support; the extra
    * channel is trimmed off below. */
   unsigned func =
      !ac_has_vec3_support(ctx->gfx_level, use_format) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = vindex ? "struct" : "raw";
   char name[256], type_name[8];

   /* D16 is only supported on gfx8+ */
   assert(!use_format || (channel_type != ctx->f16 && channel_type != ctx->i16) ||
          ctx->gfx_level >= GFX8);

   LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", indexing_kind, type_name);
   }

   LLVMValueRef result = ac_build_intrinsic(ctx, name, type, args, idx,
                                            can_speculate ? AC_ATTR_INVARIANT_LOAD : 0);
   /* Drop the padding channel added for the vec3->vec4 workaround. */
   if (func > num_channels)
      result = ac_trim_vector(ctx, result, num_channels);
   return result;
}
959
/* Load num_channels elements of channel_type from a buffer.
 *
 * Two paths:
 *  - SMEM (scalar) path: one llvm.amdgcn.s.buffer.load per channel, when the
 *    caller allows it and the access is compatible (coherent access needs gfx8+).
 *  - VMEM path: split into fetches of at most 4 channels each, since LLVM
 *    cannot select buffer loads wider than that.
 *
 * \param allow_smem  permit the scalar path (requires vindex == NULL)
 */
LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
                                  LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                  LLVMTypeRef channel_type, enum gl_access_qualifier access,
                                  bool can_speculate, bool allow_smem)
{
   if (allow_smem && (!(access & ACCESS_COHERENT) || ctx->gfx_level >= GFX8)) {
      assert(vindex == NULL);

      LLVMValueRef result[32];

      /* s.buffer.load takes a single combined byte offset. */
      LLVMValueRef offset = voffset ? voffset : ctx->i32_0;
      if (soffset)
         offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");

      char name[256], type_name[8];
      ac_build_type_name_for_intr(channel_type, type_name, sizeof(type_name));
      snprintf(name, sizeof(name), "llvm.amdgcn.s.buffer.load.%s", type_name);

      LLVMValueRef channel_size = LLVMConstInt(ctx->i32, ac_get_type_size(channel_type), 0);

      /* One scalar load per channel, advancing the offset each time. */
      for (int i = 0; i < num_channels; i++) {
         if (i) {
            offset = LLVMBuildAdd(ctx->builder, offset, channel_size, "");
         }
         LLVMValueRef args[3] = {
            rsrc,
            offset,
            LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD |
                                                        ACCESS_TYPE_SMEM), 0),
         };
         result[i] = ac_build_intrinsic(ctx, name, channel_type, args, 3, AC_ATTR_INVARIANT_LOAD);
      }
      if (num_channels == 1)
         return result[0];

      return ac_build_gather_values(ctx, result, num_channels);
   }

   /* LLVM is unable to select instructions for num_channels > 4, so we
    * workaround that by manually splitting larger buffer loads.
    */
   LLVMValueRef result = NULL;
   for (unsigned i = 0, fetch_num_channels; i < num_channels; i += fetch_num_channels) {
      fetch_num_channels = MIN2(4, num_channels - i);
      LLVMValueRef fetch_voffset =
         LLVMBuildAdd(ctx->builder, voffset,
                      LLVMConstInt(ctx->i32, i * ac_get_type_size(channel_type), 0), "");
      LLVMValueRef item =
         ac_build_buffer_load_common(ctx, rsrc, vindex, fetch_voffset, soffset, fetch_num_channels,
                                     channel_type, access, can_speculate, false);
      /* Accumulate the partial fetches into one result vector. */
      result = ac_build_concat(ctx, result, item);
   }

   return result;
}
1015
/* Load a formatted vec4 from a buffer.
 *
 * \param d16  return f16 channels instead of f32 (not supported with tfe)
 * \param tfe  texture-fail-enable: also return a 5th status dword (for sparse
 *             residency); implemented with inline assembly since the plain
 *             intrinsic path doesn't expose TFE here
 */
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef vindex, LLVMValueRef voffset,
                                         unsigned num_channels, enum gl_access_qualifier access,
                                         bool can_speculate, bool d16, bool tfe)
{
   if (tfe) {
      assert(!d16);

      union ac_hw_cache_flags cache_flags =
         ac_get_hw_cache_flags(ctx->gfx_level, access | ACCESS_TYPE_LOAD);
      char code[1024];

      /* The definition in the assembly and the one in the constraint string
       * differs because of an assembler bug.
       */
      if (ctx->gfx_level >= GFX12) {
         /* Gfx12 expresses the cache policy as scope + temporal-hint modifiers. */
         const char *scope = "";
         const char *temporal_hint = "";

         if (cache_flags.gfx12.scope == gfx12_scope_se)
            scope = "scope:SCOPE_SE";
         else if (cache_flags.gfx12.scope == gfx12_scope_device)
            scope = "scope:SCOPE_DEV";
         else if (cache_flags.gfx12.scope == gfx12_scope_memory)
            scope = "scope:SCOPE_SYS";

         if (cache_flags.gfx12.temporal_hint == gfx12_load_non_temporal)
            temporal_hint = "th:TH_LOAD_NT";
         else if (cache_flags.gfx12.temporal_hint == gfx12_load_high_temporal)
            temporal_hint = "th:TH_LOAD_HT";
         else if (cache_flags.gfx12.temporal_hint == gfx12_load_last_use_discard)
            temporal_hint = "th:TH_LOAD_LU";
         else if (cache_flags.gfx12.temporal_hint == gfx12_load_near_non_temporal_far_regular_temporal)
            temporal_hint = "th:TH_LOAD_NT_RT";
         else if (cache_flags.gfx12.temporal_hint == gfx12_load_near_regular_temporal_far_non_temporal)
            temporal_hint = "th:TH_LOAD_RT_NT";
         else if (cache_flags.gfx12.temporal_hint == gfx12_load_near_non_temporal_far_high_temporal)
            temporal_hint = "th:TH_LOAD_NT_HT";

         /* v0-v3 hold the data, v4 the TFE status; zero-init because TFE only
          * writes the status register on failure. */
         snprintf(code, sizeof(code),
                  "v_mov_b32 v0, 0\n"
                  "v_mov_b32 v1, 0\n"
                  "v_mov_b32 v2, 0\n"
                  "v_mov_b32 v3, 0\n"
                  "v_mov_b32 v4, 0\n"
                  "buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe\n"
                  "s_waitcnt vmcnt(0)",
                  temporal_hint, scope);
      } else {
         /* Pre-gfx12: cache policy is expressed as glc/slc/dlc bits. */
         snprintf(code, sizeof(code),
                  "v_mov_b32 v0, 0\n"
                  "v_mov_b32 v1, 0\n"
                  "v_mov_b32 v2, 0\n"
                  "v_mov_b32 v3, 0\n"
                  "v_mov_b32 v4, 0\n"
                  "buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe %s\n"
                  "s_waitcnt vmcnt(0)",
                  cache_flags.value & ac_glc ? "glc" : "",
                  cache_flags.value & ac_slc ? "slc" : "",
                  cache_flags.value & ac_dlc ? "dlc" : "");
      }

      /* Inputs: packed (vindex, voffset) in a VGPR pair, rsrc in SGPRs.
       * Output: v[0:4] = 4 data channels + status dword. */
      LLVMTypeRef param_types[] = {ctx->v2i32, ctx->v4i32};
      LLVMTypeRef calltype = LLVMFunctionType(LLVMVectorType(ctx->f32, 5), param_types, 2, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(calltype, code, "=&{v[0:4]},v,s", false, false);

      LLVMValueRef addr_comp[2] = {vindex ? vindex : ctx->i32_0,
                                   voffset ? voffset : ctx->i32_0};

      LLVMValueRef args[] = {ac_build_gather_values(ctx, addr_comp, 2),
                             LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "")};
      LLVMValueRef res = LLVMBuildCall2(ctx->builder, calltype, inlineasm, args, 2, "");

      /* Return (requested channels..., status). */
      return ac_build_concat(ctx, ac_trim_vector(ctx, res, num_channels),
                             ac_llvm_extract_elem(ctx, res, 4));
   }

   return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
                                      num_channels, d16 ? ctx->f16 : ctx->f32, access,
                                      can_speculate, true);
}
1097
/* Emit a raw/struct typed-buffer (MTBUF) load with an explicit HW data format.
 *
 * \param tbuffer_format  HW dfmt/nfmt encoding passed straight to the intrinsic
 * \param channel_type    LLVM element type of the result
 */
static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, unsigned num_channels,
                                          unsigned tbuffer_format, LLVMTypeRef channel_type,
                                          enum gl_access_qualifier access, bool can_speculate)
{
   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   /* Only the "struct" (indexed) intrinsics take a vindex operand. */
   if (vindex)
      args[idx++] = vindex;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, tbuffer_format, 0);
   args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD), 0);
   const char *indexing_kind = vindex ? "struct" : "raw";
   char name[256], type_name[8];

   LLVMTypeRef type = num_channels > 1 ? LLVMVectorType(channel_type, num_channels) : channel_type;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", indexing_kind, type_name);

   return ac_build_intrinsic(ctx, name, type, args, idx,
                             can_speculate ? AC_ATTR_INVARIANT_LOAD : 0);
}
1124
/* Load a vertex-format value from a buffer, splitting into multiple MTBUF
 * fetches as needed so that each fetch is safe for the known alignment and
 * cannot read out of bounds.
 *
 * \param format            the pipe format describing the data in memory
 * \param channel_bit_size  requested result channel size; 16 triggers a
 *                          load-as-32-bit-then-truncate path
 * \param const_offset      constant byte offset added to base_voffset
 * \param align_offset/align_mul  alignment info of the overall access
 */
LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                        LLVMValueRef vidx, LLVMValueRef base_voffset,
                                        LLVMValueRef soffset,
                                        const enum pipe_format format,
                                        unsigned channel_bit_size,
                                        unsigned const_offset,
                                        unsigned align_offset,
                                        unsigned align_mul,
                                        unsigned num_channels,
                                        enum gl_access_qualifier access,
                                        bool can_speculate)
{
   const struct ac_vtx_format_info *vtx_info = ac_get_vtx_format_info(ctx->gfx_level, ctx->info->family, format);
   const unsigned max_channels = vtx_info->num_channels;
   LLVMValueRef voffset_plus_const =
      LLVMBuildAdd(ctx->builder, base_voffset, LLVMConstInt(ctx->i32, const_offset, 0), "");

   /* Split the specified load into several MTBUF instructions,
    * according to a safe fetch size determined by alignment information.
    */
   LLVMValueRef result = NULL;
   for (unsigned i = 0, fetch_num_channels; i < num_channels; i += fetch_num_channels) {
      /* Packed formats (determined here by chan_byte_size == 0) should never be split. */
      assert(i == 0 || vtx_info->chan_byte_size);

      /* Recompute offset/alignment of this partial fetch. */
      const unsigned fetch_const_offset = const_offset + i * vtx_info->chan_byte_size;
      const unsigned fetch_align_offset = (align_offset + i * vtx_info->chan_byte_size) % align_mul;
      const unsigned fetch_alignment = fetch_align_offset ? 1 << (ffs(fetch_align_offset) - 1) : align_mul;

      fetch_num_channels =
         ac_get_safe_fetch_size(ctx->gfx_level, vtx_info, fetch_const_offset,
                                max_channels - i, fetch_alignment, num_channels - i);
      /* Pick the HW format matching the number of channels actually fetched. */
      const unsigned fetch_format = vtx_info->hw_format[fetch_num_channels - 1];
      LLVMValueRef fetch_voffset =
         LLVMBuildAdd(ctx->builder, voffset_plus_const,
                      LLVMConstInt(ctx->i32, i * vtx_info->chan_byte_size, 0), "");
      LLVMValueRef item =
         ac_build_tbuffer_load(ctx, rsrc, vidx, fetch_voffset, soffset,
                               fetch_num_channels, fetch_format, ctx->i32,
                               access, can_speculate);
      result = ac_build_concat(ctx, result, item);
   }

   /*
    * LLVM is not able to select 16-bit typed loads. Load 32-bit values instead and
    * manually truncate them to the required size.
    * TODO: Do this in NIR instead.
    */
   const struct util_format_description *desc = util_format_description(format);
   bool is_float = !desc->channel[0].pure_integer;

   if (channel_bit_size == 16) {
      LLVMValueRef channels[4];
      for (unsigned i = 0; i < num_channels; i++) {
         LLVMValueRef channel = result;
         if (num_channels > 1)
            channel = LLVMBuildExtractElement(ctx->builder, result, LLVMConstInt(ctx->i32, i, false), "");

         if (is_float) {
            /* Float: proper f32 -> f16 conversion, then reinterpret as i16. */
            channel = LLVMBuildBitCast(ctx->builder, channel, ctx->f32, "");
            channel = LLVMBuildFPTrunc(ctx->builder, channel, ctx->f16, "");
            channel = LLVMBuildBitCast(ctx->builder, channel, ctx->i16, "");
         } else {
            /* Integer: plain bit truncation. */
            channel = LLVMBuildTrunc(ctx->builder, channel, ctx->i16, "");
         }
         channels[i] = channel;
      }
      result = ac_build_gather_values(ctx, channels, num_channels);
   }

   return result;
}
1197
1198
/* Load a single 16-bit value from a raw buffer (no format conversion). */
LLVMValueRef ac_build_buffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                        LLVMValueRef voffset, LLVMValueRef soffset,
                                        enum gl_access_qualifier access)
{
   return ac_build_buffer_load_common(ctx, rsrc, /*vindex=*/NULL, voffset, soffset,
                                      /*num_channels=*/1, ctx->i16, access,
                                      /*can_speculate=*/false, /*use_format=*/false);
}
1206
/* Load a single 8-bit value from a raw buffer (no format conversion). */
LLVMValueRef ac_build_buffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                       LLVMValueRef voffset, LLVMValueRef soffset,
                                       enum gl_access_qualifier access)
{
   return ac_build_buffer_load_common(ctx, rsrc, /*vindex=*/NULL, voffset, soffset,
                                      /*num_channels=*/1, ctx->i8, access,
                                      /*can_speculate=*/false, /*use_format=*/false);
}
1214
/* Store a single 16-bit value to a raw buffer (no format conversion). */
void ac_build_buffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                 LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
                                 enum gl_access_qualifier access)
{
   /* Reinterpret the data as i16 so the right intrinsic overload is chosen. */
   LLVMValueRef as_i16 = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");

   ac_build_buffer_store_common(ctx, rsrc, as_i16, /*vindex=*/NULL, voffset, soffset,
                                access, /*use_format=*/false);
}
1223
/* Store a single 8-bit value to a raw buffer (no format conversion). */
void ac_build_buffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                LLVMValueRef voffset, LLVMValueRef soffset, enum gl_access_qualifier access)
{
   /* Reinterpret the data as i8 so the right intrinsic overload is chosen. */
   LLVMValueRef as_i8 = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");

   ac_build_buffer_store_common(ctx, rsrc, as_i8, /*vindex=*/NULL, voffset, soffset,
                                access, /*use_format=*/false);
}
1231
1232 /**
1233 * Set range metadata on an instruction. This can only be used on load and
1234 * call instructions. If you know an instruction can only produce the values
1235 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1236 * \p lo is the minimum value inclusive.
1237 * \p hi is the maximum value exclusive.
1238 */
void ac_set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,
                           unsigned hi)
{
   LLVMTypeRef type = LLVMTypeOf(value);
   LLVMContextRef context = LLVMGetTypeContext(type);
   /* !range metadata is a pair of constants: [lo, hi). */
   LLVMValueRef md_args[2] = {
      LLVMConstInt(type, lo, false),
      LLVMConstInt(type, hi, false),
   };
   LLVMValueRef range_md = LLVMMDNodeInContext(context, md_args, 2);

   LLVMSetMetadata(value, ctx->range_md_kind, range_md);
}
1251
ac_get_thread_id(struct ac_llvm_context * ctx)1252 LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx)
1253 {
1254 return ac_build_mbcnt(ctx, LLVMConstInt(ctx->iN_wavemask, ~0ull, 0));
1255 }
1256
1257 /*
1258 * AMD GCN implements derivatives using the local data store (LDS)
1259 * All writes to the LDS happen in all executing threads at
1260 * the same time. TID is the Thread ID for the current
1261 * thread and is a value between 0 and 63, representing
1262 * the thread's position in the wavefront.
1263 *
1264 * For the pixel shader threads are grouped into quads of four pixels.
1265 * The TIDs of the pixels of a quad are:
1266 *
1267 * +------+------+
1268 * |4n + 0|4n + 1|
1269 * +------+------+
1270 * |4n + 2|4n + 3|
1271 * +------+------+
1272 *
1273 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
1274 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
1275 * the current pixel's column, and masking with 0xfffffffe yields the TID
1276 * of the left pixel of the current pixel's row.
1277 *
1278 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
1279 * adding 2 yields the TID of the pixel below the top pixel.
1280 */
/* Compute a screen-space derivative of "val" using quad swizzles.
 *
 * \param mask  which lane bits within the quad to keep when selecting the
 *              "top-left" reference lane (see the quad layout comment above)
 * \param idx   lane delta to the "other" lane: 1 = right neighbor (ddx),
 *              2 = lower neighbor (ddy)
 */
LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int idx, LLVMValueRef val)
{
   unsigned tl_lanes[4], trbl_lanes[4];
   char name[32], type[8];
   LLVMValueRef tl, trbl;
   LLVMTypeRef result_type;
   LLVMValueRef result;

   result_type = ac_to_float_type(ctx, LLVMTypeOf(val));

   /* Quad swizzles operate on 32-bit values; widen/reinterpret 16-bit input. */
   if (result_type == ctx->f16)
      val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
   else if (result_type == ctx->v2f16)
      val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, "");

   /* tl = value of the reference lane in each quad, trbl = its right/bottom
    * neighbor (selected by idx). */
   for (unsigned i = 0; i < 4; ++i) {
      tl_lanes[i] = i & mask;
      trbl_lanes[i] = (i & mask) + idx;
   }

   tl = ac_build_quad_swizzle(ctx, val, tl_lanes[0], tl_lanes[1], tl_lanes[2], tl_lanes[3], false);
   trbl =
      ac_build_quad_swizzle(ctx, val, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3], false);

   /* Undo the f16 widening before reinterpreting as float. */
   if (result_type == ctx->f16) {
      tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
      trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
   }

   tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
   trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
   result = LLVMBuildFSub(ctx->builder, trbl, tl, "");

   /* Wrap in llvm.amdgcn.wqm so the derivative is computed in whole-quad mode
    * (helper lanes contribute valid values). */
   ac_build_type_name_for_intr(result_type, type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);

   return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
}
1319
/* Emit s_sendmsg with the given immediate and m0 payload. */
void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t imm, LLVMValueRef m0_content)
{
   LLVMValueRef args[2] = {
      LLVMConstInt(ctx->i32, imm, false),
      m0_content,
   };

   ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
}
1327
/* Signed "find MSB" with NIR/TGSI semantics: bit index counted from the LSB,
 * and -1 for inputs (0 and -1) that have no meaningful sign bit position. */
LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
{
   LLVMValueRef msb =
      ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, 0);

   /* The HW returns the last bit index from MSB, but NIR/TGSI wants
    * the index from LSB. Invert it by doing "31 - msb". */
   msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), msb, "");

   LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
   LLVMValueRef is_zero = LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, ctx->i32_0, "");
   LLVMValueRef is_all_ones = LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, all_ones, "");
   LLVMValueRef no_msb = LLVMBuildOr(ctx->builder, is_zero, is_all_ones, "");

   return LLVMBuildSelect(ctx->builder, no_msb, all_ones, msb, "");
}
1344
/* Unsigned "find MSB" for 8/16/32/64-bit inputs, always returning an i32.
 *
 * \param rev  if true, return the bit index counted from the MSB (raw ctlz);
 *             if false, counted from the LSB (NIR/TGSI convention)
 * Returns -1 (as i32) when the input is zero.
 */
LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type,
                           bool rev)
{
   const char *intrin_name;
   LLVMTypeRef type;
   LLVMValueRef highest_bit;
   LLVMValueRef zero;
   unsigned bitsize;

   /* Pick the llvm.ctlz overload and constants matching the input width. */
   bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
   switch (bitsize) {
   case 64:
      intrin_name = "llvm.ctlz.i64";
      type = ctx->i64;
      highest_bit = LLVMConstInt(ctx->i64, 63, false);
      zero = ctx->i64_0;
      break;
   case 32:
      intrin_name = "llvm.ctlz.i32";
      type = ctx->i32;
      highest_bit = LLVMConstInt(ctx->i32, 31, false);
      zero = ctx->i32_0;
      break;
   case 16:
      intrin_name = "llvm.ctlz.i16";
      type = ctx->i16;
      highest_bit = LLVMConstInt(ctx->i16, 15, false);
      zero = ctx->i16_0;
      break;
   case 8:
      intrin_name = "llvm.ctlz.i8";
      type = ctx->i8;
      highest_bit = LLVMConstInt(ctx->i8, 7, false);
      zero = ctx->i8_0;
      break;
   default:
      unreachable("invalid bitsize");
      break;
   }

   /* Second operand i1true = "result is undefined for zero input"; the zero
    * case is handled explicitly by the final select below. */
   LLVMValueRef params[2] = {
      arg,
      ctx->i1true,
   };

   LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, 0);

   if (!rev) {
      /* The HW returns the last bit index from MSB, but TGSI/NIR wants
       * the index from LSB. Invert it by doing "31 - msb". */
      msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
   }

   /* Normalize the result width to i32. Sign-extension keeps a potential -1
    * intact for the narrow types. */
   if (bitsize == 64) {
      msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
   } else if (bitsize < 32) {
      msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");
   }

   /* check for zero */
   return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
                          LLVMConstInt(ctx->i32, -1, true), msb, "");
}
1408
/* Floating-point minimum via llvm.minnum, overloaded on the type of "a". */
LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMTypeRef t = LLVMTypeOf(a);
   char type[64];
   char name[64];

   ac_build_type_name_for_intr(t, type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.minnum.%s", type);

   LLVMValueRef args[2] = {a, b};
   return ac_build_intrinsic(ctx, name, t, args, 2, 0);
}
1418
/* Floating-point maximum via llvm.maxnum, overloaded on the type of "a". */
LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMTypeRef t = LLVMTypeOf(a);
   char type[64];
   char name[64];

   ac_build_type_name_for_intr(t, type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.maxnum.%s", type);

   LLVMValueRef args[2] = {a, b};
   return ac_build_intrinsic(ctx, name, t, args, 2, 0);
}
1428
/* Signed integer minimum as compare + select. */
LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef a_le_b = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_le_b, a, b, "");
}
1434
/* Signed integer maximum as compare + select. */
LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef a_gt_b = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_gt_b, a, b, "");
}
1440
/* Unsigned integer minimum as compare + select. */
LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef a_le_b = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_le_b, a, b, "");
}
1446
/* Unsigned integer maximum as compare + select. */
LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef a_ge_b = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_ge_b, a, b, "");
}
1452
/* Clamp a float value to [0, 1] (saturate). */
LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMTypeRef t = LLVMTypeOf(value);
   LLVMValueRef zero = LLVMConstReal(t, 0.0);
   LLVMValueRef one = LLVMConstReal(t, 1.0);

   LLVMValueRef clamped_low = ac_build_fmax(ctx, value, zero);
   return ac_build_fmin(ctx, clamped_low, one);
}
1459
/* Emit an export (exp) instruction from the packed ac_export_args description.
 *
 * Two encodings:
 *  - compressed (compr): two v2i16 operands packing 4 16-bit channels; only
 *    valid before GFX11
 *  - uncompressed: four f32 operands
 */
void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
{
   LLVMValueRef args[9];

   args[0] = LLVMConstInt(ctx->i32, a->target, 0);
   args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);

   if (a->compr) {
      /* Compressed exports were removed on GFX11. */
      assert(ctx->gfx_level < GFX11);

      args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->v2i16, "");
      args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->v2i16, "");
      args[4] = LLVMConstInt(ctx->i1, a->done, 0);
      args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);

      ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", ctx->voidt, args, 6, 0);
   } else {
      args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->f32, "");
      args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->f32, "");
      args[4] = LLVMBuildBitCast(ctx->builder, a->out[2], ctx->f32, "");
      args[5] = LLVMBuildBitCast(ctx->builder, a->out[3], ctx->f32, "");
      args[6] = LLVMConstInt(ctx->i1, a->done, 0);
      args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);

      ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", ctx->voidt, args, 8, 0);
   }
}
1487
/* Emit a "null" export that only carries the DONE bit / EXEC mask, used by
 * pixel shaders that export nothing. */
void ac_build_export_null(struct ac_llvm_context *ctx, bool uses_discard)
{
   /* Gfx10+ doesn't need to export anything if we don't need to export the EXEC mask
    * for discard.
    */
   if (ctx->gfx_level >= GFX10 && !uses_discard)
      return;

   struct ac_export_args args = {
      .enabled_channels = 0x0,   /* no channels enabled */
      .valid_mask = 1,           /* the EXEC mask is valid */
      .done = 1,                 /* DONE bit */
      /* Gfx11 doesn't support null exports, and mrt0 should be exported instead. */
      .target = ctx->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL,
      .compr = 0,                /* COMPR flag (0 = 32-bit export) */
   };

   /* Channel data is irrelevant with enabled_channels == 0. */
   args.out[0] = LLVMGetUndef(ctx->f32); /* R */
   args.out[1] = LLVMGetUndef(ctx->f32); /* G */
   args.out[2] = LLVMGetUndef(ctx->f32); /* B */
   args.out[3] = LLVMGetUndef(ctx->f32); /* A */

   ac_build_export(ctx, &args);
}
1511
ac_num_coords(enum ac_image_dim dim)1512 static unsigned ac_num_coords(enum ac_image_dim dim)
1513 {
1514 switch (dim) {
1515 case ac_image_1d:
1516 return 1;
1517 case ac_image_2d:
1518 case ac_image_1darray:
1519 return 2;
1520 case ac_image_3d:
1521 case ac_image_cube:
1522 case ac_image_2darray:
1523 case ac_image_2dmsaa:
1524 return 3;
1525 case ac_image_2darraymsaa:
1526 return 4;
1527 default:
1528 unreachable("ac_num_coords: bad dim");
1529 }
1530 }
1531
ac_num_derivs(enum ac_image_dim dim)1532 static unsigned ac_num_derivs(enum ac_image_dim dim)
1533 {
1534 switch (dim) {
1535 case ac_image_1d:
1536 case ac_image_1darray:
1537 return 2;
1538 case ac_image_2d:
1539 case ac_image_2darray:
1540 case ac_image_cube:
1541 return 4;
1542 case ac_image_3d:
1543 return 6;
1544 case ac_image_2dmsaa:
1545 case ac_image_2darraymsaa:
1546 default:
1547 unreachable("derivatives not supported");
1548 }
1549 }
1550
get_atomic_name(enum ac_atomic_op op)1551 static const char *get_atomic_name(enum ac_atomic_op op)
1552 {
1553 switch (op) {
1554 case ac_atomic_swap:
1555 return "swap";
1556 case ac_atomic_add:
1557 return "add";
1558 case ac_atomic_sub:
1559 return "sub";
1560 case ac_atomic_smin:
1561 return "smin";
1562 case ac_atomic_umin:
1563 return "umin";
1564 case ac_atomic_smax:
1565 return "smax";
1566 case ac_atomic_umax:
1567 return "umax";
1568 case ac_atomic_and:
1569 return "and";
1570 case ac_atomic_or:
1571 return "or";
1572 case ac_atomic_xor:
1573 return "xor";
1574 case ac_atomic_inc_wrap:
1575 return "inc";
1576 case ac_atomic_dec_wrap:
1577 return "dec";
1578 case ac_atomic_fmin:
1579 return "fmin";
1580 case ac_atomic_fmax:
1581 return "fmax";
1582 }
1583 unreachable("bad atomic op");
1584 }
1585
/* Build one llvm.amdgcn.image.* intrinsic call (sample / gather4 / load /
 * store / atomic / getlod / getresinfo) from the descriptor in "a".
 *
 * The intrinsic name, the overload suffixes and the argument list are all
 * assembled here, so the order of the args[] appends below must match the
 * AMDGPU intrinsic definitions exactly.
 *
 * Returns the intrinsic result (void for stores). With a->tfe set, the raw
 * {texel, i32 code} struct result is flattened into one vector.
 */
LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a)
{
   const char *overload[3] = {"", "", ""};
   unsigned num_overloads = 0;
   LLVMValueRef args[18];
   unsigned num_args = 0;
   enum ac_image_dim dim = a->dim;

   /* Validate mutually-exclusive and opcode-specific argument combinations. */
   assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || !a->level_zero);
   assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
           a->opcode != ac_image_store_mip) ||
          a->lod);
   assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
          (!a->compare && !a->offset));
   assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
           a->opcode == ac_image_get_lod) ||
          !a->bias);
   assert((a->bias ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <=
          1);
   assert((a->min_lod ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) <= 1);
   assert(!a->d16 || (ctx->gfx_level >= GFX8 && a->opcode != ac_image_atomic &&
                      a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod &&
                      a->opcode != ac_image_get_resinfo));
   assert(!a->a16 || ctx->gfx_level >= GFX9);
   assert(!a->derivs[0] || a->g16 == a->a16 || ctx->gfx_level >= GFX10);

   /* Validate operand bit widths against the a16/g16 flags. */
   assert(!a->offset ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->offset)) == 32);
   assert(!a->bias ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == (a->a16 ? 16 : 32));
   assert(!a->compare ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->compare)) == 32);
   assert(!a->derivs[0] ||
          ((!a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 16) &&
           (a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 32)));
   assert(!a->coords[0] ||
          ((!a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 16) &&
           (a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 32)));
   assert(!a->lod ||
          ((a->opcode != ac_image_get_resinfo || ac_get_elem_bits(ctx, LLVMTypeOf(a->lod))) &&
           (a->opcode == ac_image_get_resinfo ||
            ac_get_elem_bits(ctx, LLVMTypeOf(a->lod)) ==
               ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])))));
   assert(!a->min_lod ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->min_lod)) ==
             ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])));

   /* getlod doesn't consume the array layer / cube face, so lower the
    * dimension to the corresponding non-array variant.
    */
   if (a->opcode == ac_image_get_lod) {
      switch (dim) {
      case ac_image_1darray:
         dim = ac_image_1d;
         break;
      case ac_image_2darray:
      case ac_image_cube:
         dim = ac_image_2d;
         break;
      default:
         break;
      }
   }

   bool sample = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
                 a->opcode == ac_image_get_lod;
   bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap;
   bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
               a->opcode == ac_image_load || a->opcode == ac_image_load_mip;
   /* Sampling ops take float coords, others integer; width follows a16. */
   LLVMTypeRef coord_type = sample ? (a->a16 ? ctx->f16 : ctx->f32) : (a->a16 ? ctx->i16 : ctx->i32);
   uint8_t dmask = a->dmask;
   LLVMTypeRef data_type;
   char data_type_str[32];

   if (atomic) {
      data_type = LLVMTypeOf(a->data[0]);
   } else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      /* Image stores might have been shrunk using the format. */
      data_type = LLVMTypeOf(a->data[0]);
      dmask = (1 << ac_get_llvm_num_components(a->data[0])) - 1;
   } else {
      data_type = a->d16 ? ctx->v4f16 : ctx->v4f32;
   }

   /* TFE returns {texel, i32 error/status code}. */
   if (a->tfe) {
      data_type = LLVMStructTypeInContext(
         ctx->context, (LLVMTypeRef[]){data_type, ctx->i32}, 2, false);
   }

   /* Argument order below mirrors the intrinsic signature: data, dmask,
    * offset, bias, compare, derivatives, coords, lod, min_lod, resource,
    * sampler+unorm, texfailctrl, cachepolicy.
    */
   if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      args[num_args++] = a->data[0];
      if (a->opcode == ac_image_atomic_cmpswap)
         args[num_args++] = a->data[1];
   }

   if (!atomic)
      args[num_args++] = LLVMConstInt(ctx->i32, dmask, false);

   if (a->offset)
      args[num_args++] = ac_to_integer(ctx, a->offset);
   if (a->bias) {
      args[num_args++] = ac_to_float(ctx, a->bias);
      overload[num_overloads++] = ".f32";
   }
   if (a->compare)
      args[num_args++] = ac_to_float(ctx, a->compare);
   if (a->derivs[0]) {
      unsigned count = ac_num_derivs(dim);
      for (unsigned i = 0; i < count; ++i)
         args[num_args++] = ac_to_float(ctx, a->derivs[i]);
      overload[num_overloads++] = a->g16 ? ".f16" : ".f32";
   }
   /* getresinfo takes no coordinates (the lod below selects the level). */
   unsigned num_coords = a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
   for (unsigned i = 0; i < num_coords; ++i)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
   if (a->lod)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
   if (a->min_lod)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, "");

   /* Overload for the coordinate type. */
   overload[num_overloads++] = sample ? (a->a16 ? ".f16" : ".f32") : (a->a16 ? ".i16" : ".i32");

   args[num_args++] = a->resource;
   if (sample) {
      args[num_args++] = a->sampler;
      args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
   }

   args[num_args++] = a->tfe ? ctx->i32_1 : ctx->i32_0; /* texfailctrl */
   args[num_args++] = LLVMConstInt(
      ctx->i32, get_cache_flags(ctx,
                                a->access |
                                (atomic ? ACCESS_TYPE_ATOMIC :
                                 load ? ACCESS_TYPE_LOAD : ACCESS_TYPE_STORE)),
      false);

   /* Base intrinsic name for the opcode. */
   const char *name;
   const char *atomic_subop = "";
   switch (a->opcode) {
   case ac_image_sample:
      name = "sample";
      break;
   case ac_image_gather4:
      name = "gather4";
      break;
   case ac_image_load:
      name = "load";
      break;
   case ac_image_load_mip:
      name = "load.mip";
      break;
   case ac_image_store:
      name = "store";
      break;
   case ac_image_store_mip:
      name = "store.mip";
      break;
   case ac_image_atomic:
      name = "atomic.";
      atomic_subop = get_atomic_name(a->atomic);
      break;
   case ac_image_atomic_cmpswap:
      name = "atomic.";
      atomic_subop = "cmpswap";
      break;
   case ac_image_get_lod:
      name = "getlod";
      break;
   case ac_image_get_resinfo:
      name = "getresinfo";
      break;
   default:
      unreachable("invalid image opcode");
   }

   /* Dimension suffix for the intrinsic name. */
   const char *dimname;
   switch (dim) {
   case ac_image_1d:
      dimname = "1d";
      break;
   case ac_image_2d:
      dimname = "2d";
      break;
   case ac_image_3d:
      dimname = "3d";
      break;
   case ac_image_cube:
      dimname = "cube";
      break;
   case ac_image_1darray:
      dimname = "1darray";
      break;
   case ac_image_2darray:
      dimname = "2darray";
      break;
   case ac_image_2dmsaa:
      dimname = "2dmsaa";
      break;
   case ac_image_2darraymsaa:
      dimname = "2darraymsaa";
      break;
   default:
      unreachable("invalid dim");
   }

   ac_build_type_name_for_intr(data_type, data_type_str, sizeof(data_type_str));

   /* .l is only valid on sample/gather; other opcodes encode the lod as a
    * plain operand (e.g. load.mip, getresinfo).
    */
   bool lod_suffix = a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
   char intr_name[96];
   snprintf(intr_name, sizeof(intr_name),
            "llvm.amdgcn.image.%s%s" /* base name */
            "%s%s%s%s" /* sample/gather modifiers */
            ".%s.%s%s%s%s", /* dimension and type overloads */
            name, atomic_subop, a->compare ? ".c" : "",
            a->bias ? ".b" : lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "",
            a->min_lod ? ".cl" : "", a->offset ? ".o" : "", dimname,
            data_type_str, overload[0], overload[1], overload[2]);

   LLVMTypeRef retty;
   if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
      retty = ctx->voidt;
   else
      retty = data_type;

   LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes);
   if (a->tfe) {
      /* Flatten the {texel, code} struct into one vector for the caller. */
      LLVMValueRef texel = LLVMBuildExtractValue(ctx->builder, result, 0, "");
      LLVMValueRef code = LLVMBuildExtractValue(ctx->builder, result, 1, "");
      result = ac_build_concat(ctx, texel, ac_to_float(ctx, code));
   }

   if (!sample && !atomic && retty != ctx->voidt)
      result = ac_to_integer(ctx, result);

   return result;
}
1819
/* Pack two f32 values into a v2f16 using round-toward-zero (v_cvt_pkrtz_f16). */
LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef packed =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16, args, 2, 0);
   return packed;
}
1824
/* Pack two f32 values into snorm16x2, returned as an i32. */
LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef packed =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", ctx->v2i16, args, 2, 0);
   return LLVMBuildBitCast(ctx->builder, packed, ctx->i32, "");
}
1830
/* Pack two f32 values into unorm16x2, returned as an i32. */
LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef packed =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", ctx->v2i16, args, 2, 0);
   return LLVMBuildBitCast(ctx->builder, packed, ctx->i32, "");
}
1836
/* Pack two f16 values into snorm16x2 via inline assembly (no LLVM intrinsic
 * exists for this instruction; GFX11 renamed it).
 */
LLVMValueRef ac_build_cvt_pknorm_i16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   const char *asm_str = ctx->gfx_level >= GFX11 ?
      "v_cvt_pk_norm_i16_f16 $0, $1, $2" :
      "v_cvt_pknorm_i16_f16 $0, $1, $2";
   LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};
   LLVMTypeRef fn_type = LLVMFunctionType(ctx->i32, param_types, 2, false);
   LLVMValueRef code = LLVMConstInlineAsm(fn_type, asm_str, "=v,v,v", false, false);
   return LLVMBuildCall2(ctx->builder, fn_type, code, args, 2, "");
}
1849
/* Pack two f16 values into unorm16x2 via inline assembly (no LLVM intrinsic
 * exists for this instruction; GFX11 renamed it).
 */
LLVMValueRef ac_build_cvt_pknorm_u16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   const char *asm_str = ctx->gfx_level >= GFX11 ?
      "v_cvt_pk_norm_u16_f16 $0, $1, $2" :
      "v_cvt_pknorm_u16_f16 $0, $1, $2";
   LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};
   LLVMTypeRef fn_type = LLVMFunctionType(ctx->i32, param_types, 2, false);
   LLVMValueRef code = LLVMConstInlineAsm(fn_type, asm_str, "=v,v,v", false, false);
   return LLVMBuildCall2(ctx->builder, fn_type, code, args, 2, "");
}
1862
1863 /* The 8-bit and 10-bit clamping is for HW workarounds. */
ac_build_cvt_pk_i16(struct ac_llvm_context * ctx,LLVMValueRef args[2],unsigned bits,bool hi)1864 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
1865 bool hi)
1866 {
1867 assert(bits == 8 || bits == 10 || bits == 16);
1868
1869 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
1870 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
1871 LLVMValueRef max_alpha = bits != 10 ? max_rgb : ctx->i32_1;
1872 LLVMValueRef min_alpha = bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
1873
1874 /* Clamp. */
1875 if (bits != 16) {
1876 for (int i = 0; i < 2; i++) {
1877 bool alpha = hi && i == 1;
1878 args[i] = ac_build_imin(ctx, args[i], alpha ? max_alpha : max_rgb);
1879 args[i] = ac_build_imax(ctx, args[i], alpha ? min_alpha : min_rgb);
1880 }
1881 }
1882
1883 LLVMValueRef res =
1884 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", ctx->v2i16, args, 2, 0);
1885 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
1886 }
1887
1888 /* The 8-bit and 10-bit clamping is for HW workarounds. */
ac_build_cvt_pk_u16(struct ac_llvm_context * ctx,LLVMValueRef args[2],unsigned bits,bool hi)1889 LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
1890 bool hi)
1891 {
1892 assert(bits == 8 || bits == 10 || bits == 16);
1893
1894 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
1895 LLVMValueRef max_alpha = bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
1896
1897 /* Clamp. */
1898 if (bits != 16) {
1899 for (int i = 0; i < 2; i++) {
1900 bool alpha = hi && i == 1;
1901 args[i] = ac_build_umin(ctx, args[i], alpha ? max_alpha : max_rgb);
1902 }
1903 }
1904
1905 LLVMValueRef res =
1906 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", ctx->v2i16, args, 2, 0);
1907 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
1908 }
1909
/* Whole-quad-mode vote: true if the condition holds in any lane of the quad. */
LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   LLVMValueRef cond = i1;
   return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, &cond, 1, 0);
}
1914
/* Kill (discard) the invocation when the i1 condition is false. */
void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   LLVMValueRef cond = i1;
   ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, &cond, 1, 0);
}
1919
/* Bitfield extract: take "width" bits of "input" starting at "offset",
 * sign- or zero-extended according to is_signed.
 */
LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset,
                          LLVMValueRef width, bool is_signed)
{
   const char *intr_name = is_signed ? "llvm.amdgcn.sbfe.i32" : "llvm.amdgcn.ubfe.i32";
   LLVMValueRef operands[3] = {input, offset, width};

   return ac_build_intrinsic(ctx, intr_name, ctx->i32, operands, 3, 0);
}
1932
/* Integer multiply-add: s0 * s1 + s2. */
LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   LLVMValueRef product = LLVMBuildMul(ctx->builder, s0, s1, "");
   return LLVMBuildAdd(ctx->builder, product, s2, "");
}
1938
/* Float multiply-add: s0 * s1 + s2. */
LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   /* Pre-GFX10 chips have MUL-ADD units, so emit the separate ops there. */
   if (ctx->gfx_level < GFX10) {
      LLVMValueRef product = LLVMBuildFMul(ctx->builder, s0, s1, "");
      return LLVMBuildFAdd(ctx->builder, product, s2, "");
   }

   /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
   LLVMValueRef operands[] = {s0, s1, s2};
   return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, operands, 3, 0);
}
1948
/* Emit wait instructions for the given AC_WAIT_* flags.
 *
 * GFX12 has one intrinsic per counter; older chips encode all counters into a
 * single s_waitcnt immediate, whose bit layout differs between generations.
 * A counter value of 0 means "wait until this counter drains"; the maximum
 * value means "don't wait on this counter".
 */
void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
{
   if (!wait_flags)
      return;

   if (ctx->gfx_level >= GFX12) {
      /* GFX12: dedicated intrinsics, one per counter; 0 = fully drained. */
      if (wait_flags & AC_WAIT_DS)
         ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.dscnt", ctx->voidt, &ctx->i16_0, 1, 0);
      if (wait_flags & AC_WAIT_KM)
         ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.kmcnt", ctx->voidt, &ctx->i16_0, 1, 0);
      if (wait_flags & AC_WAIT_EXP)
         ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.expcnt", ctx->voidt, &ctx->i16_0, 1, 0);
      if (wait_flags & AC_WAIT_LOAD)
         ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.loadcnt", ctx->voidt, &ctx->i16_0, 1, 0);
      if (wait_flags & AC_WAIT_STORE)
         ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.storecnt", ctx->voidt, &ctx->i16_0, 1, 0);
      if (wait_flags & AC_WAIT_SAMPLE)
         ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.samplecnt", ctx->voidt, &ctx->i16_0, 1, 0);
      if (wait_flags & AC_WAIT_BVH)
         ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.bvhcnt", ctx->voidt, &ctx->i16_0, 1, 0);
   } else {
      /* Pre-GFX12: start with "don't wait" (max) values for each counter. */
      unsigned expcnt = 7;
      unsigned lgkmcnt = 63;
      unsigned vmcnt = ctx->gfx_level >= GFX9 ? 63 : 15;
      unsigned vscnt = 63;

      if (wait_flags & AC_WAIT_EXP)
         expcnt = 0;
      if (wait_flags & (AC_WAIT_DS | AC_WAIT_KM))
         lgkmcnt = 0;
      if (wait_flags & (AC_WAIT_LOAD | AC_WAIT_SAMPLE | AC_WAIT_BVH))
         vmcnt = 0;

      /* GFX10+ tracks VMEM stores in a separate vscnt counter. */
      if (wait_flags & AC_WAIT_STORE) {
         if (ctx->gfx_level >= GFX10)
            vscnt = 0;
         else
            vmcnt = 0;
      }

      /* There is no intrinsic for vscnt(0), so use a fence. It waits for everything except expcnt. */
      if (vscnt == 0) {
         assert(!(wait_flags & AC_WAIT_EXP));
         LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
         return;
      }

      /* Pack the counters into the s_waitcnt immediate; the field layout
       * changed on GFX11 (vmcnt was also split across two fields before).
       */
      unsigned simm16;

      if (ctx->gfx_level >= GFX11)
         simm16 = expcnt | (lgkmcnt << 4) | (vmcnt << 10);
      else
         simm16 = (lgkmcnt << 8) | (expcnt << 4) | (vmcnt & 0xf) | ((vmcnt >> 4) << 14);

      LLVMValueRef args[1] = {
         LLVMConstInt(ctx->i32, simm16, false),
      };
      ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
   }
}
2009
/* Saturate: clamp "src" of the given float type to [0, 1].
 *
 * Uses v_med3 (fmed3 intrinsic) where available; falls back to fmin/fmax for
 * types LLVM has no fmed3 intrinsic for. On pre-GFX9 the result is
 * canonicalized because those chips don't flush denorms here.
 *
 * Fixes vs. the previous version: the inner LLVMTypeRef shadowed the "type"
 * parameter (renamed to med3_type), and the intrinsic-name pointer is now
 * const-correct for the string literals it holds.
 */
LLVMValueRef ac_build_fsat(struct ac_llvm_context *ctx, LLVMValueRef src,
                           LLVMTypeRef type)
{
   unsigned bitsize = ac_get_elem_bits(ctx, type);
   LLVMValueRef zero = LLVMConstReal(type, 0.0);
   LLVMValueRef one = LLVMConstReal(type, 1.0);
   LLVMValueRef result;

   if (bitsize == 64 || (bitsize == 16 && ctx->gfx_level <= GFX8) || type == ctx->v2f16) {
      /* Use fmin/fmax for 64-bit fsat or 16-bit on GFX6-GFX8 because LLVM
       * doesn't expose an intrinsic.
       */
      result = ac_build_fmin(ctx, ac_build_fmax(ctx, src, zero), one);
   } else {
      LLVMTypeRef med3_type;
      const char *intr;

      if (bitsize == 16) {
         intr = "llvm.amdgcn.fmed3.f16";
         med3_type = ctx->f16;
      } else {
         assert(bitsize == 32);
         intr = "llvm.amdgcn.fmed3.f32";
         med3_type = ctx->f32;
      }

      /* med3(0, 1, x) == clamp(x, 0, 1) for non-NaN x. */
      LLVMValueRef params[] = {
         zero,
         one,
         src,
      };

      result = ac_build_intrinsic(ctx, intr, med3_type, params, 3, 0);
   }

   if (ctx->gfx_level < GFX9 && bitsize == 32) {
      /* Only pre-GFX9 chips do not flush denorms. */
      result = ac_build_canonicalize(ctx, result, bitsize);
   }

   return result;
}
2052
/* Fractional part of src0 (v_fract) for f16/f32/f64.
 *
 * Fixes vs. the previous version: the intrinsic-name pointer is const-correct
 * for the string literals it holds, and the implicit "anything else is f64"
 * fall-through is now guarded by an assert so an invalid bitsize is caught in
 * debug builds instead of silently emitting the f64 intrinsic.
 */
LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.amdgcn.fract.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.fract.f32";
      type = ctx->f32;
   } else {
      assert(bitsize == 64);
      intr = "llvm.amdgcn.fract.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
}
2074
ac_const_uint_vec(struct ac_llvm_context * ctx,LLVMTypeRef type,uint64_t value)2075 LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value)
2076 {
2077
2078 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
2079 LLVMValueRef scalar = LLVMConstInt(LLVMGetElementType(type), value, 0);
2080 unsigned vec_size = LLVMGetVectorSize(type);
2081 LLVMValueRef *scalars = alloca(vec_size * sizeof(LLVMValueRef));
2082
2083 for (unsigned i = 0; i < vec_size; i++)
2084 scalars[i] = scalar;
2085 return LLVMConstVector(scalars, vec_size);
2086 }
2087 return LLVMConstInt(type, value, 0);
2088 }
2089
/* Integer sign: clamp src0 to [-1, 1], yielding -1, 0 or 1. */
LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   LLVMTypeRef type = LLVMTypeOf(src0);

   /* Keep imax first: v_med3 is selected only when max is first. (LLVM bug?) */
   LLVMValueRef clamped = ac_build_imax(ctx, src0, ac_const_uint_vec(ctx, type, -1));
   return ac_build_imin(ctx, clamped, ac_const_uint_vec(ctx, type, 1));
}
2099
/* Turn -0.0 into +0.0 by adding +0.0 with signed zeros honored. */
static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val)
{
   LLVMValueRef zero = LLVMConstNull(LLVMTypeOf(val));

   /* (val + 0) converts negative zero to positive zero, but only while the
    * builder isn't allowed to fold signed zeros away.
    */
   ac_enable_signed_zeros(ctx);
   LLVMValueRef result = LLVMBuildFAdd(ctx->builder, val, zero, "");
   ac_disable_signed_zeros(ctx);
   return result;
}
2108
/* Float sign: return -1.0, 0.0 or 1.0 with the type of "src".
 * f16/f32 use an integer med3-based sequence; f64 builds the result's bit
 * pattern from two compares.
 */
LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   LLVMValueRef pos, neg, dw[2], val;
   unsigned bitsize = ac_get_elem_bits(ctx, type);

   /* The standard version leads to this:
    *   v_cmp_ngt_f32_e64 s[0:1], s4, 0                       ; D40B0000 00010004
    *   v_cndmask_b32_e64 v4, 1.0, s4, s[0:1]                 ; D5010004 000008F2
    *   v_cmp_le_f32_e32 vcc, 0, v4                           ; 7C060880
    *   v_cndmask_b32_e32 v4, -1.0, v4, vcc                   ; 020808F3
    *
    * The isign version:
    *   v_add_f32_e64 v4, s4, 0                               ; D5030004 00010004
    *   v_med3_i32 v4, v4, -1, 1                              ; D5580004 02058304
    *   v_cvt_f32_i32_e32 v4, v4                              ; 7E080B04
    *
    * (src0 + 0) converts negative zero to positive zero.
    * After that, int(fsign(x)) == isign(floatBitsToInt(x)).
    *
    * For FP64, use the standard version, which doesn't suffer from the huge DP rate
    * reduction. (FP64 comparisons are as fast as int64 comparisons)
    */
   if (bitsize == 16 || bitsize == 32) {
      val = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src));
      val = ac_build_isign(ctx, val);
      return LLVMBuildSIToFP(ctx->builder, val, type, "");
   }

   assert(bitsize == 64);
   pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");
   neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");
   /* Assemble the f64 from two dwords: low dword 0, high dword the exponent/
    * sign bits of +1.0 (0x3FF00000), -1.0 (0xBFF00000) or 0.
    */
   dw[0] = ctx->i32_0;
   dw[1] = LLVMBuildSelect(
      ctx->builder, pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0),
      LLVMBuildSelect(ctx->builder, neg, LLVMConstInt(ctx->i32, 0xBFF00000, 0), ctx->i32_0, ""),
      "");
   return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dw, 2), ctx->f64, "");
}
2148
/* Population count of src0 (8/16/32/64/128-bit), result returned as i32. */
LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intr_name;
   LLVMTypeRef type;

   switch (bitsize) {
   case 128:
      intr_name = "llvm.ctpop.i128";
      type = ctx->i128;
      break;
   case 64:
      intr_name = "llvm.ctpop.i64";
      type = ctx->i64;
      break;
   case 32:
      intr_name = "llvm.ctpop.i32";
      type = ctx->i32;
      break;
   case 16:
      intr_name = "llvm.ctpop.i16";
      type = ctx->i16;
      break;
   case 8:
      intr_name = "llvm.ctpop.i8";
      type = ctx->i8;
      break;
   default:
      unreachable("invalid bitsize");
   }

   LLVMValueRef count =
      ac_build_intrinsic(ctx, intr_name, type, (LLVMValueRef[]){src0}, 1, 0);

   /* Normalize the result width to i32. */
   if (bitsize > 32)
      count = LLVMBuildTrunc(ctx->builder, count, ctx->i32, "");
   else if (bitsize < 32)
      count = LLVMBuildZExt(ctx->builder, count, ctx->i32, "");

   return count;
}
2183
/* Reverse the bits of src0 (8/16/32/64-bit), result returned as i32. */
LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intr_name;
   LLVMTypeRef type;

   switch (bitsize) {
   case 64:
      intr_name = "llvm.bitreverse.i64";
      type = ctx->i64;
      break;
   case 32:
      intr_name = "llvm.bitreverse.i32";
      type = ctx->i32;
      break;
   case 16:
      intr_name = "llvm.bitreverse.i16";
      type = ctx->i16;
      break;
   case 8:
      intr_name = "llvm.bitreverse.i8";
      type = ctx->i8;
      break;
   default:
      unreachable("invalid bitsize");
   }

   LLVMValueRef reversed =
      ac_build_intrinsic(ctx, intr_name, type, (LLVMValueRef[]){src0}, 1, 0);

   /* Normalize the result width to i32. */
   if (bitsize > 32)
      reversed = LLVMBuildTrunc(ctx->builder, reversed, ctx->i32, "");
   else if (bitsize < 32)
      reversed = LLVMBuildZExt(ctx->builder, reversed, ctx->i32, "");

   return reversed;
}
2214
/* Mixed-sign 4x8-bit dot product (v_dot4): s0 . s1 + s2, with optional
 * per-operand negation (neg_lo bits 0/1) and clamping.
 */
LLVMValueRef ac_build_sudot_4x8(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                                LLVMValueRef s2, bool clamp, unsigned neg_lo)
{
   LLVMValueRef src[6] = {
      LLVMConstInt(ctx->i1, (neg_lo & 0x1) != 0, false),
      s0,
      LLVMConstInt(ctx->i1, (neg_lo & 0x2) != 0, false),
      s1,
      s2,
      LLVMConstInt(ctx->i1, clamp, false),
   };

   return ac_build_intrinsic(ctx, "llvm.amdgcn.sudot4", ctx->i32, src, 6, 0);
}
2230
ac_init_exec_full_mask(struct ac_llvm_context * ctx)2231 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
2232 {
2233 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
2234 ac_build_intrinsic(ctx, "llvm.amdgcn.init.exec", ctx->voidt, &full_mask, 1, 0);
2235 }
2236
ac_declare_lds_as_pointer(struct ac_llvm_context * ctx)2237 void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
2238 {
2239 unsigned lds_size = ctx->gfx_level >= GFX7 ? 65536 : 32768;
2240 LLVMTypeRef type = LLVMArrayType(ctx->i32, lds_size / 4);
2241 ctx->lds = (struct ac_llvm_pointer) {
2242 .value = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
2243 LLVMPointerType(type, AC_ADDR_SPACE_LDS), "lds"),
2244 .pointee_type = type
2245 };
2246 }
2247
/* Find the least significant set bit of src0 (8/16/32/64-bit), returned as
 * i32, with the GLSL convention findLSB(0) == -1.
 */
LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0)
{
   unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intrin_name;
   LLVMTypeRef type;
   LLVMValueRef zero;

   switch (src0_bitsize) {
   case 64:
      intrin_name = "llvm.cttz.i64";
      type = ctx->i64;
      zero = ctx->i64_0;
      break;
   case 32:
      intrin_name = "llvm.cttz.i32";
      type = ctx->i32;
      zero = ctx->i32_0;
      break;
   case 16:
      intrin_name = "llvm.cttz.i16";
      type = ctx->i16;
      zero = ctx->i16_0;
      break;
   case 8:
      intrin_name = "llvm.cttz.i8";
      type = ctx->i8;
      zero = ctx->i8_0;
      break;
   default:
      unreachable("invalid bitsize");
   }

   /* The i1 "true" flag means that cttz(x=0) = undef, so LLVM won't
    * add special code to check for x=0. The reason is that
    * the LLVM behavior for x=0 is different from what we
    * need here. However, LLVM also assumes that ffs(x) is
    * in [0, 31], but GLSL expects that ffs(0) = -1, so
    * a conditional assignment to handle 0 is still required.
    *
    * The hardware already implements the correct behavior.
    */
   LLVMValueRef params[2] = {src0, ctx->i1true};
   LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, 0);

   /* Normalize the result width to i32. */
   if (src0_bitsize == 64)
      lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
   else if (src0_bitsize < 32)
      lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");

   /* TODO: We need an intrinsic to skip this conditional. */
   /* Map the zero input to -1. */
   LLVMValueRef is_zero = LLVMBuildICmp(ctx->builder, LLVMIntEQ, src0, zero, "");
   return LLVMBuildSelect(ctx->builder, is_zero,
                          LLVMConstInt(ctx->i32, -1, 0), lsb, "");
}
2308
ac_arg_type_to_pointee_type(struct ac_llvm_context * ctx,enum ac_arg_type type)2309 LLVMTypeRef ac_arg_type_to_pointee_type(struct ac_llvm_context *ctx, enum ac_arg_type type) {
2310 switch (type) {
2311 case AC_ARG_CONST_PTR:
2312 return ctx->i8;
2313 break;
2314 case AC_ARG_CONST_FLOAT_PTR:
2315 return ctx->f32;
2316 break;
2317 case AC_ARG_CONST_PTR_PTR:
2318 return ac_array_in_const32_addr_space(ctx->i8);
2319 break;
2320 case AC_ARG_CONST_DESC_PTR:
2321 return ctx->v4i32;
2322 break;
2323 case AC_ARG_CONST_IMAGE_PTR:
2324 return ctx->v8i32;
2325 default:
2326 /* Other ac_arg_type values aren't pointers. */
2327 assert(false);
2328 return NULL;
2329 }
2330 }
2331
/* Pointer to elem_type in the (64-bit) constant address space. */
LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
{
   LLVMTypeRef ptr_type = LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);
   return ptr_type;
}
2336
/* Pointer to elem_type in the 32-bit constant address space. */
LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
{
   LLVMTypeRef ptr_type = LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);
   return ptr_type;
}
2341
get_current_flow(struct ac_llvm_context * ctx)2342 static struct ac_llvm_flow *get_current_flow(struct ac_llvm_context *ctx)
2343 {
2344 if (ctx->flow->depth > 0)
2345 return &ctx->flow->stack[ctx->flow->depth - 1];
2346 return NULL;
2347 }
2348
get_innermost_loop(struct ac_llvm_context * ctx)2349 static struct ac_llvm_flow *get_innermost_loop(struct ac_llvm_context *ctx)
2350 {
2351 for (unsigned i = ctx->flow->depth; i > 0; --i) {
2352 if (ctx->flow->stack[i - 1].loop_entry_block)
2353 return &ctx->flow->stack[i - 1];
2354 }
2355 return NULL;
2356 }
2357
push_flow(struct ac_llvm_context * ctx)2358 static struct ac_llvm_flow *push_flow(struct ac_llvm_context *ctx)
2359 {
2360 struct ac_llvm_flow *flow;
2361
2362 if (ctx->flow->depth >= ctx->flow->depth_max) {
2363 unsigned new_max = MAX2(ctx->flow->depth << 1, AC_LLVM_INITIAL_CF_DEPTH);
2364
2365 ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack));
2366 ctx->flow->depth_max = new_max;
2367 }
2368
2369 flow = &ctx->flow->stack[ctx->flow->depth];
2370 ctx->flow->depth++;
2371
2372 flow->next_block = NULL;
2373 flow->loop_entry_block = NULL;
2374 return flow;
2375 }
2376
/* Name a basic block "<base><label_id>", e.g. "loop3" or "endif7". */
static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, int label_id)
{
   char name[32];

   snprintf(name, sizeof(name), "%s%d", base, label_id);
   LLVMSetValueName(LLVMBasicBlockAsValue(bb), name);
}
2383
2384 /* Append a basic block at the level of the parent flow.
2385 */
append_basic_block(struct ac_llvm_context * ctx,const char * name)2386 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, const char *name)
2387 {
2388 assert(ctx->flow->depth >= 1);
2389
2390 if (ctx->flow->depth >= 2) {
2391 struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2];
2392
2393 return LLVMInsertBasicBlockInContext(ctx->context, flow->next_block, name);
2394 }
2395
2396 LLVMValueRef main_fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
2397 return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
2398 }
2399
2400 /* Emit a branch to the given default target for the current block if
2401 * applicable -- that is, if the current block does not already contain a
2402 * branch from a break or continue.
2403 */
emit_default_branch(LLVMBuilderRef builder,LLVMBasicBlockRef target)2404 static void emit_default_branch(LLVMBuilderRef builder, LLVMBasicBlockRef target)
2405 {
2406 if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
2407 LLVMBuildBr(builder, target);
2408 }
2409
ac_build_bgnloop(struct ac_llvm_context * ctx,int label_id)2410 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
2411 {
2412 struct ac_llvm_flow *flow = push_flow(ctx);
2413 flow->loop_entry_block = append_basic_block(ctx, "LOOP");
2414 flow->next_block = append_basic_block(ctx, "ENDLOOP");
2415 set_basicblock_name(flow->loop_entry_block, "loop", label_id);
2416 LLVMBuildBr(ctx->builder, flow->loop_entry_block);
2417 LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
2418 }
2419
ac_build_break(struct ac_llvm_context * ctx)2420 void ac_build_break(struct ac_llvm_context *ctx)
2421 {
2422 struct ac_llvm_flow *flow = get_innermost_loop(ctx);
2423 LLVMBuildBr(ctx->builder, flow->next_block);
2424 }
2425
ac_build_continue(struct ac_llvm_context * ctx)2426 void ac_build_continue(struct ac_llvm_context *ctx)
2427 {
2428 struct ac_llvm_flow *flow = get_innermost_loop(ctx);
2429 LLVMBuildBr(ctx->builder, flow->loop_entry_block);
2430 }
2431
ac_build_else(struct ac_llvm_context * ctx,int label_id)2432 void ac_build_else(struct ac_llvm_context *ctx, int label_id)
2433 {
2434 struct ac_llvm_flow *current_branch = get_current_flow(ctx);
2435 LLVMBasicBlockRef endif_block;
2436
2437 assert(!current_branch->loop_entry_block);
2438
2439 endif_block = append_basic_block(ctx, "ENDIF");
2440 emit_default_branch(ctx->builder, endif_block);
2441
2442 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
2443 set_basicblock_name(current_branch->next_block, "else", label_id);
2444
2445 current_branch->next_block = endif_block;
2446 }
2447
ac_build_endif(struct ac_llvm_context * ctx,int label_id)2448 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
2449 {
2450 struct ac_llvm_flow *current_branch = get_current_flow(ctx);
2451
2452 assert(!current_branch->loop_entry_block);
2453
2454 emit_default_branch(ctx->builder, current_branch->next_block);
2455 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
2456 set_basicblock_name(current_branch->next_block, "endif", label_id);
2457
2458 ctx->flow->depth--;
2459 }
2460
ac_build_endloop(struct ac_llvm_context * ctx,int label_id)2461 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
2462 {
2463 struct ac_llvm_flow *current_loop = get_current_flow(ctx);
2464
2465 assert(current_loop->loop_entry_block);
2466
2467 emit_default_branch(ctx->builder, current_loop->loop_entry_block);
2468
2469 LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
2470 set_basicblock_name(current_loop->next_block, "endloop", label_id);
2471 ctx->flow->depth--;
2472 }
2473
/* Open an if-construct guarded by \p cond and position the builder inside
 * the then-block. The flow entry's next_block serves as the else/join block.
 */
void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
{
   struct ac_llvm_flow *flow = push_flow(ctx);
   LLVMBasicBlockRef if_block = append_basic_block(ctx, "IF");

   flow->next_block = append_basic_block(ctx, "ELSE");
   set_basicblock_name(if_block, "if", label_id);

   LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
   LLVMPositionBuilderAtEnd(ctx->builder, if_block);
}
2485
ac_build_alloca_undef(struct ac_llvm_context * ac,LLVMTypeRef type,const char * name)2486 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
2487 {
2488 LLVMBuilderRef builder = ac->builder;
2489 LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
2490 LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
2491 LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
2492 LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
2493 LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
2494 LLVMValueRef res;
2495
2496 if (first_instr) {
2497 LLVMPositionBuilderBefore(first_builder, first_instr);
2498 } else {
2499 LLVMPositionBuilderAtEnd(first_builder, first_block);
2500 }
2501
2502 res = LLVMBuildAlloca(first_builder, type, name);
2503 LLVMDisposeBuilder(first_builder);
2504 return res;
2505 }
2506
/* Return the first \p count components of \p value.
 *
 * Returns \p value unchanged when it already has exactly \p count
 * components, a scalar when count == 1, and a shuffled shorter vector
 * otherwise.
 */
LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count)
{
   unsigned num_components = ac_get_llvm_num_components(value);

   if (count == num_components)
      return value;

   /* Allocate at least 2 entries so indices[1] below is always in bounds. */
   LLVMValueRef *const indices = alloca(MAX2(count, 2) * sizeof(LLVMValueRef));
   indices[0] = ctx->i32_0;
   indices[1] = ctx->i32_1;
   for (unsigned i = 2; i < count; i++)
      indices[i] = LLVMConstInt(ctx->i32, i, false);

   /* A single component is extracted as a scalar, not a 1-wide vector. */
   if (count == 1)
      return LLVMBuildExtractElement(ctx->builder, value, indices[0], "");

   LLVMValueRef swizzle = LLVMConstVector(indices, count);
   return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
}
2525
/* Extract a bitfield from \p param: shift right by \p rshift and keep the
 * low \p bitwidth bits.
 *
 * If param is i64 and bitwidth <= 32, the return value will be i32.
 */
LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift,
                             unsigned bitwidth)
{
   LLVMValueRef value = param;
   if (rshift)
      value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), rshift, false), "");

   /* When the field reaches bit 31 or beyond, the shift (plus the trunc
    * below) already discards the bits above it, so no mask is needed.
    * NOTE(review): for i64 params with 32 <= rshift+bitwidth < 64 this
    * relies on the bits above the field being zero — confirm callers
    * guarantee that.
    */
   if (rshift + bitwidth < 32) {
      uint64_t mask = (1ull << bitwidth) - 1;
      value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), mask, false), "");
   }

   /* Narrow i64 results to i32 as documented above. */
   if (bitwidth <= 32 && LLVMTypeOf(param) == ctx->i64)
      value = LLVMBuildTrunc(ctx->builder, value, ctx->i32, "");
   return value;
}
2543
/* Single 32-bit (or narrower) readlane/readfirstlane helper.
 * A NULL \p lane selects readfirstlane.
 */
static LLVMValueRef _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src,
                                       LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   if (with_opt_barrier)
      ac_build_optimization_barrier(ctx, &src, false);

   /* The intrinsics operate on i32; widen narrower sources. */
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
   if (lane)
      lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, "");

   const char *name = lane ? "llvm.amdgcn.readlane" : "llvm.amdgcn.readfirstlane";
   LLVMValueRef args[2] = {src, lane};
   LLVMValueRef ret = ac_build_intrinsic(ctx, name, ctx->i32, args, lane ? 2 : 1, 0);

   return LLVMBuildTrunc(ctx->builder, ret, orig_type, "");
}
2563
/* Readlane for any scalar type: wider-than-32-bit values are split into
 * 32-bit components, read per component, and reassembled. Pointers are
 * round-tripped through integers.
 */
static LLVMValueRef ac_build_readlane_common(struct ac_llvm_context *ctx, LLVMValueRef src,
                                             LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   LLVMValueRef ret;

   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));

   if (bits <= 32) {
      ret = _ac_build_readlane(ctx, src, lane, with_opt_barrier);
   } else {
      /* Split into a <N x i32> vector and read each component. */
      assert(bits % 32 == 0);
      unsigned num_comps = bits / 32;
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_comps);
      LLVMValueRef vec = LLVMBuildBitCast(ctx->builder, src, vec_type, "");

      ret = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < num_comps; i++) {
         LLVMValueRef idx = LLVMConstInt(ctx->i32, i, 0);
         LLVMValueRef comp = LLVMBuildExtractElement(ctx->builder, vec, idx, "");

         comp = _ac_build_readlane(ctx, comp, lane, with_opt_barrier);
         ret = LLVMBuildInsertElement(ctx->builder, ret, comp, idx, "");
      }
   }

   if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
      return LLVMBuildIntToPtr(ctx->builder, ret, src_type, "");
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
2595
/* Read \p src from \p lane (or from the first active lane when \p lane is
 * NULL), with an optimization barrier applied to the source.
 */
LLVMValueRef ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
{
   return ac_build_readlane_common(ctx, src, lane, true);
}
2600
/* Write \p value into \p lane of \p src; all other lanes keep src. */
LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value,
                                LLVMValueRef lane)
{
   LLVMValueRef args[3] = {value, lane, src};

   return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32, args, 3, 0);
}
2607
/* Emit mbcnt(mask) + add_src: the number of set bits in \p mask at lane
 * positions below the current lane, plus \p add_src.
 *
 * On LLVM >= 16 the addend is fed directly into the mbcnt intrinsics; on
 * older LLVM it is added separately afterwards (see workaround below).
 */
LLVMValueRef ac_build_mbcnt_add(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef add_src)
{
   LLVMValueRef add = LLVM_VERSION_MAJOR >= 16 ? add_src : ctx->i32_0;
   LLVMValueRef val;

   if (ctx->wave_size == 32) {
      /* Wave32: only the low 32 mask bits matter. */
      if (LLVMTypeOf(mask) == ctx->i64)
         mask = LLVMBuildTrunc(ctx->builder, mask, ctx->i32, "");

      val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
                               (LLVMValueRef[]){mask, add}, 2, 0);
   } else {
      /* Wave64: chain mbcnt.lo into mbcnt.hi to cover all 64 mask bits. */
      LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, "");
      LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_0, "");
      LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_1, "");
      val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
                               (LLVMValueRef[]){mask_lo, add}, 2, 0);
      val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, (LLVMValueRef[]){mask_hi, val},
                               2, 0);
   }

   /* Without an addend the result is a lane index in [0, wave_size). */
   if (add == ctx->i32_0)
      ac_set_range_metadata(ctx, val, 0, ctx->wave_size);

   if (LLVM_VERSION_MAJOR < 16) {
      /* Bug workaround. LLVM always believes the upper bound of mbcnt to be the wave size,
       * regardless of ac_set_range_metadata. Use an extra add instruction to work around it.
       */
      ac_set_range_metadata(ctx, val, 0, ctx->wave_size);
      val = LLVMBuildAdd(ctx->builder, val, add_src, "");
   }

   return val;
}
2642
/* Count the set bits of \p mask at lane positions below the current lane. */
LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
{
   return ac_build_mbcnt_add(ctx, mask, ctx->i32_0);
}
2647
/* DPP_CTRL encodings for llvm.amdgcn.update.dpp. Entries with a leading
 * underscore are bases that must be OR'ed with an immediate operand
 * (see dpp_quad_perm() and dpp_row_sr() below).
 */
enum dpp_ctrl
{
   _dpp_quad_perm = 0x000,   /* base: per-quad lane permutation */
   _dpp_row_sl = 0x100,      /* base: row shift left by 1..15 */
   _dpp_row_sr = 0x110,      /* base: row shift right by 1..15 */
   _dpp_row_rr = 0x120,      /* base: row rotate right by 1..15 */
   dpp_wf_sl1 = 0x130,       /* wavefront shift left by 1 */
   dpp_wf_rl1 = 0x134,       /* wavefront rotate left by 1 */
   dpp_wf_sr1 = 0x138,       /* wavefront shift right by 1 */
   dpp_wf_rr1 = 0x13C,       /* wavefront rotate right by 1 */
   dpp_row_mirror = 0x140,
   dpp_row_half_mirror = 0x141,
   dpp_row_bcast15 = 0x142,  /* broadcast lane 15 of the row */
   dpp_row_bcast31 = 0x143   /* broadcast lane 31 */
};
2663
dpp_quad_perm(unsigned lane0,unsigned lane1,unsigned lane2,unsigned lane3)2664 static inline enum dpp_ctrl dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2,
2665 unsigned lane3)
2666 {
2667 assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
2668 return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
2669 }
2670
dpp_row_sr(unsigned amount)2671 static inline enum dpp_ctrl dpp_row_sr(unsigned amount)
2672 {
2673 assert(amount > 0 && amount < 16);
2674 return _dpp_row_sr | amount;
2675 }
2676
/* Single 32-bit (or narrower) update.dpp helper. Optionally wraps all
 * operands and the result in WQM.
 */
static LLVMValueRef _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                  enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                  bool bound_ctrl, bool use_wqm)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   /* This path uses the i32 variant of the intrinsic; widen both operands. */
   old = LLVMBuildZExt(ctx->builder, old, ctx->i32, "");
   if (use_wqm)
      old = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.i32", ctx->i32, &old, 1, 0);
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
   if (use_wqm)
      src = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.i32", ctx->i32, &src, 1, 0);

   LLVMValueRef args[6] = {
      old,
      src,
      LLVMConstInt(ctx->i32, dpp_ctrl, 0),
      LLVMConstInt(ctx->i32, row_mask, 0),
      LLVMConstInt(ctx->i32, bank_mask, 0),
      LLVMConstInt(ctx->i1, bound_ctrl, 0),
   };
   LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32, args, 6, 0);

   if (use_wqm)
      res = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.i32", ctx->i32, &res, 1, 0);

   return LLVMBuildTrunc(ctx->builder, res, orig_type, "");
}
2703
/* update.dpp for any scalar type: values wider than 32 bits are processed
 * as <N x i32> components and reassembled.
 */
static LLVMValueRef ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                 bool bound_ctrl, bool use_wqm)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   LLVMValueRef ret;

   src = ac_to_integer(ctx, src);
   if (use_wqm)
      src = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.i32", ctx->i32, &src, 1, 0);
   old = ac_to_integer(ctx, old);
   if (use_wqm)
      old = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.i32", ctx->i32, &old, 1, 0);

   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   if (bits <= 32) {
      ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl, use_wqm);
   } else {
      /* Process wide values 32 bits at a time. */
      assert(bits % 32 == 0);
      unsigned num_comps = bits / 32;
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_comps);
      LLVMValueRef src_vec = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      LLVMValueRef old_vec = LLVMBuildBitCast(ctx->builder, old, vec_type, "");

      ret = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < num_comps; i++) {
         LLVMValueRef idx = LLVMConstInt(ctx->i32, i, 0);
         LLVMValueRef s = LLVMBuildExtractElement(ctx->builder, src_vec, idx, "");
         LLVMValueRef o = LLVMBuildExtractElement(ctx->builder, old_vec, idx, "");
         LLVMValueRef comp =
            _ac_build_dpp(ctx, o, s, dpp_ctrl, row_mask, bank_mask, bound_ctrl, use_wqm);

         ret = LLVMBuildInsertElement(ctx->builder, ret, comp, idx, "");
      }
   }
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
2736
/* Single 32-bit (or narrower) permlane16/permlanex16 helper.
 * \p sel holds the 64-bit lane-selector (split into two i32 operands).
 */
static LLVMValueRef _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         uint64_t sel, bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   const char *name =
      exchange_rows ? "llvm.amdgcn.permlanex16" : "llvm.amdgcn.permlane16";
   LLVMValueRef args[6] = {
      src,
      src,
      LLVMConstInt(ctx->i32, sel, false),       /* low half of the selector */
      LLVMConstInt(ctx->i32, sel >> 32, false), /* high half of the selector */
      ctx->i1true, /* fi */
      bound_ctrl ? ctx->i1true : ctx->i1false,
   };

   LLVMValueRef result = ac_build_intrinsic(ctx, name, ctx->i32, args, 6, 0);
   return LLVMBuildTrunc(ctx->builder, result, orig_type, "");
}
2760
/* permlane16/permlanex16 for any scalar type: wide values are processed
 * 32 bits at a time.
 */
static LLVMValueRef ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
                                        bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   LLVMValueRef ret;

   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));

   if (bits <= 32) {
      ret = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
   } else {
      assert(bits % 32 == 0);
      unsigned num_comps = bits / 32;
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_comps);
      LLVMValueRef vec = LLVMBuildBitCast(ctx->builder, src, vec_type, "");

      ret = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < num_comps; i++) {
         LLVMValueRef idx = LLVMConstInt(ctx->i32, i, 0);
         LLVMValueRef comp = LLVMBuildExtractElement(ctx->builder, vec, idx, "");

         comp = _ac_build_permlane16(ctx, comp, sel, exchange_rows, bound_ctrl);
         ret = LLVMBuildInsertElement(ctx->builder, ret, comp, idx, "");
      }
   }
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
2784
/* Encode a ds_swizzle bit-mode pattern. The three 5-bit fields are packed
 * as: and_mask in bits [4:0], or_mask in [9:5], xor_mask in [14:10].
 */
static inline unsigned ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
   assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);

   return (xor_mask << 10) | (or_mask << 5) | and_mask;
}
2790
/* Single 32-bit (or narrower) ds_swizzle helper. */
static LLVMValueRef _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         unsigned mask)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef args[2] = {src, LLVMConstInt(ctx->i32, mask, 0)};
   LLVMValueRef ret = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32, args, 2, 0);

   return LLVMBuildTrunc(ctx->builder, ret, orig_type, "");
}
2805
/* ds_swizzle for any scalar type: wide values are processed 32 bits at a
 * time and reassembled.
 */
LLVMValueRef ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   LLVMValueRef ret;

   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));

   if (bits <= 32) {
      ret = _ac_build_ds_swizzle(ctx, src, mask);
   } else {
      assert(bits % 32 == 0);
      unsigned num_comps = bits / 32;
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_comps);
      LLVMValueRef vec = LLVMBuildBitCast(ctx->builder, src, vec_type, "");

      ret = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < num_comps; i++) {
         LLVMValueRef idx = LLVMConstInt(ctx->i32, i, 0);
         LLVMValueRef comp = LLVMBuildExtractElement(ctx->builder, vec, idx, "");

         comp = _ac_build_ds_swizzle(ctx, comp, mask);
         ret = LLVMBuildInsertElement(ctx->builder, ret, comp, idx, "");
      }
   }
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
2828
/* Emit llvm.amdgcn.<mode>.<type> (e.g. wwm/wqm) on \p src, preserving its
 * original type. Sources narrower than 32 bits are widened for the call
 * and truncated back afterwards.
 */
static LLVMValueRef ac_build_mode(struct ac_llvm_context *ctx, LLVMValueRef src, const char *mode)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned bitsize = ac_get_elem_bits(ctx, src_type);
   char name[32], type[8];

   src = ac_to_integer(ctx, src);

   if (bitsize < 32)
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   /* Build the full intrinsic name from the mode and the operand type. */
   ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.%s.%s", mode, type);

   LLVMValueRef ret = ac_build_intrinsic(ctx, name, LLVMTypeOf(src), &src, 1, 0);

   if (bitsize < 32)
      ret = LLVMBuildTrunc(ctx->builder, ret, ac_to_integer_type(ctx, src_type), "");

   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
2850
/* Wrap \p src in llvm.amdgcn.wwm (whole wave mode). */
static LLVMValueRef ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   return ac_build_mode(ctx, src, "wwm");
}
2855
/* Wrap \p src in llvm.amdgcn.wqm (whole quad mode). */
LLVMValueRef ac_build_wqm(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   return ac_build_mode(ctx, src, "wqm");
}
2860
/* Emit llvm.amdgcn.set.inactive: active lanes keep \p src, inactive lanes
 * get \p inactive. Values narrower than 32 bits are widened for the call.
 */
static LLVMValueRef ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
                                          LLVMValueRef inactive)
{
   char name[33], type[8];
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned bitsize = ac_get_elem_bits(ctx, src_type);
   bool narrow = bitsize < 32;

   src = ac_to_integer(ctx, src);
   inactive = ac_to_integer(ctx, inactive);

   if (narrow) {
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
      inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, "");
   }

   /* Build the type-suffixed intrinsic name. */
   ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);

   LLVMValueRef args[2] = {src, inactive};
   LLVMValueRef ret = ac_build_intrinsic(ctx, name, LLVMTypeOf(src), args, 2, 0);

   if (narrow)
      ret = LLVMBuildTrunc(ctx->builder, ret, src_type, "");

   return ret;
}
2884
/* Return the identity element for reduction \p op at the given operand
 * size in bytes (0 = i1, 1 = 8-bit, 2 = 16-bit, 4 = 32-bit, else 64-bit).
 * The identity is the value x with op(x, y) == y for all y, used to fill
 * inactive lanes and to prime scans.
 */
static LLVMValueRef get_reduction_identity(struct ac_llvm_context *ctx, nir_op op,
                                           unsigned type_size)
{

   if (type_size == 0) {
      /* Boolean (i1) reductions. */
      switch (op) {
      case nir_op_ior:
      case nir_op_ixor:
         return ctx->i1false;
      case nir_op_iand:
         return ctx->i1true;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 1) {
      /* 8-bit reductions. */
      switch (op) {
      case nir_op_iadd:
         return ctx->i8_0;
      case nir_op_imul:
         return ctx->i8_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i8, INT8_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i8, UINT8_MAX, 0);
      case nir_op_imax:
         return LLVMConstInt(ctx->i8, INT8_MIN, 0);
      case nir_op_umax:
         return ctx->i8_0;
      case nir_op_iand:
         return LLVMConstInt(ctx->i8, -1, 0);
      case nir_op_ior:
         return ctx->i8_0;
      case nir_op_ixor:
         return ctx->i8_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 2) {
      /* 16-bit integer and half-float reductions. */
      switch (op) {
      case nir_op_iadd:
         return ctx->i16_0;
      case nir_op_fadd:
         return ctx->f16_0;
      case nir_op_imul:
         return ctx->i16_1;
      case nir_op_fmul:
         return ctx->f16_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i16, INT16_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i16, UINT16_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f16, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i16, INT16_MIN, 0);
      case nir_op_umax:
         return ctx->i16_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f16, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i16, -1, 0);
      case nir_op_ior:
         return ctx->i16_0;
      case nir_op_ixor:
         return ctx->i16_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 4) {
      /* 32-bit integer and float reductions. */
      switch (op) {
      case nir_op_iadd:
         return ctx->i32_0;
      case nir_op_fadd:
         return ctx->f32_0;
      case nir_op_imul:
         return ctx->i32_1;
      case nir_op_fmul:
         return ctx->f32_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i32, INT32_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f32, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i32, INT32_MIN, 0);
      case nir_op_umax:
         return ctx->i32_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f32, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i32, -1, 0);
      case nir_op_ior:
         return ctx->i32_0;
      case nir_op_ixor:
         return ctx->i32_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else { /* type_size == 64bit */
      switch (op) {
      case nir_op_iadd:
         return ctx->i64_0;
      case nir_op_fadd:
         return ctx->f64_0;
      case nir_op_imul:
         return ctx->i64_1;
      case nir_op_fmul:
         return ctx->f64_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i64, INT64_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f64, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i64, INT64_MIN, 0);
      case nir_op_umax:
         return ctx->i64_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f64, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i64, -1, 0);
      case nir_op_ior:
         return ctx->i64_0;
      case nir_op_ixor:
         return ctx->i64_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   }
}
3017
/* Combine \p lhs and \p rhs with the scalar reduction operator \p op.
 * Integer min/max are open-coded as icmp+select; float min/max use the
 * llvm.minnum/llvm.maxnum intrinsic at the operand's width.
 */
static LLVMValueRef ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
                                    nir_op op)
{
   /* Select the float intrinsic variant by operand size (8/4 bytes; else f16). */
   bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
   bool _32bit = ac_get_type_size(LLVMTypeOf(lhs)) == 4;
   switch (op) {
   case nir_op_iadd:
      return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
   case nir_op_fadd:
      return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
   case nir_op_imul:
      return LLVMBuildMul(ctx->builder, lhs, rhs, "");
   case nir_op_fmul:
      return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
   case nir_op_imin:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_umin:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_fmin:
      return ac_build_intrinsic(
         ctx, _64bit ? "llvm.minnum.f64" : _32bit ? "llvm.minnum.f32" : "llvm.minnum.f16",
         _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2, 0);
   case nir_op_imax:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_umax:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_fmax:
      return ac_build_intrinsic(
         ctx, _64bit ? "llvm.maxnum.f64" : _32bit ? "llvm.maxnum.f32" : "llvm.maxnum.f16",
         _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2, 0);
   case nir_op_iand:
      return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
   case nir_op_ior:
      return LLVMBuildOr(ctx->builder, lhs, rhs, "");
   case nir_op_ixor:
      return LLVMBuildXor(ctx->builder, lhs, rhs, "");
   default:
      unreachable("bad reduction intrinsic");
   }
}
3062
3063 /**
3064 * \param src The value to shift.
3065 * \param identity The value to use the first lane.
3066 * \param maxprefix specifies that the result only needs to be correct for a
3067 * prefix of this many threads
3068 * \return src, shifted 1 lane up, and identity shifted into lane 0.
3069 */
static LLVMValueRef ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src,
                                               LLVMValueRef identity, unsigned maxprefix)
{
   if (ctx->gfx_level >= GFX10) {
      /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
      LLVMValueRef active, tmp1, tmp2;
      LLVMValueRef tid = ac_get_thread_id(ctx);

      /* Shift each 16-lane row right by 1; the vacated lane gets identity. */
      tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false, false);

      /* Cross-row exchange so the first lane of a row can read the last
       * lane of the previous row.
       */
      tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);

      if (maxprefix > 32) {
         /* Lane 32 needs lane 31, which permlanex16 cannot deliver across
          * the 32-lane half boundary — use readlane instead.
          */
         active =
            LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, false), "");

         tmp2 = LLVMBuildSelect(ctx->builder, active,
                                ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, false)),
                                tmp2, "");

         /* Lanes 16 and 48 (tid & 0x1f == 0x10) take the cross-row value;
          * lane 32 takes the readlane value chosen above.
          */
         active = LLVMBuildOr(
            ctx->builder, active,
            LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, false), ""),
                          LLVMConstInt(ctx->i32, 0x10, false), ""),
            "");
         return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
      } else if (maxprefix > 16) {
         /* Only lane 16 needs the cross-row value. */
         active =
            LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 16, false), "");

         return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
      }
   } else if (ctx->gfx_level >= GFX8) {
      /* GFX8-9 have a native wavefront shift-right-by-1 DPP mode. */
      return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false, false);
   }

   /* wavefront shift_right by 1 on SI/CI */
   LLVMValueRef active, tmp1, tmp2;
   LLVMValueRef tid = ac_get_thread_id(ctx);
   /* Successively patch up the lanes that each coarser swizzle got wrong. */
   tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),
                          LLVMConstInt(ctx->i32, 0x4, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),
                          LLVMConstInt(ctx->i32, 0x8, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),
                          LLVMConstInt(ctx->i32, 0x10, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   /* ds_swizzle is limited to 32 lanes; lane 32 reads lane 31 via readlane. */
   tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   /* Lane 0 has no predecessor: it receives the identity. */
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, ctx->i32_0, "");
   return LLVMBuildSelect(ctx->builder, active, identity, tmp1, "");
}
3132
3133 /**
3134 * \param maxprefix specifies that the result only needs to be correct for a
3135 * prefix of this many threads
3136 */
static LLVMValueRef ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src,
                                  LLVMValueRef identity, unsigned maxprefix, bool inclusive)
{
   LLVMValueRef result, tmp;

   /* An exclusive scan is an inclusive scan of the input shifted one lane
    * up, with identity in lane 0.
    */
   if (!inclusive)
      src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix);

   result = src;

   if (ctx->gfx_level <= GFX7) {
      /* GFX6-7 have no DPP: do log2(64) combine steps with ds_swizzle,
       * masking off lanes that would read across their step boundary.
       */
      assert(maxprefix == 64);
      LLVMValueRef tid = ac_get_thread_id(ctx);
      LLVMValueRef active;
      tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* ds_swizzle can't cross the 32-lane boundary: the upper half reads
       * lane 31 via readlane for the final step.
       */
      tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      return result;
   }

   /* GFX8+: DPP-based scan. Row shifts combine within 16-lane rows; each
    * step is skipped once maxprefix is already covered.
    */
   if (maxprefix <= 1)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 2)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 3)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 4)
      return result;
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 8)
      return result;
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 16)
      return result;

   if (ctx->gfx_level >= GFX10) {
      /* GFX10 dropped row_bcast: use permlanex16 + readlane to propagate
       * partial results across rows and across the 32-lane halves.
       */
      LLVMValueRef tid = ac_get_thread_id(ctx);
      LLVMValueRef active;

      tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);

      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, false), ""),
                             ctx->i32_0, "");

      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");

      result = ac_build_alu_op(ctx, result, tmp, op);

      if (maxprefix <= 32)
         return result;

      tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));

      active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, LLVMConstInt(ctx->i32, 32, false), "");

      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");

      result = ac_build_alu_op(ctx, result, tmp, op);
      return result;
   }

   /* GFX8-9: broadcast lane 15/31 results into the higher rows. */
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 32)
      return result;
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   return result;
}
3247
/* Emit a wave-wide inclusive scan of "src" with the NIR ALU op "op".
 * Each lane receives the reduction of its own value with all lower lanes.
 */
LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   /* Fast path: an inclusive add of booleans is a masked popcount. mbcnt
    * counts the set ballot bits strictly below the current lane, so add the
    * lane's own bit to make the scan inclusive.
    */
   if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
      LLVMBuilderRef builder = ctx->builder;
      LLVMValueRef bit = LLVMBuildZExt(builder, src, ctx->i32, "");
      LLVMValueRef ballot = ac_build_ballot(ctx, bit);
      LLVMValueRef below = ac_build_mbcnt(ctx, ballot);
      return LLVMBuildAdd(builder, below, bit, "");
   }

   /* Keep the optimizer from sinking code across the WWM region. */
   ac_build_optimization_barrier(ctx, &src, false);

   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   /* Fill inactive lanes with the identity so they don't corrupt the scan. */
   LLVMValueRef scan_input = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                                              LLVMTypeOf(identity), "");
   LLVMValueRef scanned = ac_build_scan(ctx, op, scan_input, identity, ctx->wave_size, true);

   return ac_build_wwm(ctx, scanned);
}
3270
/* Emit a wave-wide exclusive scan of "src" with the NIR ALU op "op".
 * Each lane receives the reduction of all strictly lower lanes.
 */
LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   /* Fast path: an exclusive add of booleans is exactly mbcnt of the ballot,
    * i.e. the number of set bits in lanes below the current one.
    */
   if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
      LLVMBuilderRef builder = ctx->builder;
      LLVMValueRef bit = LLVMBuildZExt(builder, src, ctx->i32, "");
      return ac_build_mbcnt(ctx, ac_build_ballot(ctx, bit));
   }

   /* Keep the optimizer from sinking code across the WWM region. */
   ac_build_optimization_barrier(ctx, &src, false);

   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   /* Fill inactive lanes with the identity so they don't corrupt the scan. */
   LLVMValueRef scan_input = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                                              LLVMTypeOf(identity), "");
   LLVMValueRef scanned = ac_build_scan(ctx, op, scan_input, identity, ctx->wave_size, false);

   return ac_build_wwm(ctx, scanned);
}
3292
/* Cross-lane reduction of "src" with the NIR ALU op "op" over clusters of
 * "cluster_size" lanes (a power of two up to the wave size).  Built as a
 * butterfly: quad swizzles for sizes 2/4, then DPP row operations on GFX8+
 * (LDS swizzles on older chips) for 8/16/32, and readlane for the wave-wide
 * step.  The reduced value is guaranteed in the highest lane of each cluster;
 * some steps (e.g. row_bcast15 with row_mask 0xa) only update the upper half,
 * so lower lanes may hold partial results.  Runs in WWM with inactive lanes
 * preloaded with the op's identity.
 */
LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op,
                             unsigned cluster_size)
{
   if (cluster_size == 1)
      return src;
   /* Keep the optimizer from sinking code across the WWM region. */
   ac_build_optimization_barrier(ctx, &src, false);
   LLVMValueRef result, swap;
   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                             LLVMTypeOf(identity), "");
   /* Step 1: swap horizontally adjacent lanes (pairs reduced). */
   swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2, false);
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 2)
      return ac_build_wwm(ctx, result);

   /* Step 2: swap lane pairs within each quad (quads reduced). */
   swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1, false);
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 4)
      return ac_build_wwm(ctx, result);

   /* Step 3: exchange quads within each group of 8 (half-row mirror,
    * or LDS swizzle xor 0x04 on pre-GFX8). */
   if (ctx->gfx_level >= GFX8)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 8)
      return ac_build_wwm(ctx, result);

   /* Step 4: mirror within each row of 16 (or LDS swizzle xor 0x08). */
   if (ctx->gfx_level >= GFX8)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 16)
      return ac_build_wwm(ctx, result);

   /* Step 5: combine the two rows of each 32-lane group.  GFX10+ has
    * permlanex16; GFX8/9 broadcast lane 15 into the next row (only updates
    * odd rows, hence the exclusion when cluster_size == 32 would need all
    * lanes); older chips use an LDS swizzle xor 0x10. */
   if (ctx->gfx_level >= GFX10)
      swap = ac_build_permlane16(ctx, result, 0, true, false);
   else if (ctx->gfx_level >= GFX8 && cluster_size != 32)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 32)
      return ac_build_wwm(ctx, result);

   /* Step 6: full-wave reduction (cluster_size == wave size). */
   if (ctx->gfx_level >= GFX8) {
      if (ctx->wave_size == 64) {
         /* Fold the low half into the high half, then broadcast the final
          * value from lane 63 so the result is wave-uniform. */
         if (ctx->gfx_level >= GFX10)
            swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
         else
            swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false, false);
         result = ac_build_alu_op(ctx, result, swap, op);
         result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
      }

      return ac_build_wwm(ctx, result);
   } else {
      /* GFX6/7: each 32-lane half is already reduced; combine the two halves
       * by reading one lane from each. */
      swap = ac_build_readlane(ctx, result, ctx->i32_0);
      result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
      result = ac_build_alu_op(ctx, result, swap, op);
      return ac_build_wwm(ctx, result);
   }
}
3357
/* Swizzle one channel pair of the two dual-source-blend exports across
 * adjacent lanes, as the GFX11 hardware expects (see
 * ac_build_dual_src_blend_swizzle).  The sequence is: swap odd/even lanes of
 * arg0, exchange arg0/arg1 on even lanes, then swap odd/even lanes of arg0
 * again, which interleaves the two sources across lane pairs.
 */
static void _ac_build_dual_src_blend_swizzle(struct ac_llvm_context *ctx,
                                             LLVMValueRef *arg0, LLVMValueRef *arg1)
{
   LLVMValueRef tid;
   LLVMValueRef src0, src1;
   LLVMValueRef tmp0;
   LLVMValueRef params[2];
   LLVMValueRef is_even;

   src0 = LLVMBuildBitCast(ctx->builder, *arg0, ctx->i32, "");
   src1 = LLVMBuildBitCast(ctx->builder, *arg1, ctx->i32, "");

   /* swap odd,even lanes of arg_0*/
   /* 0xde54c1 is the DPP8 selector [1,0,3,2,5,4,7,6] (3 bits per lane):
    * every even lane reads its odd neighbor and vice versa. */
   params[0] = src0;
   params[1] = LLVMConstInt(ctx->i32, 0xde54c1, 0);
   src0 = ac_build_intrinsic(ctx, "llvm.amdgcn.mov.dpp8.i32",
                             ctx->i32, params, 2, 0);

   /* swap even lanes between arg_0 and arg_1 */
   tid = ac_get_thread_id(ctx);
   is_even = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                           LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""),
                           ctx->i32_0, "");
   tmp0 = src0;
   src0 = LLVMBuildSelect(ctx->builder, is_even, src1, src0, "");
   src1 = LLVMBuildSelect(ctx->builder, is_even, tmp0, src1, "");

   /* swap odd,even lanes again for arg_0*/
   params[0] = src0;
   params[1] = LLVMConstInt(ctx->i32, 0xde54c1, 0);
   src0 = ac_build_intrinsic(ctx, "llvm.amdgcn.mov.dpp8.i32",
                             ctx->i32, params, 2, 0);

   *arg0 = src0;
   *arg1 = src1;
}
3394
ac_build_dual_src_blend_swizzle(struct ac_llvm_context * ctx,struct ac_export_args * mrt0,struct ac_export_args * mrt1)3395 void ac_build_dual_src_blend_swizzle(struct ac_llvm_context *ctx,
3396 struct ac_export_args *mrt0,
3397 struct ac_export_args *mrt1)
3398 {
3399 assert(ctx->gfx_level >= GFX11);
3400 assert(mrt0->enabled_channels == mrt1->enabled_channels);
3401
3402 for (int i = 0; i < 4; i++) {
3403 if (mrt0->enabled_channels & (1 << i) && mrt1->enabled_channels & (1 << i))
3404 _ac_build_dual_src_blend_swizzle(ctx, &mrt0->out[i], &mrt1->out[i]);
3405 }
3406 }
3407
/* Permute "src" within each quad so that lane i reads the value of quad lane
 * "lane_i".  Uses DPP quad_perm on GFX8+ and an LDS swizzle (quad mode,
 * bit 15 set) on older chips.
 */
LLVMValueRef ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned lane0,
                                   unsigned lane1, unsigned lane2, unsigned lane3,
                                   bool use_wqm)
{
   unsigned perm = dpp_quad_perm(lane0, lane1, lane2, lane3);

   if (ctx->gfx_level < GFX8)
      return ac_build_ds_swizzle(ctx, src, (1 << 15) | perm);

   return ac_build_dpp(ctx, src, src, perm, 0xf, 0xf, false, use_wqm);
}
3419
/* Read "src" from the lane given by "index" using ds_bpermute, preserving the
 * original (possibly narrower than 32-bit) type of "src".
 */
LLVMValueRef ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);

   /* ds_bpermute addresses lanes in bytes, so scale the lane index by 4. */
   LLVMValueRef byte_index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
   LLVMValueRef value32 = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef shuffled =
      ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32,
                         (LLVMValueRef[]){byte_index, value32}, 2, 0);
   return LLVMBuildTrunc(ctx->builder, shuffled, src_type, "");
}
3432
/* Return the frexp exponent of "src0" via the amdgcn.frexp.exp intrinsic.
 * bitsize selects the float width of src0 (16, 32 or 64); the result is i16
 * for 16-bit sources and i32 otherwise.
 */
LLVMValueRef ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* const: intrinsic names are string literals and must never be written. */
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.amdgcn.frexp.exp.i16.f16";
      type = ctx->i16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.frexp.exp.i32.f32";
      type = ctx->i32;
   } else {
      assert(bitsize == 64); /* only 16/32/64-bit floats are supported */
      intr = "llvm.amdgcn.frexp.exp.i32.f64";
      type = ctx->i32;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
}
/* Return the frexp mantissa of "src0" via the amdgcn.frexp.mant intrinsic.
 * bitsize selects the float width of src0 (16, 32 or 64); the result has the
 * same float type as the source.
 */
LLVMValueRef ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* const: intrinsic names are string literals and must never be written. */
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.amdgcn.frexp.mant.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.frexp.mant.f32";
      type = ctx->f32;
   } else {
      assert(bitsize == 64); /* only 16/32/64-bit floats are supported */
      intr = "llvm.amdgcn.frexp.mant.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
}
3475
/* Canonicalize the float "src0" (flush/normalize per the current FP mode)
 * via llvm.canonicalize.  bitsize selects the float width (16, 32 or 64);
 * the result has the same float type as the source.
 */
LLVMValueRef ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* const: intrinsic names are string literals and must never be written. */
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.canonicalize.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.canonicalize.f32";
      type = ctx->f32;
   } else {
      assert(bitsize == 64); /* only 16/32/64-bit floats are supported */
      intr = "llvm.canonicalize.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
}
3497
ac_build_load_helper_invocation(struct ac_llvm_context * ctx)3498 LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
3499 {
3500 LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.live.mask", ctx->i1, NULL, 0, 0);
3501
3502 return LLVMBuildNot(ctx->builder, result, "");
3503 }
3504
/* Emit a call to "func" and copy the callee's calling convention onto the
 * call site, which LLVM requires to match for well-defined calls.
 */
LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMTypeRef fn_type, LLVMValueRef func, LLVMValueRef *args,
                           unsigned num_args)
{
   LLVMValueRef call = LLVMBuildCall2(ctx->builder, fn_type, func, args, num_args, "");

   LLVMSetInstructionCallConv(call, LLVMGetFunctionCallConv(func));
   return call;
}
3512
/* Fill *args with an export to the MRTZ target carrying depth, stencil,
 * sample mask and/or mrt0 alpha (for alpha-to-coverage) from a pixel shader.
 * Each of depth/stencil/samplemask/mrt0_alpha may be NULL when not exported,
 * but at least one must be set.  "is_last" marks the shader's final export
 * (sets the DONE bit and declares EXEC valid).  The channel layout depends on
 * the SPI_SHADER_Z_FORMAT chosen from which values are present.
 */
void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
                     LLVMValueRef samplemask, LLVMValueRef mrt0_alpha, bool is_last,
                     struct ac_export_args *args)
{
   unsigned mask = 0;
   unsigned format = ac_get_spi_shader_z_format(depth != NULL, stencil != NULL, samplemask != NULL,
                                                mrt0_alpha != NULL);

   assert(depth || stencil || samplemask || mrt0_alpha);

   memset(args, 0, sizeof(*args));

   if (is_last) {
      args->valid_mask = 1; /* whether the EXEC mask is valid */
      args->done = 1; /* DONE bit */
   }

   /* Specify the target we are exporting */
   args->target = V_008DFC_SQ_EXP_MRTZ;

   args->compr = 0; /* COMP flag */
   args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */
   args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */
   args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */
   args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */

   if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
      /* 16-bit packed export: GFX11 dropped the COMPR flag, and channel
       * enables mean single channels instead of 16-bit pairs there. */
      assert(!depth);
      args->compr = ctx->gfx_level < GFX11; /* COMPR flag */

      if (stencil) {
         /* Stencil should be in X[23:16]. */
         stencil = ac_to_integer(ctx, stencil);
         stencil = LLVMBuildShl(ctx->builder, stencil, LLVMConstInt(ctx->i32, 16, 0), "");
         args->out[0] = ac_to_float(ctx, stencil);
         mask |= ctx->gfx_level >= GFX11 ? 0x1 : 0x3;
      }
      if (samplemask) {
         /* SampleMask should be in Y[15:0]. */
         args->out[1] = samplemask;
         mask |= ctx->gfx_level >= GFX11 ? 0x2 : 0xc;
      }
   } else {
      /* 32-bit export formats: one value per channel. */
      if (depth) {
         args->out[0] = depth;
         mask |= 0x1;
      }
      if (stencil) {
         assert(format == V_028710_SPI_SHADER_32_GR ||
                format == V_028710_SPI_SHADER_32_ABGR);
         args->out[1] = stencil;
         mask |= 0x2;
      }
      if (samplemask) {
         assert(format == V_028710_SPI_SHADER_32_ABGR);
         args->out[2] = samplemask;
         mask |= 0x4;
      }
      if (mrt0_alpha) {
         assert(format == V_028710_SPI_SHADER_32_AR ||
                format == V_028710_SPI_SHADER_32_ABGR);
         /* For 32_AR on GFX10+, alpha goes in the second enabled channel. */
         if (format == V_028710_SPI_SHADER_32_AR && ctx->gfx_level >= GFX10) {
            args->out[1] = mrt0_alpha;
            mask |= 0x2;
         } else {
            args->out[3] = mrt0_alpha;
            mask |= 0x8;
         }
      }
   }

   /* GFX6 (except OLAND and HAINAN) has a bug that it only looks
    * at the X writemask component. */
   if (ctx->gfx_level == GFX6 &&
       ctx->info->family != CHIP_OLAND &&
       ctx->info->family != CHIP_HAINAN)
      mask |= 0x1;

   /* Specify which components to enable */
   args->enabled_channels = mask;
}
3594
arg_llvm_type(enum ac_arg_type type,unsigned size,struct ac_llvm_context * ctx)3595 static LLVMTypeRef arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx)
3596 {
3597 LLVMTypeRef base;
3598 switch (type) {
3599 case AC_ARG_FLOAT:
3600 return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size);
3601 case AC_ARG_INT:
3602 return size == 1 ? ctx->i32 : LLVMVectorType(ctx->i32, size);
3603 case AC_ARG_CONST_PTR:
3604 base = ctx->i8;
3605 break;
3606 case AC_ARG_CONST_FLOAT_PTR:
3607 base = ctx->f32;
3608 break;
3609 case AC_ARG_CONST_PTR_PTR:
3610 base = ac_array_in_const32_addr_space(ctx->i8);
3611 break;
3612 case AC_ARG_CONST_DESC_PTR:
3613 base = ctx->v4i32;
3614 break;
3615 case AC_ARG_CONST_IMAGE_PTR:
3616 base = ctx->v8i32;
3617 break;
3618 default:
3619 assert(false);
3620 return NULL;
3621 }
3622
3623 assert(base);
3624 if (size == 1) {
3625 return ac_array_in_const32_addr_space(base);
3626 } else {
3627 assert(size == 2);
3628 return ac_array_in_const_addr_space(base);
3629 }
3630 }
3631
/* Create the LLVM function for a shader's main entry point: declare one LLVM
 * parameter per ac_shader_args argument, start the "main_body" basic block,
 * set the calling convention, tag SGPR arguments, configure denormal modes,
 * and (for pixel shaders) the depth/color export hints.  Returns the function
 * and its type; also stored in ctx->main_function.
 */
struct ac_llvm_pointer ac_build_main(const struct ac_shader_args *args, struct ac_llvm_context *ctx,
                                     enum ac_llvm_calling_convention convention, const char *name,
                                     LLVMTypeRef ret_type, LLVMModuleRef module)
{
   LLVMTypeRef arg_types[AC_MAX_ARGS];
   enum ac_arg_regfile arg_regfiles[AC_MAX_ARGS];

   /* ring_offsets doesn't have a corresponding function parameter because LLVM can allocate it
    * itself for scratch memory purposes and gives us access through llvm.amdgcn.implicit.buffer.ptr
    */
   unsigned arg_count = 0;
   for (unsigned i = 0; i < args->arg_count; i++) {
      if (args->ring_offsets.used && i == args->ring_offsets.arg_index) {
         /* Remember the skipped slot so later code can map argument indices. */
         ctx->ring_offsets_index = i;
         continue;
      }
      arg_regfiles[arg_count] = args->args[i].file;
      arg_types[arg_count++] = arg_llvm_type(args->args[i].type, args->args[i].size, ctx);
   }

   LLVMTypeRef main_function_type = LLVMFunctionType(ret_type, arg_types, arg_count, 0);

   LLVMValueRef main_function = LLVMAddFunction(module, name, main_function_type);
   LLVMBasicBlockRef main_function_body =
      LLVMAppendBasicBlockInContext(ctx->context, main_function, "main_body");
   LLVMPositionBuilderAtEnd(ctx->builder, main_function_body);

   LLVMSetFunctionCallConv(main_function, convention);
   for (unsigned i = 0; i < arg_count; ++i) {
      LLVMValueRef P = LLVMGetParam(main_function, i);

      /* Only SGPR arguments get attributes; VGPR arguments are left as-is. */
      if (arg_regfiles[i] != AC_ARG_SGPR)
         continue;

      /* Attribute indices are 1-based; index 0 is the return value. "inreg"
       * makes the backend pass this argument in an SGPR. */
      ac_add_function_attr(ctx->context, main_function, i + 1, "inreg");

      if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
         ac_add_function_attr(ctx->context, main_function, i + 1, "noalias");
         ac_add_attr_dereferenceable(P, UINT64_MAX);
         ac_add_attr_alignment(P, 4);
      }
   }

   if (args->ring_offsets.used) {
      /* Fetch the ring-offsets descriptor via the implicit buffer pointer
       * instead of a real function argument (see comment above). */
      ctx->ring_offsets =
         ac_build_intrinsic(ctx, "llvm.amdgcn.implicit.buffer.ptr",
                            LLVMPointerType(ctx->i8, AC_ADDR_SPACE_CONST), NULL, 0, 0);
      ctx->ring_offsets = LLVMBuildBitCast(ctx->builder, ctx->ring_offsets,
                                           ac_array_in_const_addr_space(ctx->v4i32), "");
   }

   ctx->main_function = (struct ac_llvm_pointer) {
      .value = main_function,
      .pointee_type = main_function_type
   };

   /* Enable denormals for FP16 and FP64: */
   LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math", "ieee,ieee");
   /* Disable denormals for FP32: */
   LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math-f32",
                                      "preserve-sign,preserve-sign");

   if (convention == AC_LLVM_AMDGPU_PS) {
      LLVMAddTargetDependentFunctionAttr(main_function, "amdgpu-depth-export",
                                         ctx->exports_mrtz ? "1" : "0");
      LLVMAddTargetDependentFunctionAttr(main_function, "amdgpu-color-export",
                                         ctx->exports_color_null ? "1" : "0");
   }

   return ctx->main_function;
}
3703
/* Return an i1 that is true when the f32 value "a" is NaN (signaling or
 * quiet) or +/- infinity, using the v_cmp_class intrinsic.
 */
LLVMValueRef ac_build_is_inf_or_nan(struct ac_llvm_context *ctx, LLVMValueRef a)
{
   const unsigned class_mask = S_NAN | Q_NAN | N_INFINITY | P_INFINITY;
   LLVMValueRef params[2] = {
      a,
      LLVMConstInt(ctx->i32, class_mask, 0),
   };

   return ac_build_intrinsic(ctx, "llvm.amdgcn.class.f32", ctx->i1, params, 2, 0);
}
3712