1 /*
2 * Copyright 2014 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
7 #include "ac_llvm_build.h"
8 #include "ac_gpu_info.h"
9 #include "ac_nir.h"
10 #include "ac_llvm_util.h"
11 #include "ac_shader_util.h"
12 #include "c11/threads.h"
13 #include "shader_enums.h"
14 #include "sid.h"
15 #include "util/bitscan.h"
16 #include "util/macros.h"
17 #include "util/u_atomic.h"
18 #include "util/u_math.h"
19 #include <llvm-c/Core.h>
20 #include <llvm/Config/llvm-config.h>
21
22 #include <assert.h>
23 #include <stdio.h>
24
25 #define AC_LLVM_INITIAL_CF_DEPTH 4
26
/* Data for if/else/endif and bgnloop/endloop control flow structures.
 */
struct ac_llvm_flow {
   /* Loop exit or next part of if/else/endif. */
   LLVMBasicBlockRef next_block;
   /* Loop header the backedge branches to; presumably NULL for if/else
    * entries — TODO confirm at the flow-stack push sites (not in this chunk). */
   LLVMBasicBlockRef loop_entry_block;
};
34
/* Initialize module-independent parts of the context.
 *
 * NOTE(review): the header comment used to say the caller initializes
 * ctx::module and ctx::builder, but this function now creates both via
 * ac_create_module()/ac_create_builder() below.
 */
void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler,
                          const struct radeon_info *info, enum ac_float_mode float_mode,
                          unsigned wave_size, unsigned ballot_mask_bits, bool exports_color_null,
                          bool exports_mrtz)
{
   ctx->context = LLVMContextCreate();

   /* Record caller-supplied configuration. */
   ctx->info = info;
   ctx->gfx_level = info->gfx_level;
   ctx->wave_size = wave_size;
   ctx->ballot_mask_bits = ballot_mask_bits;
   ctx->float_mode = float_mode;
   ctx->exports_color_null = exports_color_null;
   ctx->exports_mrtz = exports_mrtz;
   ctx->module = ac_create_module(compiler->tm, ctx->context);
   ctx->builder = ac_create_builder(ctx->context, float_mode);

   /* Cache the commonly used scalar and vector types. */
   ctx->voidt = LLVMVoidTypeInContext(ctx->context);
   ctx->i1 = LLVMInt1TypeInContext(ctx->context);
   ctx->i8 = LLVMInt8TypeInContext(ctx->context);
   ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
   ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
   ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
   ctx->i128 = LLVMIntTypeInContext(ctx->context, 128);
   ctx->intptr = ctx->i32;
   ctx->f16 = LLVMHalfTypeInContext(ctx->context);
   ctx->f32 = LLVMFloatTypeInContext(ctx->context);
   ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
   ctx->v4i8 = LLVMVectorType(ctx->i8, 4);
   ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
   ctx->v4i16 = LLVMVectorType(ctx->i16, 4);
   ctx->v2f16 = LLVMVectorType(ctx->f16, 2);
   ctx->v4f16 = LLVMVectorType(ctx->f16, 4);
   ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
   ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
   ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
   ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
   ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
   ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
   ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
   /* Mask types sized to the wave (32/64) and to the ballot width. */
   ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
   ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);

   /* Cache the commonly used constants. */
   ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
   ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
   ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
   ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
   ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
   ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
   ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
   ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
   ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false);
   ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false);
   ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
   ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
   ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
   ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
   ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
   ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);

   ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
   ctx->i1true = LLVMConstInt(ctx->i1, 1, false);

   /* Metadata kind IDs used when tagging loads/calls later on. */
   ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5);
   ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14);
   ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);
   ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6);

   ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);

   /* fpmath metadata node holding a max-error of 3 ulp. */
   LLVMValueRef three = LLVMConstReal(ctx->f32, 3);
   ctx->three_md = LLVMMDNodeInContext(ctx->context, &three, 1);

   /* Control-flow stack; freed by ac_llvm_context_dispose(). */
   ctx->flow = calloc(1, sizeof(*ctx->flow));

   /* Sentinel meaning "no ring-offsets argument". */
   ctx->ring_offsets_index = INT32_MAX;
}
116
ac_llvm_context_dispose(struct ac_llvm_context * ctx)117 void ac_llvm_context_dispose(struct ac_llvm_context *ctx)
118 {
119 free(ctx->flow->stack);
120 free(ctx->flow);
121 ctx->flow = NULL;
122 }
123
/* Number of components of "value": the vector size, or 1 for scalars. */
int ac_get_llvm_num_components(LLVMValueRef value)
{
   LLVMTypeRef type = LLVMTypeOf(value);

   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
      return LLVMGetVectorSize(type);
   return 1;
}
131
/* Extract element "index" of "value". Scalars pass through unchanged
 * (index must then be 0).
 */
LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index)
{
   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
      LLVMValueRef idx = LLVMConstInt(ac->i32, index, false);
      return LLVMBuildExtractElement(ac->builder, value, idx, "");
   }

   assert(index == 0);
   return value;
}
141
/* Return the bit width of one element of "type" (the element type for
 * vectors). Only LDS pointers are handled (32 bits); other pointer address
 * spaces fall through to the float comparisons and then the unreachable.
 */
int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
{
   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
      type = LLVMGetElementType(type);

   if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
      return LLVMGetIntTypeWidth(type);

   if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
      /* LDS pointers are 32-bit. */
      if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_LDS)
         return 32;
   }

   if (type == ctx->f16)
      return 16;
   if (type == ctx->f32)
      return 32;
   if (type == ctx->f64)
      return 64;

   unreachable("Unhandled type kind in get_elem_bits");
}
164
/* Size of "type" in bytes. Pointers count as 8 bytes, except in the 32-bit
 * constant address space where they count as 4.
 */
unsigned ac_get_type_size(LLVMTypeRef type)
{
   const LLVMTypeKind kind = LLVMGetTypeKind(type);

   if (kind == LLVMIntegerTypeKind)
      return LLVMGetIntTypeWidth(type) / 8;
   if (kind == LLVMHalfTypeKind)
      return 2;
   if (kind == LLVMFloatTypeKind)
      return 4;
   if (kind == LLVMDoubleTypeKind)
      return 8;
   if (kind == LLVMPointerTypeKind)
      return LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT ? 4 : 8;
   if (kind == LLVMVectorTypeKind)
      return LLVMGetVectorSize(type) * ac_get_type_size(LLVMGetElementType(type));
   if (kind == LLVMArrayTypeKind)
      return LLVMGetArrayLength(type) * ac_get_type_size(LLVMGetElementType(type));

   assert(0);
   return 0;
}
191
/* Map a scalar type to the integer type of the same bit width. */
static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i1)
      return ctx->i1;
   if (t == ctx->i8)
      return ctx->i8;
   if (t == ctx->i16 || t == ctx->f16)
      return ctx->i16;
   if (t == ctx->i32 || t == ctx->f32)
      return ctx->i32;
   if (t == ctx->i64 || t == ctx->f64)
      return ctx->i64;

   unreachable("Unhandled integer size");
}
207
/* Return the integer type matching "t" in width. Vectors are converted
 * element-wise; pointers map to i64 or i32 depending on address space.
 */
LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   const LLVMTypeKind kind = LLVMGetTypeKind(t);

   if (kind == LLVMVectorTypeKind) {
      LLVMTypeRef elem = to_integer_type_scalar(ctx, LLVMGetElementType(t));
      return LLVMVectorType(elem, LLVMGetVectorSize(t));
   }

   if (kind == LLVMPointerTypeKind) {
      unsigned addr_space = LLVMGetPointerAddressSpace(t);

      if (addr_space == AC_ADDR_SPACE_GLOBAL || addr_space == AC_ADDR_SPACE_CONST)
         return ctx->i64;
      if (addr_space == AC_ADDR_SPACE_CONST_32BIT || addr_space == AC_ADDR_SPACE_LDS)
         return ctx->i32;
      unreachable("unhandled address space");
   }

   return to_integer_type_scalar(ctx, t);
}
228
/* Reinterpret "v" as an integer of the same width (ptrtoint for pointers,
 * bitcast otherwise).
 */
LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef src_type = LLVMTypeOf(v);
   LLVMTypeRef dst_type = ac_to_integer_type(ctx, src_type);

   if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
      return LLVMBuildPtrToInt(ctx->builder, v, dst_type, "");
   return LLVMBuildBitCast(ctx->builder, v, dst_type, "");
}
237
/* Like ac_to_integer, but pointers are returned unchanged. */
LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   bool is_pointer = LLVMGetTypeKind(LLVMTypeOf(v)) == LLVMPointerTypeKind;

   return is_pointer ? v : ac_to_integer(ctx, v);
}
245
/* Map a scalar type to the float type of the same bit width (i8 has no
 * float counterpart and is returned as-is).
 */
static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i8)
      return ctx->i8;
   if (t == ctx->f16 || t == ctx->i16)
      return ctx->f16;
   if (t == ctx->f32 || t == ctx->i32)
      return ctx->f32;
   if (t == ctx->f64 || t == ctx->i64)
      return ctx->f64;

   unreachable("Unhandled float size");
}
259
/* Return the float type matching "t" in width; vectors element-wise. */
LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (LLVMGetTypeKind(t) != LLVMVectorTypeKind)
      return to_float_type_scalar(ctx, t);

   LLVMTypeRef elem = to_float_type_scalar(ctx, LLVMGetElementType(t));
   return LLVMVectorType(elem, LLVMGetVectorSize(t));
}
268
/* Bitcast "v" to its float counterpart type. */
LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, LLVMTypeOf(v)), "");
}
274
/* Declare (if not already present) and call an intrinsic/external function.
 *
 * The function type is derived from return_type plus the types of params.
 * attrib_mask bits: AC_ATTR_INVARIANT_LOAD tags the call with
 * invariant.load metadata; AC_ATTR_CONVERGENT marks the call convergent.
 * Every call site is additionally marked "nounwind".
 */
LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
                                LLVMTypeRef return_type, LLVMValueRef *params, unsigned param_count,
                                unsigned attrib_mask)
{
   LLVMValueRef call;

   /* Collect the parameter types to build the function signature. */
   LLVMTypeRef param_types[32];
   assert(param_count <= 32);
   for (unsigned i = 0; i < param_count; ++i) {
      assert(params[i]);
      param_types[i] = LLVMTypeOf(params[i]);
   }

   LLVMTypeRef function_type = LLVMFunctionType(return_type, param_types, param_count, 0);
   LLVMValueRef function = LLVMGetNamedFunction(ctx->module, name);

   /* Declare the function on first use within this module. */
   if (!function) {
      function = LLVMAddFunction(ctx->module, name, function_type);

      LLVMSetFunctionCallConv(function, LLVMCCallConv);
      LLVMSetLinkage(function, LLVMExternalLinkage);
   }

   call = LLVMBuildCall2(ctx->builder, function_type, function, params, param_count, "");

   if (attrib_mask & AC_ATTR_INVARIANT_LOAD)
      LLVMSetMetadata(call, ctx->invariant_load_md_kind, ctx->empty_md);

   /* -1 is the function index, i.e. the attribute applies to the call
    * itself rather than to a parameter. */
   if (attrib_mask & AC_ATTR_CONVERGENT)
      LLVMAddCallSiteAttribute(call, -1, ac_get_llvm_attribute(ctx->context, "convergent"));

   LLVMAddCallSiteAttribute(call, -1, ac_get_llvm_attribute(ctx->context, "nounwind"));
   return call;
}
309
310 /**
311 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
312 * intrinsic names).
313 */
ac_build_type_name_for_intr(LLVMTypeRef type,char * buf,unsigned bufsize)314 void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
315 {
316 LLVMTypeRef elem_type = type;
317
318 if (LLVMGetTypeKind(type) == LLVMStructTypeKind) {
319 unsigned count = LLVMCountStructElementTypes(type);
320 int ret = snprintf(buf, bufsize, "sl_");
321 buf += ret;
322 bufsize -= ret;
323
324 LLVMTypeRef *elems = alloca(count * sizeof(LLVMTypeRef));
325 LLVMGetStructElementTypes(type, elems);
326
327 for (unsigned i = 0; i < count; i++) {
328 ac_build_type_name_for_intr(elems[i], buf, bufsize);
329 ret = strlen(buf);
330 buf += ret;
331 bufsize -= ret;
332 }
333
334 snprintf(buf, bufsize, "s");
335 return;
336 }
337
338 assert(bufsize >= 8);
339 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
340 int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type));
341 if (ret < 0) {
342 char *type_name = LLVMPrintTypeToString(type);
343 fprintf(stderr, "Error building type name for: %s\n", type_name);
344 LLVMDisposeMessage(type_name);
345 return;
346 }
347 elem_type = LLVMGetElementType(type);
348 buf += ret;
349 bufsize -= ret;
350 }
351 switch (LLVMGetTypeKind(elem_type)) {
352 default:
353 break;
354 case LLVMIntegerTypeKind:
355 snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
356 break;
357 case LLVMHalfTypeKind:
358 snprintf(buf, bufsize, "f16");
359 break;
360 case LLVMFloatTypeKind:
361 snprintf(buf, bufsize, "f32");
362 break;
363 case LLVMDoubleTypeKind:
364 snprintf(buf, bufsize, "f64");
365 break;
366 }
367 }
368
369 /**
370 * Helper function that builds an LLVM IR PHI node and immediately adds
371 * incoming edges.
372 */
ac_build_phi(struct ac_llvm_context * ctx,LLVMTypeRef type,unsigned count_incoming,LLVMValueRef * values,LLVMBasicBlockRef * blocks)373 LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming,
374 LLVMValueRef *values, LLVMBasicBlockRef *blocks)
375 {
376 LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
377 LLVMAddIncoming(phi, values, blocks, count_incoming);
378 return phi;
379 }
380
/* Emit a workgroup barrier (s_barrier). */
void ac_build_s_barrier(struct ac_llvm_context *ctx, gl_shader_stage stage)
{
   /* GFX6 only: s_barrier isn't needed in TCS because an entire patch always
    * fits into a single wave due to a bug workaround disallowing multi-wave
    * HS workgroups.
    */
   bool skip = ctx->gfx_level == GFX6 && stage == MESA_SHADER_TESS_CTRL;

   if (!skip)
      ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, 0);
}
391
/* Prevent optimizations (at least of memory accesses) across the current
 * point in the program by emitting empty inline assembly that is marked as
 * having side effects.
 *
 * Optionally, a value can be passed through the inline assembly to prevent
 * LLVM from hoisting calls to ReadNone functions.
 */
void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pgpr, bool sgpr)
{
   /* Unique comment text per barrier so LLVM can't merge distinct ones. */
   static int counter = 0;

   LLVMBuilderRef builder = ctx->builder;
   char code[16];
   /* "=s,0": output tied to an SGPR input; "=v,0": tied to a VGPR input. */
   const char *constraint = sgpr ? "=s,0" : "=v,0";

   snprintf(code, sizeof(code), "; %d", (int)p_atomic_inc_return(&counter));

   if (!pgpr) {
      /* Barrier only: void inline asm with side effects, no value threaded. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
      LLVMBuildCall2(builder, ftype, inlineasm, NULL, 0, "");
   } else if (LLVMTypeOf(*pgpr) == ctx->i32) {
      /* Simple version for i32 that allows the caller to set LLVM metadata on the call
       * instruction. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");
   } else if (LLVMTypeOf(*pgpr) == ctx->i16) {
      /* Simple version for i16 that allows the caller to set LLVM metadata on the call
       * instruction. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i16, &ctx->i16, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");
   } else if (LLVMGetTypeKind(LLVMTypeOf(*pgpr)) == LLVMPointerTypeKind) {
      /* Thread the pointer value itself through the asm. */
      LLVMTypeRef type = LLVMTypeOf(*pgpr);
      LLVMTypeRef ftype = LLVMFunctionType(type, &type, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");
   } else {
      /* General case: view the value as an <N x i32> vector and pass only
       * element 0 through an i32 asm, then reassemble. Threading one lane
       * is enough to anchor the whole value at this program point. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);
      LLVMTypeRef type = LLVMTypeOf(*pgpr);
      unsigned bitsize = ac_get_elem_bits(ctx, type);
      LLVMValueRef vgpr = *pgpr;
      LLVMTypeRef vgpr_type;
      unsigned vgpr_size;
      LLVMValueRef vgpr0;

      /* Sub-dword values are widened to i32 for the bitcast below. */
      if (bitsize < 32)
         vgpr = LLVMBuildZExt(ctx->builder, vgpr, ctx->i32, "");

      vgpr_type = LLVMTypeOf(vgpr);
      vgpr_size = ac_get_type_size(vgpr_type);

      assert(vgpr_size % 4 == 0);

      vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
      vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
      vgpr0 = LLVMBuildCall2(builder, ftype, inlineasm, &vgpr0, 1, "");
      vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
      vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");

      if (bitsize < 32)
         vgpr = LLVMBuildTrunc(builder, vgpr, type, "");

      *pgpr = vgpr;
   }
}
463
ac_build_shader_clock(struct ac_llvm_context * ctx,mesa_scope scope)464 LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, mesa_scope scope)
465 {
466 if (ctx->gfx_level >= GFX11 && scope == SCOPE_DEVICE) {
467 const char *name = "llvm.amdgcn.s.sendmsg.rtn.i64";
468 LLVMValueRef arg = LLVMConstInt(ctx->i32, 0x83 /* realtime */, 0);
469 LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, &arg, 1, 0);
470 return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
471 }
472
473 const char *subgroup = "llvm.readcyclecounter";
474 const char *name = scope == SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : subgroup;
475
476 LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0);
477 return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
478 }
479
/* Return a wave-wide bitmask (iN_wavemask) with one bit per lane, set in
 * lanes where "value" is non-zero. Implemented via llvm.amdgcn.icmp
 * (value != 0); i1 inputs are zero-extended to i32 first.
 */
LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *name;

   if (LLVMTypeOf(value) == ctx->i1)
      value = LLVMBuildZExt(ctx->builder, value, ctx->i32, "");

   /* The result type of the icmp intrinsic must match the wave size. */
   if (ctx->wave_size == 64)
      name = "llvm.amdgcn.icmp.i64.i32";
   else
      name = "llvm.amdgcn.icmp.i32.i32";

   LLVMValueRef args[3] = {value, ctx->i32_0, LLVMConstInt(ctx->i32, LLVMIntNE, 0)};

   /* We currently have no other way to prevent LLVM from lifting the icmp
    * calls to a dominating basic block.
    */
   ac_build_optimization_barrier(ctx, &args[0], false);

   args[0] = ac_to_integer(ctx, args[0]);

   return ac_build_intrinsic(ctx, name, ctx->iN_wavemask, args, 3, 0);
}
503
/* Convert a per-lane i1 into a wave-wide SGPR bitmask via llvm.amdgcn.icmp
 * (value != false).
 */
LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *intr_name =
      ctx->wave_size == 64 ? "llvm.amdgcn.icmp.i64.i1" : "llvm.amdgcn.icmp.i32.i1";
   LLVMValueRef args[] = {
      value,
      ctx->i1false,
      LLVMConstInt(ctx->i32, LLVMIntNE, 0),
   };

   return ac_build_intrinsic(ctx, intr_name, ctx->iN_wavemask, args, 3, 0);
}
521
/* True iff "value" is non-zero in every active lane. */
LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef active = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef voted = ac_build_ballot(ctx, value);

   return LLVMBuildICmp(ctx->builder, LLVMIntEQ, voted, active, "");
}
528
/* True iff "value" is non-zero in at least one active lane. */
LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef voted = ac_build_ballot(ctx, value);
   LLVMValueRef zero = LLVMConstInt(ctx->iN_wavemask, 0, 0);

   return LLVMBuildICmp(ctx->builder, LLVMIntNE, voted, zero, "");
}
535
/* True iff all active lanes agree on "value" being zero or all agree on it
 * being non-zero.
 */
LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef active = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef voted = ac_build_ballot(ctx, value);
   LLVMValueRef zero = LLVMConstInt(ctx->iN_wavemask, 0, 0);

   LLVMValueRef everyone = LLVMBuildICmp(builder, LLVMIntEQ, voted, active, "");
   LLVMValueRef nobody = LLVMBuildICmp(builder, LLVMIntEQ, voted, zero, "");
   return LLVMBuildOr(builder, everyone, nobody, "");
}
546
/* Gather values[component .. component + value_count - 1] into a vector.
 * With value_count == 1, values[component] is returned as a scalar.
 */
LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                            unsigned value_count, unsigned component)
{
   if (!value_count)
      unreachable("value_count is 0");
   if (value_count == 1)
      return values[component];

   LLVMValueRef vec =
      LLVMGetUndef(LLVMVectorType(LLVMTypeOf(values[component]), value_count));

   for (unsigned slot = 0; slot < value_count; slot++) {
      LLVMValueRef idx = LLVMConstInt(ctx->i32, slot, false);
      vec = LLVMBuildInsertElement(ctx->builder, vec, values[component + slot], idx, "");
   }
   return vec;
}
567
/* Gather value_count values (strided by value_stride elements) into a
 * vector. A single value stays scalar unless always_vector is set.
 */
LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                             unsigned value_count, unsigned value_stride,
                                             bool always_vector)
{
   if (!value_count)
      unreachable("value_count is 0");
   if (value_count == 1 && !always_vector)
      return values[0];

   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(values[0]), value_count));

   for (unsigned slot = 0; slot < value_count; slot++) {
      LLVMValueRef idx = LLVMConstInt(ctx->i32, slot, false);
      vec = LLVMBuildInsertElement(builder, vec, values[slot * value_stride], idx, "");
   }
   return vec;
}
591
/* Gather consecutive values into a vector (scalar passthrough for one
 * value): unit-stride wrapper around ac_build_gather_values_extended.
 */
LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                    unsigned value_count)
{
   const unsigned stride = 1;

   return ac_build_gather_values_extended(ctx, values, value_count, stride, false);
}
597
/* Concatenate the components of "a" and "b" into one vector. A NULL "a"
 * yields "b" unchanged.
 */
LLVMValueRef ac_build_concat(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   if (!a)
      return b;

   unsigned num_a = ac_get_llvm_num_components(a);
   unsigned num_b = ac_get_llvm_num_components(b);
   unsigned total = num_a + num_b;
   LLVMValueRef *parts = alloca(total * sizeof(LLVMValueRef));

   for (unsigned i = 0; i < total; i++)
      parts[i] = i < num_a ? ac_llvm_extract_elem(ctx, a, i)
                           : ac_llvm_extract_elem(ctx, b, i - num_a);

   return ac_build_gather_values(ctx, parts, total);
}
614
615 /* Expand a scalar or vector to <dst_channels x type> by filling the remaining
616 * channels with undef. Extract at most src_channels components from the input.
617 */
ac_build_expand(struct ac_llvm_context * ctx,LLVMValueRef value,unsigned src_channels,unsigned dst_channels)618 LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,
619 unsigned src_channels, unsigned dst_channels)
620 {
621 LLVMTypeRef elemtype;
622 LLVMValueRef *const chan = alloca(dst_channels * sizeof(LLVMValueRef));
623
624 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
625 unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
626
627 if (src_channels == dst_channels && vec_size == dst_channels)
628 return value;
629
630 src_channels = MIN2(src_channels, vec_size);
631
632 for (unsigned i = 0; i < src_channels; i++)
633 chan[i] = ac_llvm_extract_elem(ctx, value, i);
634
635 elemtype = LLVMGetElementType(LLVMTypeOf(value));
636 } else {
637 if (src_channels) {
638 assert(src_channels == 1);
639 chan[0] = value;
640 }
641 elemtype = LLVMTypeOf(value);
642 }
643
644 for (unsigned i = src_channels; i < dst_channels; i++)
645 chan[i] = LLVMGetUndef(elemtype);
646
647 return ac_build_gather_values(ctx, chan, dst_channels);
648 }
649
650 /* Extract components [start, start + channels) from a vector.
651 */
ac_extract_components(struct ac_llvm_context * ctx,LLVMValueRef value,unsigned start,unsigned channels)652 LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start,
653 unsigned channels)
654 {
655 LLVMValueRef *const chan = alloca(channels * sizeof(LLVMValueRef));
656
657 for (unsigned i = 0; i < channels; i++)
658 chan[i] = ac_llvm_extract_elem(ctx, value, i + start);
659
660 return ac_build_gather_values(ctx, chan, channels);
661 }
662
663 /* Expand a scalar or vector to <4 x type> by filling the remaining channels
664 * with undef. Extract at most num_channels components from the input.
665 */
ac_build_expand_to_vec4(struct ac_llvm_context * ctx,LLVMValueRef value,unsigned num_channels)666 LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value,
667 unsigned num_channels)
668 {
669 return ac_build_expand(ctx, value, num_channels, 4);
670 }
671
/* Round to nearest even via llvm.rint, selected by operand size. */
LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMTypeRef type = LLVMTypeOf(value);
   unsigned size = ac_get_type_size(type);
   const char *intr = size == 2 ? "llvm.rint.f16"
                    : size == 4 ? "llvm.rint.f32"
                                : "llvm.rint.f64";

   return ac_build_intrinsic(ctx, intr, type, &value, 1, 0);
}
686
/* Approximate division: num * rcp(den), using the hardware reciprocal
 * intrinsic selected by the denominator's size.
 */
LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den)
{
   LLVMTypeRef type = LLVMTypeOf(den);
   unsigned size = ac_get_type_size(type);
   const char *intr = size == 2 ? "llvm.amdgcn.rcp.f16"
                    : size == 4 ? "llvm.amdgcn.rcp.f32"
                                : "llvm.amdgcn.rcp.f64";

   LLVMValueRef rcp = ac_build_intrinsic(ctx, intr, type, &den, 1, 0);

   return LLVMBuildFMul(ctx->builder, num, rcp, "");
}
704
/* See fast_idiv_by_const.h. */
/* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, LLVMValueRef num,
                                LLVMValueRef multiplier, LLVMValueRef pre_shift,
                                LLVMValueRef post_shift, LLVMValueRef increment)
{
   LLVMBuilderRef builder = ctx->builder;

   /* Computes (((num >> pre_shift) * multiplier + increment) >> 32)
    * >> post_shift, with the multiply-add done in 64 bits so the high
    * half of the product is retained. */
   num = LLVMBuildLShr(builder, num, pre_shift, "");
   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
   num = LLVMBuildAdd(builder, num, LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
   /* Keep only the high 32 bits of the 64-bit result. */
   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
   return LLVMBuildLShr(builder, num, post_shift, "");
}
721
/* See fast_idiv_by_const.h. */
/* If num != UINT_MAX, this more efficient version can be used. */
/* Set: increment = util_fast_udiv_info::increment; */
LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, LLVMValueRef num,
                                    LLVMValueRef multiplier, LLVMValueRef pre_shift,
                                    LLVMValueRef post_shift, LLVMValueRef increment)
{
   LLVMBuilderRef builder = ctx->builder;

   /* Like ac_build_fast_udiv, but the increment is added to "num" before
    * the widening multiply; the add is marked nuw, which is valid because
    * num != UINT_MAX (caller's precondition). */
   num = LLVMBuildLShr(builder, num, pre_shift, "");
   num = LLVMBuildNUWAdd(builder, num, increment, "");
   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
   /* High 32 bits of the 64-bit product. */
   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
   return LLVMBuildLShr(builder, num, post_shift, "");
}
739
/* See fast_idiv_by_const.h. */
/* Both operands must fit in 31 bits and the divisor must not be 1. */
LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, LLVMValueRef num,
                                              LLVMValueRef multiplier, LLVMValueRef post_shift)
{
   LLVMBuilderRef builder = ctx->builder;

   /* ((num * multiplier) >> 32) >> post_shift — no pre-shift or increment
    * is needed under the caller's 31-bit precondition. */
   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
   /* High 32 bits of the 64-bit product. */
   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
   return LLVMBuildLShr(builder, num, post_shift, "");
}
753
/* Interpolate a 32-bit FS input at barycentrics (i, j).
 *
 * llvm_chan selects the component, attr_number the attribute slot, and
 * "params" is the interpolation descriptor argument (presumably prim_mask
 * on pre-GFX11 and the attribute-offset/new-prim-mask arg on GFX11 —
 * confirm at the callers).
 *
 * GFX11+: the per-vertex values are loaded from LDS (lds.param.load) and
 * interpolated with interp.inreg.p10/p2. Older chips use the legacy
 * two-step interp.p1/interp.p2 pair.
 */
LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                LLVMValueRef j)
{
   LLVMValueRef args[5];

   if (ctx->gfx_level >= GFX11) {
      LLVMValueRef p;
      LLVMValueRef p10;

      args[0] = llvm_chan;
      args[1] = attr_number;
      args[2] = params;

      /* Load the packed per-vertex parameter from LDS. */
      p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
                             ctx->f32, args, 3, 0);

      args[0] = p;
      args[1] = i;
      args[2] = p;

      /* First interpolation step using the i coordinate. */
      p10 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p10",
                               ctx->f32, args, 3, 0);

      args[0] = p;
      args[1] = j;
      args[2] = p10;

      /* Second step using the j coordinate produces the final value. */
      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p2",
                                ctx->f32, args, 3, 0);

   } else {
      LLVMValueRef p1;

      args[0] = i;
      args[1] = llvm_chan;
      args[2] = attr_number;
      args[3] = params;

      p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
                              ctx->f32, args, 4, 0);

      args[0] = p1;
      args[1] = j;
      args[2] = llvm_chan;
      args[3] = attr_number;
      args[4] = params;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
                                ctx->f32, args, 5, 0);
   }
}
806
/* Interpolate a 16-bit FS input at barycentrics (i, j); returns f16.
 *
 * Same structure as ac_build_fs_interp, but using the .f16 variants of the
 * interpolation intrinsics. high_16bits selects which half of the 32-bit
 * attribute dword holds the value.
 */
LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                    LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                    LLVMValueRef j, bool high_16bits)
{
   LLVMValueRef args[6];

   if (ctx->gfx_level >= GFX11) {
      LLVMValueRef p;
      LLVMValueRef p10;

      args[0] = llvm_chan;
      args[1] = attr_number;
      args[2] = params;

      /* Load the packed per-vertex parameter from LDS. */
      p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
                             ctx->f32, args, 3, 0);

      args[0] = p;
      args[1] = i;
      args[2] = p;
      args[3] = high_16bits ? ctx->i1true : ctx->i1false;

      /* First interpolation step (i); the i1 selects the high half. */
      p10 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p10.f16",
                               ctx->f32, args, 4, 0);

      args[0] = p;
      args[1] = j;
      args[2] = p10;
      args[3] = high_16bits ? ctx->i1true : ctx->i1false;

      /* Second step (j) yields the final f16 result. */
      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p2.f16",
                                ctx->f16, args, 4, 0);

   } else {
      LLVMValueRef p1;

      args[0] = i;
      args[1] = llvm_chan;
      args[2] = attr_number;
      args[3] = high_16bits ? ctx->i1true : ctx->i1false;
      args[4] = params;

      p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", ctx->f32, args, 5,
                              0);

      args[0] = p1;
      args[1] = j;
      args[2] = llvm_chan;
      args[3] = attr_number;
      args[4] = high_16bits ? ctx->i1true : ctx->i1false;
      args[5] = params;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", ctx->f16, args, 6,
                                0);
   }
}
863
/* Read a fragment shader input without interpolation (flat-style read of
 * one of the per-primitive parameter values selected by "parameter").
 */
LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, unsigned parameter,
                                    LLVMValueRef llvm_chan, LLVMValueRef attr_number,
                                    LLVMValueRef params)
{
   if (ctx->gfx_level >= GFX11) {
      /* GFX11+: load from LDS, then broadcast the selected lane across the quad. */
      LLVMValueRef load_args[3] = {llvm_chan, attr_number, params};
      LLVMValueRef v = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
                                          ctx->f32, load_args, 3, 0);
      v = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32, &v, 1, 0);
      v = ac_build_quad_swizzle(ctx, v, parameter, parameter, parameter, parameter);
      return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32, &v, 1, 0);
   }

   /* Pre-GFX11: v_interp_mov; (parameter + 2) % 3 remaps the parameter
    * index to the immediate encoding the intrinsic expects. */
   LLVMValueRef mov_args[4] = {
      LLVMConstInt(ctx->i32, (parameter + 2) % 3, 0),
      llvm_chan,
      attr_number,
      params,
   };
   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", ctx->f32, mov_args, 4, 0);
}
891
/* Build &base_ptr[index] with a single-index GEP. */
LLVMValueRef ac_build_gep_ptr(struct ac_llvm_context *ctx, LLVMTypeRef type, LLVMValueRef base_ptr,
                              LLVMValueRef index)
{
   LLVMValueRef gep_index[1] = {index};
   return LLVMBuildGEP2(ctx->builder, type, base_ptr, gep_index, 1, "");
}
897
/* Return the pointee type that a gep0 (GEP with a leading zero index)
 * through "pointee_type" produces, given the second index.
 */
LLVMTypeRef ac_build_gep0_type(LLVMTypeRef pointee_type, LLVMValueRef index)
{
   const LLVMTypeKind kind = LLVMGetTypeKind(pointee_type);

   /* Opaque pointers: the GEP result type is the pointer type itself. */
   if (kind == LLVMPointerTypeKind)
      return pointee_type;

   /* Pointer-to-array: GEP2 yields a pointer to the element type. */
   if (kind == LLVMArrayTypeKind)
      return LLVMGetElementType(pointee_type);

   /* Pointer-to-struct: GEP2 yields a pointer to the index-th field. */
   if (kind == LLVMStructTypeKind)
      return LLVMStructGetTypeAtIndex(pointee_type, LLVMConstIntGetZExtValue(index));

   /* gep0 shouldn't receive any other types. */
   assert(false);
   return NULL;
}
919
/* Build &ptr[0][index]: a GEP with a leading zero index. */
LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index)
{
   LLVMValueRef gep_indices[] = {ctx->i32_0, index};
   return LLVMBuildGEP2(ctx->builder, ptr.t, ptr.v, gep_indices, 2, "");
}
929
/* Advance ptr by "index" elements of "type". */
LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMTypeRef type, LLVMValueRef ptr, LLVMValueRef index)
{
   LLVMValueRef offset[1] = {index};
   return LLVMBuildGEP2(ctx->builder, type, ptr, offset, 1, "");
}
934
/* Store value at &ptr[0][index]. */
void ac_build_indexed_store(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index,
                            LLVMValueRef value)
{
   LLVMValueRef slot = ac_build_gep0(ctx, ptr, index);
   LLVMBuildStore(ctx->builder, value, slot);
}
940
941 /**
942 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
943 * It's equivalent to doing a load from &base_ptr[index].
944 *
945 * \param base_ptr Where the array starts.
946 * \param index The element index into the array.
947 * \param uniform Whether the base_ptr and index can be assumed to be
948 * dynamically uniform (i.e. load to an SGPR)
949 * \param invariant Whether the load is invariant (no other opcodes affect it)
950 * \param no_unsigned_wraparound
951 * For all possible re-associations and re-distributions of an expression
952 * "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
953 * without inbounds in base_ptr), this parameter is true if "addr + offset"
954 * does not result in an unsigned integer wraparound. This is used for
955 * optimal code generation of 32-bit pointer arithmetic.
956 *
957 * For example, a 32-bit immediate offset that causes a 32-bit unsigned
958 * integer wraparound can't be an imm offset in s_load_dword, because
959 * the instruction performs "addr + offset" in 64 bits.
960 *
961 * Expected usage for bindless textures by chaining GEPs:
962 * // possible unsigned wraparound, don't use InBounds:
963 * ptr1 = LLVMBuildGEP(base_ptr, index);
964 * image = load(ptr1); // becomes "s_load ptr1, 0"
965 *
966 * ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
967 * sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
968 */
static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMTypeRef type,
                                         LLVMValueRef base_ptr, LLVMValueRef index,
                                         bool uniform, bool invariant, bool no_unsigned_wraparound)
{
   /* InBounds is only emitted when the caller guarantees no unsigned
    * wraparound, and only for 32-bit const address space pointers where
    * it enables immediate offsets in s_load (see the comment above). */
   const bool use_inbounds =
      no_unsigned_wraparound &&
      LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT;

   LLVMValueRef pointer =
      use_inbounds ? LLVMBuildInBoundsGEP2(ctx->builder, type, base_ptr, &index, 1, "")
                   : LLVMBuildGEP2(ctx->builder, type, base_ptr, &index, 1, "");

   /* Uniform metadata hints the backend to use scalar (SGPR) loads. */
   if (uniform)
      LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);

   LLVMValueRef result = LLVMBuildLoad2(ctx->builder, type, pointer, "");
   if (invariant)
      LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
   LLVMSetAlignment(result, 4);
   return result;
}
989
/* Plain indexed load: not uniform, not invariant, no wraparound guarantee. */
LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index)
{
   return ac_build_load_custom(ctx, ptr.t, ptr.v, index,
                               /*uniform=*/false, /*invariant=*/false,
                               /*no_unsigned_wraparound=*/false);
}
994
/* Indexed load marked invariant (memory is not written elsewhere). */
LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr,
                                     LLVMValueRef index)
{
   return ac_build_load_custom(ctx, ptr.t, ptr.v, index,
                               /*uniform=*/false, /*invariant=*/true,
                               /*no_unsigned_wraparound=*/false);
}
1000
1001 /* This assumes that there is no unsigned integer wraparound during the address
1002 * computation, excluding all GEPs within base_ptr. */
LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr,
                                   LLVMValueRef index)
{
   return ac_build_load_custom(ctx, ptr.t, ptr.v, index,
                               /*uniform=*/true, /*invariant=*/true,
                               /*no_unsigned_wraparound=*/true);
}
1008
1009 /* See ac_build_load_custom() documentation. */
LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index)
{
   /* Same as ac_build_load_to_sgpr, but without the no-wraparound guarantee. */
   return ac_build_load_custom(ctx, ptr.t, ptr.v, index,
                               /*uniform=*/true, /*invariant=*/true,
                               /*no_unsigned_wraparound=*/false);
}
1014
/* Return the HW cache-policy bits (glc/slc/dlc-style encoding; see the
 * ac_glc/ac_slc/ac_dlc tests in ac_build_buffer_load_format) for the
 * given GL access qualifiers.
 */
static unsigned get_cache_flags(struct ac_llvm_context *ctx, enum gl_access_qualifier access)
{
   return ac_get_hw_cache_flags(ctx->info, access).value;
}
1019
/* Emit an llvm.amdgcn.{raw,struct}.buffer.store[.format] intrinsic.
 *
 * "struct" indexing is used when vindex is non-NULL, "raw" otherwise.
 * NULL voffset/soffset default to 0. The intrinsic name is suffixed with
 * the overload type derived from "data".
 */
static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef data, LLVMValueRef vindex,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         enum gl_access_qualifier access, bool use_format)
{
   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = data;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   /* Fixed: the old "vindex ? vindex : ctx->i32_0" here was dead code —
    * the enclosing condition already guarantees vindex is non-NULL. */
   if (vindex)
      args[idx++] = vindex;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_STORE), 0);
   const char *indexing_kind = vindex ? "struct" : "raw";
   char name[256], type_name[8];

   ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", indexing_kind, type_name);
   }

   ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, 0);
}
1048
/* Typed (format-converting) buffer store; format stores take no soffset. */
void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
                                  LLVMValueRef vindex, LLVMValueRef voffset, enum gl_access_qualifier access)
{
   ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, /*soffset=*/NULL,
                                access, /*use_format=*/true);
}
1054
1055 /* buffer_store_dword(,x2,x3,x4) <- the suffix is selected by the type of vdata. */
/* buffer_store_dword(,x2,x3,x4) <- the suffix is selected by the type of vdata. */
void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                 LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                 enum gl_access_qualifier access)
{
   unsigned num_channels = ac_get_llvm_num_components(vdata);

   /* HW without vec3 support: split a 3-channel store into x2 + x1. */
   if (num_channels == 3 && !ac_has_vec3_support(ctx->gfx_level, false)) {
      LLVMValueRef elems[3];

      for (unsigned c = 0; c < 3; c++)
         elems[c] = LLVMBuildExtractElement(ctx->builder, vdata, LLVMConstInt(ctx->i32, c, 0), "");

      LLVMValueRef first_two = ac_build_gather_values(ctx, elems, 2);
      /* The third dword lives 8 bytes past the start of the vec3. */
      LLVMValueRef voffset_z = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0,
                                            LLVMConstInt(ctx->i32, 8, 0), "");

      ac_build_buffer_store_dword(ctx, rsrc, first_two, vindex, voffset, soffset, access);
      ac_build_buffer_store_dword(ctx, rsrc, elems[2], vindex, voffset_z, soffset, access);
      return;
   }

   ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), vindex, voffset, soffset,
                                access, false);
}
1082
/* Emit an llvm.amdgcn.{raw,struct}.buffer.load[.format] intrinsic.
 *
 * "struct" indexing is used when vindex is non-NULL, "raw" otherwise.
 * NULL voffset/soffset default to 0. On HW without vec3 support,
 * a 3-channel load is widened to 4 channels and trimmed afterwards.
 */
static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                                LLVMValueRef vindex, LLVMValueRef voffset,
                                                LLVMValueRef soffset, unsigned num_channels,
                                                LLVMTypeRef channel_type, enum gl_access_qualifier access,
                                                bool can_speculate, bool use_format)
{
   LLVMValueRef args[5];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (vindex)
      args[idx++] = vindex;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD), 0);
   /* Number of channels actually fetched (vec3 widened to vec4 if needed). */
   unsigned func =
      !ac_has_vec3_support(ctx->gfx_level, use_format) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = vindex ? "struct" : "raw";
   char name[256], type_name[8];

   /* D16 is only supported on gfx8+ */
   assert(!use_format || (channel_type != ctx->f16 && channel_type != ctx->i16) ||
          ctx->gfx_level >= GFX8);

   LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", indexing_kind, type_name);
   }

   LLVMValueRef result = ac_build_intrinsic(ctx, name, type, args, idx,
                                            can_speculate ? AC_ATTR_INVARIANT_LOAD : 0);
   /* Drop channels added by the vec3->vec4 widening. */
   if (func > num_channels)
      result = ac_trim_vector(ctx, result, num_channels);
   return result;
}
1122
/* Load num_channels elements of channel_type from a buffer resource.
 *
 * If allow_smem permits it, the load is emitted as per-channel scalar
 * (s.buffer.load) intrinsics; otherwise vector buffer loads are used,
 * split into chunks of at most 4 channels.
 */
LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
                                  LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                  LLVMTypeRef channel_type, enum gl_access_qualifier access,
                                  bool can_speculate, bool allow_smem)
{
   /* SMEM path; coherent accesses only take it on gfx8+. */
   if (allow_smem && (!(access & ACCESS_COHERENT) || ctx->gfx_level >= GFX8)) {
      assert(vindex == NULL);

      LLVMValueRef result[32];

      /* s.buffer.load takes a single combined offset. */
      LLVMValueRef offset = voffset ? voffset : ctx->i32_0;
      if (soffset)
         offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");

      char name[256], type_name[8];
      ac_build_type_name_for_intr(channel_type, type_name, sizeof(type_name));
      snprintf(name, sizeof(name), "llvm.amdgcn.s.buffer.load.%s", type_name);

      LLVMValueRef channel_size = LLVMConstInt(ctx->i32, ac_get_type_size(channel_type), 0);

      /* One scalar load per channel, advancing the offset each time. */
      for (int i = 0; i < num_channels; i++) {
         if (i) {
            offset = LLVMBuildAdd(ctx->builder, offset, channel_size, "");
         }
         LLVMValueRef args[3] = {
            rsrc,
            offset,
            LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD |
                                                   ACCESS_TYPE_SMEM), 0),
         };
         result[i] = ac_build_intrinsic(ctx, name, channel_type, args, 3, AC_ATTR_INVARIANT_LOAD);
      }
      if (num_channels == 1)
         return result[0];

      return ac_build_gather_values(ctx, result, num_channels);
   }

   /* LLVM is unable to select instructions for num_channels > 4, so we
    * workaround that by manually splitting larger buffer loads.
    */
   LLVMValueRef result = NULL;
   for (unsigned i = 0, fetch_num_channels; i < num_channels; i += fetch_num_channels) {
      fetch_num_channels = MIN2(4, num_channels - i);
      LLVMValueRef fetch_voffset =
         LLVMBuildAdd(ctx->builder, voffset,
                      LLVMConstInt(ctx->i32, i * ac_get_type_size(channel_type), 0), "");
      LLVMValueRef item =
         ac_build_buffer_load_common(ctx, rsrc, vindex, fetch_voffset, soffset, fetch_num_channels,
                                     channel_type, access, can_speculate, false);
      result = ac_build_concat(ctx, result, item);
   }

   return result;
}
1178
/* Typed (format-converting) buffer load.
 *
 * With tfe (texture fail enable), the result is num_channels data
 * elements plus one extra status dword; that path is emitted as inline
 * assembly because the intrinsic cannot express the extra TFE result.
 */
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef vindex, LLVMValueRef voffset,
                                         unsigned num_channels, enum gl_access_qualifier access,
                                         bool can_speculate, bool d16, bool tfe)
{
   if (tfe) {
      assert(!d16);

      unsigned cache_flags = get_cache_flags(ctx, access | ACCESS_TYPE_LOAD);

      char code[256];
      /* The definition in the assembly and the one in the constraint string
       * differs because of an assembler bug.
       */
      /* v0-v4 are zero-initialized so the TFE status dword and any
       * unwritten channels read back as 0. */
      snprintf(code, sizeof(code),
               "v_mov_b32 v0, 0\n"
               "v_mov_b32 v1, 0\n"
               "v_mov_b32 v2, 0\n"
               "v_mov_b32 v3, 0\n"
               "v_mov_b32 v4, 0\n"
               "buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe %s\n"
               "s_waitcnt vmcnt(0)",
               cache_flags & ac_glc ? "glc" : "",
               cache_flags & ac_slc ? "slc" : "",
               cache_flags & ac_dlc ? "dlc" : "");

      LLVMTypeRef param_types[] = {ctx->v2i32, ctx->v4i32};
      LLVMTypeRef calltype = LLVMFunctionType(LLVMVectorType(ctx->f32, 5), param_types, 2, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(calltype, code, "=&{v[0:4]},v,s", false, false);

      /* $1 is the packed (vindex, voffset) address pair. */
      LLVMValueRef addr_comp[2] = {vindex ? vindex : ctx->i32_0,
                                   voffset ? voffset : ctx->i32_0};

      LLVMValueRef args[] = {ac_build_gather_values(ctx, addr_comp, 2),
                             LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "")};
      LLVMValueRef res = LLVMBuildCall2(ctx->builder, calltype, inlineasm, args, 2, "");

      /* Return the data channels with the TFE status dword appended. */
      return ac_build_concat(ctx, ac_trim_vector(ctx, res, num_channels),
                             ac_llvm_extract_elem(ctx, res, 4));
   }

   return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
                                      num_channels, d16 ? ctx->f16 : ctx->f32, access,
                                      can_speculate, true);
}
1224
/* Emit an llvm.amdgcn.{raw,struct}.tbuffer.load intrinsic (MTBUF load
 * with an explicit hardware data format).
 *
 * "struct" indexing is used when vindex is non-NULL, "raw" otherwise.
 * NULL voffset/soffset default to 0.
 */
static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, unsigned num_channels,
                                          unsigned tbuffer_format, LLVMTypeRef channel_type,
                                          enum gl_access_qualifier access, bool can_speculate)
{
   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (vindex)
      args[idx++] = vindex;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, tbuffer_format, 0);
   args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD), 0);
   const char *indexing_kind = vindex ? "struct" : "raw";
   char name[256], type_name[8];

   LLVMTypeRef type = num_channels > 1 ? LLVMVectorType(channel_type, num_channels) : channel_type;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", indexing_kind, type_name);

   return ac_build_intrinsic(ctx, name, type, args, idx,
                             can_speculate ? AC_ATTR_INVARIANT_LOAD : 0);
}
1251
/* Load num_channels channels of the given format from a buffer, splitting
 * the load into multiple MTBUF instructions so that each piece is safe for
 * the alignment the caller guarantees (align_mul/align_offset plus
 * const_offset). 16-bit channels are loaded as 32-bit and truncated.
 */
LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                        LLVMValueRef vidx, LLVMValueRef base_voffset,
                                        LLVMValueRef soffset,
                                        const enum pipe_format format,
                                        unsigned channel_bit_size,
                                        unsigned const_offset,
                                        unsigned align_offset,
                                        unsigned align_mul,
                                        unsigned num_channels,
                                        enum gl_access_qualifier access,
                                        bool can_speculate)
{
   const struct ac_vtx_format_info *vtx_info = ac_get_vtx_format_info(ctx->gfx_level, ctx->info->family, format);
   const unsigned max_channels = vtx_info->num_channels;
   LLVMValueRef voffset_plus_const =
      LLVMBuildAdd(ctx->builder, base_voffset, LLVMConstInt(ctx->i32, const_offset, 0), "");

   /* Split the specified load into several MTBUF instructions,
    * according to a safe fetch size determined by aligmnent information.
    */
   LLVMValueRef result = NULL;
   for (unsigned i = 0, fetch_num_channels; i < num_channels; i += fetch_num_channels) {
      /* Packed formats (determined here by chan_byte_size == 0) should never be split. */
      assert(i == 0 || vtx_info->chan_byte_size);

      const unsigned fetch_const_offset = const_offset + i * vtx_info->chan_byte_size;
      const unsigned fetch_align_offset = (align_offset + i * vtx_info->chan_byte_size) % align_mul;
      /* The guaranteed alignment of this piece: the lowest set bit of its
       * offset within the alignment period, or align_mul if it starts on
       * a period boundary. */
      const unsigned fetch_alignment = fetch_align_offset ? 1 << (ffs(fetch_align_offset) - 1) : align_mul;

      fetch_num_channels =
         ac_get_safe_fetch_size(ctx->gfx_level, vtx_info, fetch_const_offset,
                                max_channels - i, fetch_alignment, num_channels - i);
      const unsigned fetch_format = vtx_info->hw_format[fetch_num_channels - 1];
      LLVMValueRef fetch_voffset =
         LLVMBuildAdd(ctx->builder, voffset_plus_const,
                      LLVMConstInt(ctx->i32, i * vtx_info->chan_byte_size, 0), "");
      LLVMValueRef item =
         ac_build_tbuffer_load(ctx, rsrc, vidx, fetch_voffset, soffset,
                               fetch_num_channels, fetch_format, ctx->i32,
                               access, can_speculate);
      result = ac_build_concat(ctx, result, item);
   }

   /*
    * LLVM is not able to select 16-bit typed loads. Load 32-bit values instead and
    * manually truncate them to the required size.
    * TODO: Do this in NIR instead.
    */
   const struct util_format_description *desc = util_format_description(format);
   bool is_float = !desc->channel[0].pure_integer;

   if (channel_bit_size == 16) {
      LLVMValueRef channels[4];
      for (unsigned i = 0; i < num_channels; i++) {
         LLVMValueRef channel = result;
         if (num_channels > 1)
            channel = LLVMBuildExtractElement(ctx->builder, result, LLVMConstInt(ctx->i32, i, false), "");

         if (is_float) {
            /* Float channels: narrow via an actual float conversion. */
            channel = LLVMBuildBitCast(ctx->builder, channel, ctx->f32, "");
            channel = LLVMBuildFPTrunc(ctx->builder, channel, ctx->f16, "");
            channel = LLVMBuildBitCast(ctx->builder, channel, ctx->i16, "");
         } else {
            channel = LLVMBuildTrunc(ctx->builder, channel, ctx->i16, "");
         }
         channels[i] = channel;
      }
      result = ac_build_gather_values(ctx, channels, num_channels);
   }

   return result;
}
1324
1325
/* Load a single 16-bit value (raw indexing, no vindex). */
LLVMValueRef ac_build_buffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                        LLVMValueRef voffset, LLVMValueRef soffset,
                                        enum gl_access_qualifier access)
{
   return ac_build_buffer_load_common(ctx, rsrc, /*vindex=*/NULL, voffset, soffset,
                                      /*num_channels=*/1, ctx->i16, access,
                                      /*can_speculate=*/false, /*use_format=*/false);
}
1333
/* Load a single 8-bit value (raw indexing, no vindex). */
LLVMValueRef ac_build_buffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                       LLVMValueRef voffset, LLVMValueRef soffset,
                                       enum gl_access_qualifier access)
{
   return ac_build_buffer_load_common(ctx, rsrc, /*vindex=*/NULL, voffset, soffset,
                                      /*num_channels=*/1, ctx->i8, access,
                                      /*can_speculate=*/false, /*use_format=*/false);
}
1341
/* Store a single 16-bit value (raw indexing, no vindex). */
void ac_build_buffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                 LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
                                 enum gl_access_qualifier access)
{
   LLVMValueRef data16 = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
   ac_build_buffer_store_common(ctx, rsrc, data16, /*vindex=*/NULL, voffset, soffset,
                                access, /*use_format=*/false);
}
1350
/* Store a single 8-bit value (raw indexing, no vindex). */
void ac_build_buffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                LLVMValueRef voffset, LLVMValueRef soffset, enum gl_access_qualifier access)
{
   LLVMValueRef data8 = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
   ac_build_buffer_store_common(ctx, rsrc, data8, /*vindex=*/NULL, voffset, soffset,
                                access, /*use_format=*/false);
}
1358
1359 /**
1360 * Set range metadata on an instruction. This can only be used on load and
1361 * call instructions. If you know an instruction can only produce the values
1362 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1363 * \p lo is the minimum value inclusive.
1364 * \p hi is the maximum value exclusive.
1365 */
void ac_set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,
                           unsigned hi)
{
   LLVMTypeRef type = LLVMTypeOf(value);
   LLVMContextRef context = LLVMGetTypeContext(type);

   /* !range metadata is a pair of constants describing [lo, hi). */
   LLVMValueRef bounds[2] = {
      LLVMConstInt(type, lo, false),
      LLVMConstInt(type, hi, false),
   };
   LLVMValueRef range_md = LLVMMDNodeInContext(context, bounds, 2);
   LLVMSetMetadata(value, ctx->range_md_kind, range_md);
}
1378
ac_get_thread_id(struct ac_llvm_context * ctx)1379 LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx)
1380 {
1381 return ac_build_mbcnt(ctx, LLVMConstInt(ctx->iN_wavemask, ~0ull, 0));
1382 }
1383
1384 /*
1385 * AMD GCN implements derivatives using the local data store (LDS)
1386 * All writes to the LDS happen in all executing threads at
1387 * the same time. TID is the Thread ID for the current
1388 * thread and is a value between 0 and 63, representing
1389 * the thread's position in the wavefront.
1390 *
1391 * For the pixel shader threads are grouped into quads of four pixels.
1392 * The TIDs of the pixels of a quad are:
1393 *
1394 * +------+------+
1395 * |4n + 0|4n + 1|
1396 * +------+------+
1397 * |4n + 2|4n + 3|
1398 * +------+------+
1399 *
1400 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
1401 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
1402 * the current pixel's column, and masking with 0xfffffffe yields the TID
1403 * of the left pixel of the current pixel's row.
1404 *
1405 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
1406 * adding 2 yields the TID of the pixel below the top pixel.
1407 */
/* Compute a screen-space derivative of "val" using quad swizzles.
 *
 * mask selects the reference ("top-left") lane within each quad and idx
 * is the lane delta toward the neighbor (see the quad layout comment
 * above). The result is neighbor - reference, wrapped in WQM so helper
 * lanes contribute.
 */
LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int idx, LLVMValueRef val)
{
   unsigned tl_lanes[4], trbl_lanes[4];
   char name[32], type[8];
   LLVMValueRef tl, trbl;
   LLVMTypeRef result_type;
   LLVMValueRef result;

   result_type = ac_to_float_type(ctx, LLVMTypeOf(val));

   /* Widen 16-bit inputs to i32 so the quad swizzle operates on full lanes. */
   if (result_type == ctx->f16)
      val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
   else if (result_type == ctx->v2f16)
      val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, "");

   /* Per quad lane i: reference lane = i & mask, neighbor = (i & mask) + idx. */
   for (unsigned i = 0; i < 4; ++i) {
      tl_lanes[i] = i & mask;
      trbl_lanes[i] = (i & mask) + idx;
   }

   tl = ac_build_quad_swizzle(ctx, val, tl_lanes[0], tl_lanes[1], tl_lanes[2], tl_lanes[3]);
   trbl =
      ac_build_quad_swizzle(ctx, val, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3]);

   /* Narrow back before reinterpreting as the float result type. */
   if (result_type == ctx->f16) {
      tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
      trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
   }

   tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
   trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
   result = LLVMBuildFSub(ctx->builder, trbl, tl, "");

   ac_build_type_name_for_intr(result_type, type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);

   return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
}
1446
/* Emit s_sendmsg with the given immediate and m0 payload. */
void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t imm, LLVMValueRef m0_content)
{
   LLVMValueRef args[2] = {
      LLVMConstInt(ctx->i32, imm, false),
      m0_content,
   };
   ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
}
1454
LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
{
   LLVMValueRef msb =
      ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, 0);

   /* The HW returns the last bit index from MSB, but NIR/TGSI wants
    * the index from LSB. Invert it by doing "31 - msb". */
   msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), msb, "");

   /* 0 and -1 have no significant sign bit: return -1 for both. */
   LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
   LLVMValueRef is_zero = LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, ctx->i32_0, "");
   LLVMValueRef is_all_ones = LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, all_ones, "");
   LLVMValueRef degenerate = LLVMBuildOr(ctx->builder, is_zero, is_all_ones, "");

   return LLVMBuildSelect(ctx->builder, degenerate, all_ones, msb, "");
}
1471
/* Find the most significant set bit of an unsigned value of any bitsize
 * (8/16/32/64).
 *
 * When rev is false the result is the bit index counted from the LSB
 * ("highest_bit - ctlz"); when rev is true the raw ctlz count is
 * returned. The result is always widened/narrowed to i32 and is -1
 * when arg == 0.
 */
LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type,
                           bool rev)
{
   const char *intrin_name;
   LLVMTypeRef type;
   LLVMValueRef highest_bit;
   LLVMValueRef zero;
   unsigned bitsize;

   bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
   switch (bitsize) {
   case 64:
      intrin_name = "llvm.ctlz.i64";
      type = ctx->i64;
      highest_bit = LLVMConstInt(ctx->i64, 63, false);
      zero = ctx->i64_0;
      break;
   case 32:
      intrin_name = "llvm.ctlz.i32";
      type = ctx->i32;
      highest_bit = LLVMConstInt(ctx->i32, 31, false);
      zero = ctx->i32_0;
      break;
   case 16:
      intrin_name = "llvm.ctlz.i16";
      type = ctx->i16;
      highest_bit = LLVMConstInt(ctx->i16, 15, false);
      zero = ctx->i16_0;
      break;
   case 8:
      intrin_name = "llvm.ctlz.i8";
      type = ctx->i8;
      highest_bit = LLVMConstInt(ctx->i8, 7, false);
      zero = ctx->i8_0;
      break;
   default:
      unreachable("invalid bitsize");
      break;
   }

   /* Second operand i1 true = "zero input is undefined" (handled below). */
   LLVMValueRef params[2] = {
      arg,
      ctx->i1true,
   };

   LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, 0);

   if (!rev) {
      /* The HW returns the last bit index from MSB, but TGSI/NIR wants
       * the index from LSB. Invert it by doing "31 - msb". */
      msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
   }

   /* Normalize the result to i32. */
   if (bitsize == 64) {
      msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
   } else if (bitsize < 32) {
      msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");
   }

   /* check for zero */
   return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
                          LLVMConstInt(ctx->i32, -1, true), msb, "");
}
1535
/* Floating-point minimum via the type-suffixed llvm.minnum overload. */
LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   char name[64], type[64];
   LLVMValueRef args[2] = {a, b};

   ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.minnum.%s", type);
   return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, 0);
}
1545
/* Floating-point maximum via the type-suffixed llvm.maxnum overload. */
LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   char name[64], type[64];
   LLVMValueRef args[2] = {a, b};

   ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.maxnum.%s", type);
   return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, 0);
}
1555
LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   /* Signed minimum: pick b when a is strictly greater, otherwise a. */
   LLVMValueRef a_greater = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_greater, b, a, "");
}
1561
LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   /* Signed maximum: pick b when a is strictly less, otherwise a. */
   LLVMValueRef a_less = LLVMBuildICmp(ctx->builder, LLVMIntSLT, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_less, b, a, "");
}
1567
LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   /* Unsigned minimum: pick b when a is strictly greater, otherwise a. */
   LLVMValueRef a_greater = LLVMBuildICmp(ctx->builder, LLVMIntUGT, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_greater, b, a, "");
}
1573
LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   /* Unsigned maximum: pick b when a is strictly less, otherwise a. */
   LLVMValueRef a_less = LLVMBuildICmp(ctx->builder, LLVMIntULT, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_less, b, a, "");
}
1579
LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   /* Clamp "value" to [0.0, 1.0] using max followed by min. */
   LLVMTypeRef type = LLVMTypeOf(value);
   LLVMValueRef zero = LLVMConstReal(type, 0.0);
   LLVMValueRef one = LLVMConstReal(type, 1.0);
   LLVMValueRef lower_clamped = ac_build_fmax(ctx, value, zero);

   return ac_build_fmin(ctx, lower_clamped, one);
}
1586
/* Emit an export via the llvm.amdgcn.exp.* intrinsics.
 *
 * a->compr selects between a compressed export (two v2i16-packed channels,
 * asserted unavailable on GFX11+) and a full four-channel f32 export.
 * Argument order is positional and must match the LLVM intrinsic definitions.
 */
void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
{
   LLVMValueRef args[9];

   args[0] = LLVMConstInt(ctx->i32, a->target, 0);           /* export target */
   args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0); /* channel enable mask */

   if (a->compr) {
      /* Compressed exports are not supported on GFX11+. */
      assert(ctx->gfx_level < GFX11);

      args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->v2i16, "");
      args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->v2i16, "");
      args[4] = LLVMConstInt(ctx->i1, a->done, 0);       /* DONE bit */
      args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0); /* whether the EXEC mask is valid */

      ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", ctx->voidt, args, 6, 0);
   } else {
      args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->f32, "");
      args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->f32, "");
      args[4] = LLVMBuildBitCast(ctx->builder, a->out[2], ctx->f32, "");
      args[5] = LLVMBuildBitCast(ctx->builder, a->out[3], ctx->f32, "");
      args[6] = LLVMConstInt(ctx->i1, a->done, 0);       /* DONE bit */
      args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0); /* whether the EXEC mask is valid */

      ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", ctx->voidt, args, 8, 0);
   }
}
1614
void ac_build_export_null(struct ac_llvm_context *ctx, bool uses_discard)
{
   /* Gfx10+ doesn't need to export anything if we don't need to export the EXEC mask
    * for discard.
    */
   if (ctx->gfx_level >= GFX10 && !uses_discard)
      return;

   struct ac_export_args args = {
      .enabled_channels = 0x0, /* no channels enabled */
      .valid_mask = 1,         /* whether the EXEC mask is valid */
      .done = 1,               /* DONE bit */
      /* Gfx11 doesn't support null exports, and mrt0 should be exported instead. */
      .target = ctx->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL,
      .compr = 0,              /* COMPR flag (0 = 32-bit export) */
      .out = {
         LLVMGetUndef(ctx->f32), /* R */
         LLVMGetUndef(ctx->f32), /* G */
         LLVMGetUndef(ctx->f32), /* B */
         LLVMGetUndef(ctx->f32), /* A */
      },
   };

   ac_build_export(ctx, &args);
}
1638
ac_num_coords(enum ac_image_dim dim)1639 static unsigned ac_num_coords(enum ac_image_dim dim)
1640 {
1641 switch (dim) {
1642 case ac_image_1d:
1643 return 1;
1644 case ac_image_2d:
1645 case ac_image_1darray:
1646 return 2;
1647 case ac_image_3d:
1648 case ac_image_cube:
1649 case ac_image_2darray:
1650 case ac_image_2dmsaa:
1651 return 3;
1652 case ac_image_2darraymsaa:
1653 return 4;
1654 default:
1655 unreachable("ac_num_coords: bad dim");
1656 }
1657 }
1658
ac_num_derivs(enum ac_image_dim dim)1659 static unsigned ac_num_derivs(enum ac_image_dim dim)
1660 {
1661 switch (dim) {
1662 case ac_image_1d:
1663 case ac_image_1darray:
1664 return 2;
1665 case ac_image_2d:
1666 case ac_image_2darray:
1667 case ac_image_cube:
1668 return 4;
1669 case ac_image_3d:
1670 return 6;
1671 case ac_image_2dmsaa:
1672 case ac_image_2darraymsaa:
1673 default:
1674 unreachable("derivatives not supported");
1675 }
1676 }
1677
/* Map an ac_atomic_op to the sub-operation name used in
 * llvm.amdgcn.image.atomic.* intrinsic names.
 */
static const char *get_atomic_name(enum ac_atomic_op op)
{
   switch (op) {
   case ac_atomic_swap:
      return "swap";
   case ac_atomic_add:
      return "add";
   case ac_atomic_sub:
      return "sub";
   case ac_atomic_smin:
      return "smin";
   case ac_atomic_umin:
      return "umin";
   case ac_atomic_smax:
      return "smax";
   case ac_atomic_umax:
      return "umax";
   case ac_atomic_and:
      return "and";
   case ac_atomic_or:
      return "or";
   case ac_atomic_xor:
      return "xor";
   case ac_atomic_inc_wrap:
      return "inc";
   case ac_atomic_dec_wrap:
      return "dec";
   case ac_atomic_fmin:
      return "fmin";
   case ac_atomic_fmax:
      return "fmax";
   }
   unreachable("bad atomic op");
}
1712
/* Build and emit a call to one of the llvm.amdgcn.image.* intrinsics for the
 * image operation described by "a" (sample/gather/load/store/atomic/getlod/
 * getresinfo).
 *
 * Both the intrinsic name and the argument list are assembled piece by piece
 * from the optional arguments that are present (offset, bias, compare,
 * derivatives, lod, min_lod); the order of the append steps below is
 * positional and must match the LLVM intrinsic definitions.
 */
LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a)
{
   const char *overload[3] = {"", "", ""};
   unsigned num_overloads = 0;
   LLVMValueRef args[18];
   unsigned num_args = 0;
   enum ac_image_dim dim = a->dim;

   /* Validate mutually-exclusive modifiers and per-argument bit sizes. */
   assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || !a->level_zero);
   assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
           a->opcode != ac_image_store_mip) ||
          a->lod);
   assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
          (!a->compare && !a->offset));
   assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
           a->opcode == ac_image_get_lod) ||
          !a->bias);
   /* At most one of bias / lod / level_zero / derivatives. */
   assert((a->bias ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <=
          1);
   assert((a->min_lod ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) <= 1);
   assert(!a->d16 || (ctx->gfx_level >= GFX8 && a->opcode != ac_image_atomic &&
                      a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod &&
                      a->opcode != ac_image_get_resinfo));
   assert(!a->a16 || ctx->gfx_level >= GFX9);
   assert(!a->derivs[0] || a->g16 == a->a16 || ctx->gfx_level >= GFX10);

   assert(!a->offset ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->offset)) == 32);
   assert(!a->bias ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->bias)) == 32);
   assert(!a->compare ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->compare)) == 32);
   assert(!a->derivs[0] ||
          ((!a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 16) &&
           (a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 32)));
   assert(!a->coords[0] ||
          ((!a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 16) &&
           (a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 32)));
   assert(!a->lod ||
          ((a->opcode != ac_image_get_resinfo || ac_get_elem_bits(ctx, LLVMTypeOf(a->lod))) &&
           (a->opcode == ac_image_get_resinfo ||
            ac_get_elem_bits(ctx, LLVMTypeOf(a->lod)) ==
               ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])))));
   assert(!a->min_lod ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->min_lod)) ==
             ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])));

   /* For getlod, drop the array/cube component from the dimension. */
   if (a->opcode == ac_image_get_lod) {
      switch (dim) {
      case ac_image_1darray:
         dim = ac_image_1d;
         break;
      case ac_image_2darray:
      case ac_image_cube:
         dim = ac_image_2d;
         break;
      default:
         break;
      }
   }

   bool sample = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
                 a->opcode == ac_image_get_lod;
   bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap;
   bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
               a->opcode == ac_image_load || a->opcode == ac_image_load_mip;
   /* Samples take float coords, loads/stores take int coords; a16 halves the width. */
   LLVMTypeRef coord_type = sample ? (a->a16 ? ctx->f16 : ctx->f32) : (a->a16 ? ctx->i16 : ctx->i32);
   uint8_t dmask = a->dmask;
   LLVMTypeRef data_type;
   char data_type_str[32];

   if (atomic) {
      data_type = LLVMTypeOf(a->data[0]);
   } else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      /* Image stores might have been shrunk using the format. */
      data_type = LLVMTypeOf(a->data[0]);
      dmask = (1 << ac_get_llvm_num_components(a->data[0])) - 1;
   } else {
      data_type = a->d16 ? ctx->v4f16 : ctx->v4f32;
   }

   /* With TFE, the intrinsic returns {data, i32 status}. */
   if (a->tfe) {
      data_type = LLVMStructTypeInContext(
         ctx->context, (LLVMTypeRef[]){data_type, ctx->i32}, 2, false);
   }

   /* Data operands come first for stores and atomics. */
   if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      args[num_args++] = a->data[0];
      if (a->opcode == ac_image_atomic_cmpswap)
         args[num_args++] = a->data[1];
   }

   if (!atomic)
      args[num_args++] = LLVMConstInt(ctx->i32, dmask, false);

   /* Optional modifiers, in intrinsic argument order. */
   if (a->offset)
      args[num_args++] = ac_to_integer(ctx, a->offset);
   if (a->bias) {
      args[num_args++] = ac_to_float(ctx, a->bias);
      overload[num_overloads++] = ".f32";
   }
   if (a->compare)
      args[num_args++] = ac_to_float(ctx, a->compare);
   if (a->derivs[0]) {
      unsigned count = ac_num_derivs(dim);
      for (unsigned i = 0; i < count; ++i)
         args[num_args++] = ac_to_float(ctx, a->derivs[i]);
      overload[num_overloads++] = a->g16 ? ".f16" : ".f32";
   }
   /* getresinfo takes no coordinates, just the lod below. */
   unsigned num_coords = a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
   for (unsigned i = 0; i < num_coords; ++i)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
   if (a->lod)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
   if (a->min_lod)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, "");

   overload[num_overloads++] = sample ? (a->a16 ? ".f16" : ".f32") : (a->a16 ? ".i16" : ".i32");

   /* Descriptors and control arguments close the list. */
   args[num_args++] = a->resource;
   if (sample) {
      args[num_args++] = a->sampler;
      args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
   }

   args[num_args++] = a->tfe ? ctx->i32_1 : ctx->i32_0; /* texfailctrl */
   args[num_args++] = LLVMConstInt(
      ctx->i32, get_cache_flags(ctx,
                                a->access |
                                (atomic ? ACCESS_TYPE_ATOMIC :
                                 load ? ACCESS_TYPE_LOAD : ACCESS_TYPE_STORE)),
      false);

   /* Base intrinsic name from the opcode. */
   const char *name;
   const char *atomic_subop = "";
   switch (a->opcode) {
   case ac_image_sample:
      name = "sample";
      break;
   case ac_image_gather4:
      name = "gather4";
      break;
   case ac_image_load:
      name = "load";
      break;
   case ac_image_load_mip:
      name = "load.mip";
      break;
   case ac_image_store:
      name = "store";
      break;
   case ac_image_store_mip:
      name = "store.mip";
      break;
   case ac_image_atomic:
      name = "atomic.";
      atomic_subop = get_atomic_name(a->atomic);
      break;
   case ac_image_atomic_cmpswap:
      name = "atomic.";
      atomic_subop = "cmpswap";
      break;
   case ac_image_get_lod:
      name = "getlod";
      break;
   case ac_image_get_resinfo:
      name = "getresinfo";
      break;
   default:
      unreachable("invalid image opcode");
   }

   const char *dimname;
   switch (dim) {
   case ac_image_1d:
      dimname = "1d";
      break;
   case ac_image_2d:
      dimname = "2d";
      break;
   case ac_image_3d:
      dimname = "3d";
      break;
   case ac_image_cube:
      dimname = "cube";
      break;
   case ac_image_1darray:
      dimname = "1darray";
      break;
   case ac_image_2darray:
      dimname = "2darray";
      break;
   case ac_image_2dmsaa:
      dimname = "2dmsaa";
      break;
   case ac_image_2darraymsaa:
      dimname = "2darraymsaa";
      break;
   default:
      unreachable("invalid dim");
   }

   ac_build_type_name_for_intr(data_type, data_type_str, sizeof(data_type_str));

   bool lod_suffix = a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
   char intr_name[96];
   snprintf(intr_name, sizeof(intr_name),
            "llvm.amdgcn.image.%s%s" /* base name */
            "%s%s%s%s" /* sample/gather modifiers */
            ".%s.%s%s%s%s", /* dimension and type overloads */
            name, atomic_subop, a->compare ? ".c" : "",
            a->bias ? ".b" : lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "",
            a->min_lod ? ".cl" : "", a->offset ? ".o" : "", dimname,
            data_type_str, overload[0], overload[1], overload[2]);

   LLVMTypeRef retty;
   if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
      retty = ctx->voidt;
   else
      retty = data_type;

   LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes);
   if (a->tfe) {
      /* Unpack the {data, status} struct into one concatenated vector. */
      LLVMValueRef texel = LLVMBuildExtractValue(ctx->builder, result, 0, "");
      LLVMValueRef code = LLVMBuildExtractValue(ctx->builder, result, 1, "");
      result = ac_build_concat(ctx, texel, ac_to_float(ctx, code));
   }

   if (!sample && !atomic && retty != ctx->voidt)
      result = ac_to_integer(ctx, result);

   return result;
}
1946
LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMValueRef rsrc)
{
   /* Read the samples from the descriptor directly.
    * Hardware doesn't have any instruction for this.
    * Dword 3, bits 16..19 hold log2(samples); return 1 << that field.
    */
   LLVMValueRef dword3 =
      LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 3, 0), "");
   LLVMValueRef shifted =
      LLVMBuildLShr(ctx->builder, dword3, LLVMConstInt(ctx->i32, 16, 0), "");
   LLVMValueRef log2_samples =
      LLVMBuildAnd(ctx->builder, shifted, LLVMConstInt(ctx->i32, 0xf, 0), "");
   return LLVMBuildShl(ctx->builder, ctx->i32_1, log2_samples, "");
}
1960
LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   /* Pack two f32 values into a v2f16 via the cvt.pkrtz intrinsic. */
   LLVMValueRef packed =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16, args, 2, 0);
   return packed;
}
1965
LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   /* Pack via the intrinsic, then hand the v2i16 back to callers as an i32. */
   LLVMValueRef packed =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", ctx->v2i16, args, 2, 0);
   return LLVMBuildBitCast(ctx->builder, packed, ctx->i32, "");
}
1971
LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   /* Pack via the intrinsic, then hand the v2i16 back to callers as an i32. */
   LLVMValueRef packed =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", ctx->v2i16, args, 2, 0);
   return LLVMBuildBitCast(ctx->builder, packed, ctx->i32, "");
}
1977
/* Pack two f16 values into a normalized signed 16-bit pair (result as i32),
 * emitted via inline assembly. The mnemonic was renamed from
 * v_cvt_pknorm_i16_f16 to v_cvt_pk_norm_i16_f16 on GFX11.
 */
LLVMValueRef ac_build_cvt_pknorm_i16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};
   LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false);
   LLVMValueRef code = LLVMConstInlineAsm(calltype,
                                          ctx->gfx_level >= GFX11 ?
                                             "v_cvt_pk_norm_i16_f16 $0, $1, $2" :
                                             "v_cvt_pknorm_i16_f16 $0, $1, $2",
                                          "=v,v,v", false, false);
   return LLVMBuildCall2(ctx->builder, calltype, code, args, 2, "");
}
1990
/* Pack two f16 values into a normalized unsigned 16-bit pair (result as i32),
 * emitted via inline assembly. The mnemonic was renamed from
 * v_cvt_pknorm_u16_f16 to v_cvt_pk_norm_u16_f16 on GFX11.
 */
LLVMValueRef ac_build_cvt_pknorm_u16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};
   LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false);
   LLVMValueRef code = LLVMConstInlineAsm(calltype,
                                          ctx->gfx_level >= GFX11 ?
                                             "v_cvt_pk_norm_u16_f16 $0, $1, $2" :
                                             "v_cvt_pknorm_u16_f16 $0, $1, $2",
                                          "=v,v,v", false, false);
   return LLVMBuildCall2(ctx->builder, calltype, code, args, 2, "");
}
2003
/* The 8-bit and 10-bit clamping is for HW workarounds.
 *
 * Packs two signed i32 values into an i16 pair (returned as i32). For 8- and
 * 10-bit formats, inputs are clamped to the signed range of the target width
 * first; the 10-bit case uses the 2-bit range (-2..1) for the alpha channel
 * when "hi" is set.
 *
 * NOTE: the clamp rewrites args[] in place, which is visible to the caller.
 */
LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
                                 bool hi)
{
   assert(bits == 8 || bits == 10 || bits == 16);

   LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
   LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
   LLVMValueRef max_alpha = bits != 10 ? max_rgb : ctx->i32_1;
   LLVMValueRef min_alpha = bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);

   /* Clamp. */
   if (bits != 16) {
      for (int i = 0; i < 2; i++) {
         bool alpha = hi && i == 1;
         args[i] = ac_build_imin(ctx, args[i], alpha ? max_alpha : max_rgb);
         args[i] = ac_build_imax(ctx, args[i], alpha ? min_alpha : min_rgb);
      }
   }

   LLVMValueRef res =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", ctx->v2i16, args, 2, 0);
   return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}
2028
/* The 8-bit and 10-bit clamping is for HW workarounds.
 *
 * Packs two unsigned i32 values into a u16 pair (returned as i32). For 8- and
 * 10-bit formats, inputs are clamped to the unsigned range of the target
 * width first; the 10-bit case uses the 2-bit range (0..3) for the alpha
 * channel when "hi" is set.
 *
 * NOTE: the clamp rewrites args[] in place, which is visible to the caller.
 */
LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
                                 bool hi)
{
   assert(bits == 8 || bits == 10 || bits == 16);

   LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
   LLVMValueRef max_alpha = bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);

   /* Clamp. */
   if (bits != 16) {
      for (int i = 0; i < 2; i++) {
         bool alpha = hi && i == 1;
         args[i] = ac_build_umin(ctx, args[i], alpha ? max_alpha : max_rgb);
      }
   }

   LLVMValueRef res =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", ctx->v2i16, args, 2, 0);
   return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}
2050
LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   /* Wrapper for the whole-quad-mode vote intrinsic; takes and returns an i1. */
   LLVMValueRef vote_input = i1;
   return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, &vote_input, 1, 0);
}
2055
void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   /* Emit llvm.amdgcn.kill: lanes where the condition is false are killed. */
   LLVMValueRef cond = i1;
   ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, &cond, 1, 0);
}
2060
LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset,
                          LLVMValueRef width, bool is_signed)
{
   /* 32-bit bitfield extract: signed (sbfe) or unsigned (ubfe). */
   const char *intr_name = is_signed ? "llvm.amdgcn.sbfe.i32" : "llvm.amdgcn.ubfe.i32";
   LLVMValueRef operands[3] = {input, offset, width};

   return ac_build_intrinsic(ctx, intr_name, ctx->i32, operands, 3, 0);
}
2073
LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   /* Integer multiply-add: s0 * s1 + s2. */
   LLVMValueRef product = LLVMBuildMul(ctx->builder, s0, s1, "");
   return LLVMBuildAdd(ctx->builder, product, s2, "");
}
2079
LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
   if (ctx->gfx_level >= GFX10) {
      LLVMValueRef fma_args[] = {s0, s1, s2};
      return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, fma_args, 3, 0);
   }

   LLVMValueRef product = LLVMBuildFMul(ctx->builder, s0, s1, "");
   return LLVMBuildFAdd(ctx->builder, product, s2, "");
}
2089
/* Emit an s_waitcnt (or a fence) for the counters selected by wait_flags
 * (AC_WAIT_* bits). A counter field of 0 means "wait for that counter to
 * drain"; the maximum values below mean "don't wait on that counter".
 */
void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
{
   if (!wait_flags)
      return;

   /* "Don't wait" maxima for each counter field. */
   unsigned expcnt = 7;
   unsigned lgkmcnt = 63;
   unsigned vmcnt = ctx->gfx_level >= GFX9 ? 63 : 15;
   unsigned vscnt = 63;

   if (wait_flags & AC_WAIT_EXP)
      expcnt = 0;
   if (wait_flags & AC_WAIT_LGKM)
      lgkmcnt = 0;
   if (wait_flags & AC_WAIT_VLOAD)
      vmcnt = 0;

   if (wait_flags & AC_WAIT_VSTORE) {
      /* Only GFX10+ has a separate store counter (vscnt); older chips count
       * stores in vmcnt.
       */
      if (ctx->gfx_level >= GFX10)
         vscnt = 0;
      else
         vmcnt = 0;
   }

   /* There is no intrinsic for vscnt(0), so use a fence. */
   if ((wait_flags & AC_WAIT_LGKM && wait_flags & AC_WAIT_VLOAD && wait_flags & AC_WAIT_VSTORE) ||
       vscnt == 0) {
      assert(!(wait_flags & AC_WAIT_EXP));
      LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
      return;
   }

   unsigned simm16;

   /* The bit layout of the s_waitcnt immediate differs on GFX11. */
   if (ctx->gfx_level >= GFX11)
      simm16 = expcnt | (lgkmcnt << 4) | (vmcnt << 10);
   else
      simm16 = (lgkmcnt << 8) | (expcnt << 4) | (vmcnt & 0xf) | ((vmcnt >> 4) << 14);

   LLVMValueRef args[1] = {
      LLVMConstInt(ctx->i32, simm16, false),
   };
   ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
}
2134
/* Saturate: clamp a float value to [0, 1].
 *
 * Uses v_med3(0, 1, x) where the hardware exposes it; falls back to
 * fmin/fmax for 64-bit, 16-bit on GFX6-GFX8, and v2f16 inputs.
 *
 * Fix: the inner local previously named "type" shadowed the "type"
 * parameter; renamed to med3_type. Also made the intrinsic-name pointer
 * const since it only ever points at string literals.
 */
LLVMValueRef ac_build_fsat(struct ac_llvm_context *ctx, LLVMValueRef src,
                           LLVMTypeRef type)
{
   unsigned bitsize = ac_get_elem_bits(ctx, type);
   LLVMValueRef zero = LLVMConstReal(type, 0.0);
   LLVMValueRef one = LLVMConstReal(type, 1.0);
   LLVMValueRef result;

   if (bitsize == 64 || (bitsize == 16 && ctx->gfx_level <= GFX8) || type == ctx->v2f16) {
      /* Use fmin/fmax for 64-bit fsat or 16-bit on GFX6-GFX8 because LLVM
       * doesn't expose an intrinsic.
       */
      result = ac_build_fmin(ctx, ac_build_fmax(ctx, src, zero), one);
   } else {
      LLVMTypeRef med3_type;
      const char *intr;

      if (bitsize == 16) {
         intr = "llvm.amdgcn.fmed3.f16";
         med3_type = ctx->f16;
      } else {
         assert(bitsize == 32);
         intr = "llvm.amdgcn.fmed3.f32";
         med3_type = ctx->f32;
      }

      /* med3(0, 1, src) clamps src to [0, 1]. */
      LLVMValueRef params[] = {
         zero,
         one,
         src,
      };

      result = ac_build_intrinsic(ctx, intr, med3_type, params, 3, 0);
   }

   if (ctx->gfx_level < GFX9 && bitsize == 32) {
      /* Only pre-GFX9 chips do not flush denorms. */
      result = ac_build_canonicalize(ctx, result, bitsize);
   }

   return result;
}
2177
/* Emit llvm.amdgcn.fract for a 16-, 32- or 64-bit float value.
 *
 * Fix: the intrinsic-name pointer only ever refers to string literals, so it
 * is now const-qualified; the fall-through else branch asserts that the
 * bitsize is actually 64 instead of silently accepting any other value.
 */
LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.amdgcn.fract.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.fract.f32";
      type = ctx->f32;
   } else {
      assert(bitsize == 64);
      intr = "llvm.amdgcn.fract.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
}
2199
ac_const_uint_vec(struct ac_llvm_context * ctx,LLVMTypeRef type,uint64_t value)2200 LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value)
2201 {
2202
2203 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
2204 LLVMValueRef scalar = LLVMConstInt(LLVMGetElementType(type), value, 0);
2205 unsigned vec_size = LLVMGetVectorSize(type);
2206 LLVMValueRef *scalars = alloca(vec_size * sizeof(LLVMValueRef));
2207
2208 for (unsigned i = 0; i < vec_size; i++)
2209 scalars[i] = scalar;
2210 return LLVMConstVector(scalars, vec_size);
2211 }
2212 return LLVMConstInt(type, value, 0);
2213 }
2214
LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   /* Integer sign: clamp src0 to [-1, 1] via max then min. */
   LLVMTypeRef type = LLVMTypeOf(src0);
   LLVMValueRef minus_one = ac_const_uint_vec(ctx, type, -1);
   LLVMValueRef plus_one = ac_const_uint_vec(ctx, type, 1);

   /* v_med3 is selected only when max is first. (LLVM bug?) */
   LLVMValueRef lower_clamped = ac_build_imax(ctx, src0, minus_one);
   return ac_build_imin(ctx, lower_clamped, plus_one);
}
2224
/* Convert -0.0 to +0.0, leaving all other values unchanged.
 * Signed-zero handling is explicitly enabled around the add,
 * presumably so the identity add isn't optimized away — the
 * enable/disable pair must bracket the FAdd.
 */
static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val)
{
   ac_enable_signed_zeros(ctx);
   /* (val + 0) converts negative zero to positive zero. */
   val = LLVMBuildFAdd(ctx->builder, val, LLVMConstNull(LLVMTypeOf(val)), "");
   ac_disable_signed_zeros(ctx);
   return val;
}
2233
/* Build fsign(src): -1.0, 0.0 or +1.0 depending on the sign of src.
 * 16/32-bit floats use the integer isign trick; 64-bit floats build the
 * result bit pattern from two compares (rationale in the comment below).
 */
LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   LLVMValueRef pos, neg, dw[2], val;
   unsigned bitsize = ac_get_elem_bits(ctx, type);

   /* The standard version leads to this:
    *   v_cmp_ngt_f32_e64 s[0:1], s4, 0                       ; D40B0000 00010004
    *   v_cndmask_b32_e64 v4, 1.0, s4, s[0:1]                 ; D5010004 000008F2
    *   v_cmp_le_f32_e32 vcc, 0, v4                           ; 7C060880
    *   v_cndmask_b32_e32 v4, -1.0, v4, vcc                   ; 020808F3
    *
    * The isign version:
    *   v_add_f32_e64 v4, s4, 0                               ; D5030004 00010004
    *   v_med3_i32 v4, v4, -1, 1                              ; D5580004 02058304
    *   v_cvt_f32_i32_e32 v4, v4                              ; 7E080B04
    *
    * (src0 + 0) converts negative zero to positive zero.
    * After that, int(fsign(x)) == isign(floatBitsToInt(x)).
    *
    * For FP64, use the standard version, which doesn't suffer from the huge DP rate
    * reduction. (FP64 comparisons are as fast as int64 comparisons)
    */
   if (bitsize == 16 || bitsize == 32) {
      val = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src));
      val = ac_build_isign(ctx, val);
      return LLVMBuildSIToFP(ctx->builder, val, type, "");
   }

   assert(bitsize == 64);
   pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");
   neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");
   /* Assemble the f64 result from two dwords: low dword is always 0, the
    * high dword selects the bit pattern of +1.0, -1.0 or 0.0.
    */
   dw[0] = ctx->i32_0;
   dw[1] = LLVMBuildSelect(
      ctx->builder, pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0),
      LLVMBuildSelect(ctx->builder, neg, LLVMConstInt(ctx->i32, 0xBFF00000, 0), ctx->i32_0, ""),
      "");
   return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dw, 2), ctx->f64, "");
}
2273
LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   /* Population count via llvm.ctpop, widened/truncated to an i32 result. */
   unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   LLVMTypeRef type;

   switch (bitsize) {
   case 128:
      type = ctx->i128;
      break;
   case 64:
      type = ctx->i64;
      break;
   case 32:
      type = ctx->i32;
      break;
   case 16:
      type = ctx->i16;
      break;
   case 8:
      type = ctx->i8;
      break;
   default:
      unreachable("invalid bitsize");
   }

   char intr_name[32];
   snprintf(intr_name, sizeof(intr_name), "llvm.ctpop.i%u", bitsize);

   LLVMValueRef count = ac_build_intrinsic(ctx, intr_name, type, (LLVMValueRef[]){src0}, 1, 0);

   if (bitsize > 32)
      count = LLVMBuildTrunc(ctx->builder, count, ctx->i32, "");
   else if (bitsize < 32)
      count = LLVMBuildZExt(ctx->builder, count, ctx->i32, "");

   return count;
}
2308
LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   /* Bit reversal via llvm.bitreverse, widened/truncated to an i32 result. */
   unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   LLVMTypeRef type;

   switch (bitsize) {
   case 64:
      type = ctx->i64;
      break;
   case 32:
      type = ctx->i32;
      break;
   case 16:
      type = ctx->i16;
      break;
   case 8:
      type = ctx->i8;
      break;
   default:
      unreachable("invalid bitsize");
   }

   char intr_name[32];
   snprintf(intr_name, sizeof(intr_name), "llvm.bitreverse.i%u", bitsize);

   LLVMValueRef reversed = ac_build_intrinsic(ctx, intr_name, type, (LLVMValueRef[]){src0}, 1, 0);

   if (bitsize == 64)
      reversed = LLVMBuildTrunc(ctx->builder, reversed, ctx->i32, "");
   else if (bitsize < 32)
      reversed = LLVMBuildZExt(ctx->builder, reversed, ctx->i32, "");

   return reversed;
}
2339
LLVMValueRef ac_build_sudot_4x8(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                                LLVMValueRef s2, bool clamp, unsigned neg_lo)
{
   /* llvm.amdgcn.sudot4 operand order: neg(s0), s0, neg(s1), s1, s2, clamp.
    * Bits 0 and 1 of neg_lo select the neg flags for s0 and s1 respectively.
    */
   LLVMValueRef operands[6] = {
      LLVMConstInt(ctx->i1, (neg_lo & 0x1) != 0, false),
      s0,
      LLVMConstInt(ctx->i1, (neg_lo & 0x2) != 0, false),
      s1,
      s2,
      LLVMConstInt(ctx->i1, clamp, false),
   };

   return ac_build_intrinsic(ctx, "llvm.amdgcn.sudot4", ctx->i32, operands, 6, 0);
}
2355
ac_init_exec_full_mask(struct ac_llvm_context * ctx)2356 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
2357 {
2358 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
2359 ac_build_intrinsic(ctx, "llvm.amdgcn.init.exec", ctx->voidt, &full_mask, 1, 0);
2360 }
2361
ac_declare_lds_as_pointer(struct ac_llvm_context * ctx)2362 void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
2363 {
2364 unsigned lds_size = ctx->gfx_level >= GFX7 ? 65536 : 32768;
2365 LLVMTypeRef type = LLVMArrayType(ctx->i32, lds_size / 4);
2366 ctx->lds = (struct ac_llvm_pointer) {
2367 .value = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
2368 LLVMPointerType(type, AC_ADDR_SPACE_LDS), "lds"),
2369 .pointee_type = type
2370 };
2371 }
2372
LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, LLVMValueRef dw_addr)
{
   /* Load one dword from the LDS array at the given dword index. */
   LLVMValueRef ptr = ac_build_gep0(ctx, ctx->lds, dw_addr);
   return LLVMBuildLoad2(ctx->builder, ctx->i32, ptr, "");
}
2378
void ac_lds_store(struct ac_llvm_context *ctx, LLVMValueRef dw_addr, LLVMValueRef value)
{
   /* The LDS array is i32-typed, so convert the value to an integer first. */
   LLVMValueRef int_value = ac_to_integer(ctx, value);
   ac_build_indexed_store(ctx, ctx->lds, dw_addr, int_value);
}
2384
/* Return the index of the least significant set bit of src0 as an i32,
 * or -1 when src0 == 0 (GLSL findLSB semantics), via llvm.cttz.
 */
LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0)
{
   unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intrin_name;
   LLVMTypeRef type;
   LLVMValueRef zero;

   /* Pick the cttz overload matching the source bit width. */
   switch (src0_bitsize) {
   case 64:
      intrin_name = "llvm.cttz.i64";
      type = ctx->i64;
      zero = ctx->i64_0;
      break;
   case 32:
      intrin_name = "llvm.cttz.i32";
      type = ctx->i32;
      zero = ctx->i32_0;
      break;
   case 16:
      intrin_name = "llvm.cttz.i16";
      type = ctx->i16;
      zero = ctx->i16_0;
      break;
   case 8:
      intrin_name = "llvm.cttz.i8";
      type = ctx->i8;
      zero = ctx->i8_0;
      break;
   default:
      unreachable("invalid bitsize");
   }

   LLVMValueRef params[2] = {
      src0,

      /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
       * add special code to check for x=0. The reason is that
       * the LLVM behavior for x=0 is different from what we
       * need here. However, LLVM also assumes that ffs(x) is
       * in [0, 31], but GLSL expects that ffs(0) = -1, so
       * a conditional assignment to handle 0 is still required.
       *
       * The hardware already implements the correct behavior.
       */
      ctx->i1true,
   };

   LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, 0);

   /* Normalize the result width to i32. */
   if (src0_bitsize == 64) {
      lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
   } else if (src0_bitsize < 32) {
      lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");
   }

   /* TODO: We need an intrinsic to skip this conditional. */
   /* Check for zero: */
   return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, src0, zero, ""),
                          LLVMConstInt(ctx->i32, -1, 0), lsb, "");
}
2445
ac_arg_type_to_pointee_type(struct ac_llvm_context * ctx,enum ac_arg_type type)2446 LLVMTypeRef ac_arg_type_to_pointee_type(struct ac_llvm_context *ctx, enum ac_arg_type type) {
2447 switch (type) {
2448 case AC_ARG_CONST_PTR:
2449 return ctx->i8;
2450 break;
2451 case AC_ARG_CONST_FLOAT_PTR:
2452 return ctx->f32;
2453 break;
2454 case AC_ARG_CONST_PTR_PTR:
2455 return ac_array_in_const32_addr_space(ctx->i8);
2456 break;
2457 case AC_ARG_CONST_DESC_PTR:
2458 return ctx->v4i32;
2459 break;
2460 case AC_ARG_CONST_IMAGE_PTR:
2461 return ctx->v8i32;
2462 default:
2463 /* Other ac_arg_type values aren't pointers. */
2464 assert(false);
2465 return NULL;
2466 }
2467 }
2468
/* Return a pointer type to elem_type in the constant (read-only) address
 * space.
 */
LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
{
   return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);
}
2473
/* Return a pointer type to elem_type in the 32-bit constant address space. */
LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
{
   return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);
}
2478
get_current_flow(struct ac_llvm_context * ctx)2479 static struct ac_llvm_flow *get_current_flow(struct ac_llvm_context *ctx)
2480 {
2481 if (ctx->flow->depth > 0)
2482 return &ctx->flow->stack[ctx->flow->depth - 1];
2483 return NULL;
2484 }
2485
get_innermost_loop(struct ac_llvm_context * ctx)2486 static struct ac_llvm_flow *get_innermost_loop(struct ac_llvm_context *ctx)
2487 {
2488 for (unsigned i = ctx->flow->depth; i > 0; --i) {
2489 if (ctx->flow->stack[i - 1].loop_entry_block)
2490 return &ctx->flow->stack[i - 1];
2491 }
2492 return NULL;
2493 }
2494
push_flow(struct ac_llvm_context * ctx)2495 static struct ac_llvm_flow *push_flow(struct ac_llvm_context *ctx)
2496 {
2497 struct ac_llvm_flow *flow;
2498
2499 if (ctx->flow->depth >= ctx->flow->depth_max) {
2500 unsigned new_max = MAX2(ctx->flow->depth << 1, AC_LLVM_INITIAL_CF_DEPTH);
2501
2502 ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack));
2503 ctx->flow->depth_max = new_max;
2504 }
2505
2506 flow = &ctx->flow->stack[ctx->flow->depth];
2507 ctx->flow->depth++;
2508
2509 flow->next_block = NULL;
2510 flow->loop_entry_block = NULL;
2511 return flow;
2512 }
2513
/* Name a basic block "<base><label_id>", e.g. "loop7". */
static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, int label_id)
{
   char name[32];
   snprintf(name, sizeof(name), "%s%d", base, label_id);
   LLVMSetValueName(LLVMBasicBlockAsValue(bb), name);
}
2520
2521 /* Append a basic block at the level of the parent flow.
2522 */
append_basic_block(struct ac_llvm_context * ctx,const char * name)2523 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, const char *name)
2524 {
2525 assert(ctx->flow->depth >= 1);
2526
2527 if (ctx->flow->depth >= 2) {
2528 struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2];
2529
2530 return LLVMInsertBasicBlockInContext(ctx->context, flow->next_block, name);
2531 }
2532
2533 LLVMValueRef main_fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
2534 return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
2535 }
2536
2537 /* Emit a branch to the given default target for the current block if
2538 * applicable -- that is, if the current block does not already contain a
2539 * branch from a break or continue.
2540 */
emit_default_branch(LLVMBuilderRef builder,LLVMBasicBlockRef target)2541 static void emit_default_branch(LLVMBuilderRef builder, LLVMBasicBlockRef target)
2542 {
2543 if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
2544 LLVMBuildBr(builder, target);
2545 }
2546
ac_build_bgnloop(struct ac_llvm_context * ctx,int label_id)2547 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
2548 {
2549 struct ac_llvm_flow *flow = push_flow(ctx);
2550 flow->loop_entry_block = append_basic_block(ctx, "LOOP");
2551 flow->next_block = append_basic_block(ctx, "ENDLOOP");
2552 set_basicblock_name(flow->loop_entry_block, "loop", label_id);
2553 LLVMBuildBr(ctx->builder, flow->loop_entry_block);
2554 LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
2555 }
2556
ac_build_break(struct ac_llvm_context * ctx)2557 void ac_build_break(struct ac_llvm_context *ctx)
2558 {
2559 struct ac_llvm_flow *flow = get_innermost_loop(ctx);
2560 LLVMBuildBr(ctx->builder, flow->next_block);
2561 }
2562
ac_build_continue(struct ac_llvm_context * ctx)2563 void ac_build_continue(struct ac_llvm_context *ctx)
2564 {
2565 struct ac_llvm_flow *flow = get_innermost_loop(ctx);
2566 LLVMBuildBr(ctx->builder, flow->loop_entry_block);
2567 }
2568
ac_build_else(struct ac_llvm_context * ctx,int label_id)2569 void ac_build_else(struct ac_llvm_context *ctx, int label_id)
2570 {
2571 struct ac_llvm_flow *current_branch = get_current_flow(ctx);
2572 LLVMBasicBlockRef endif_block;
2573
2574 assert(!current_branch->loop_entry_block);
2575
2576 endif_block = append_basic_block(ctx, "ENDIF");
2577 emit_default_branch(ctx->builder, endif_block);
2578
2579 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
2580 set_basicblock_name(current_branch->next_block, "else", label_id);
2581
2582 current_branch->next_block = endif_block;
2583 }
2584
ac_build_endif(struct ac_llvm_context * ctx,int label_id)2585 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
2586 {
2587 struct ac_llvm_flow *current_branch = get_current_flow(ctx);
2588
2589 assert(!current_branch->loop_entry_block);
2590
2591 emit_default_branch(ctx->builder, current_branch->next_block);
2592 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
2593 set_basicblock_name(current_branch->next_block, "endif", label_id);
2594
2595 ctx->flow->depth--;
2596 }
2597
ac_build_endloop(struct ac_llvm_context * ctx,int label_id)2598 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
2599 {
2600 struct ac_llvm_flow *current_loop = get_current_flow(ctx);
2601
2602 assert(current_loop->loop_entry_block);
2603
2604 emit_default_branch(ctx->builder, current_loop->loop_entry_block);
2605
2606 LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
2607 set_basicblock_name(current_loop->next_block, "endloop", label_id);
2608 ctx->flow->depth--;
2609 }
2610
/* Open an if: conditionally branch to the IF block (taken) or the
 * ELSE/merge block (not taken), then keep emitting into the IF block.
 */
void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
{
   struct ac_llvm_flow *flow = push_flow(ctx);
   LLVMBasicBlockRef then_block = append_basic_block(ctx, "IF");

   flow->next_block = append_basic_block(ctx, "ELSE");
   set_basicblock_name(then_block, "if", label_id);
   LLVMBuildCondBr(ctx->builder, cond, then_block, flow->next_block);
   LLVMPositionBuilderAtEnd(ctx->builder, then_block);
}
2622
ac_build_alloca_undef(struct ac_llvm_context * ac,LLVMTypeRef type,const char * name)2623 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
2624 {
2625 LLVMBuilderRef builder = ac->builder;
2626 LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
2627 LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
2628 LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
2629 LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
2630 LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
2631 LLVMValueRef res;
2632
2633 if (first_instr) {
2634 LLVMPositionBuilderBefore(first_builder, first_instr);
2635 } else {
2636 LLVMPositionBuilderAtEnd(first_builder, first_block);
2637 }
2638
2639 res = LLVMBuildAlloca(first_builder, type, name);
2640 LLVMDisposeBuilder(first_builder);
2641 return res;
2642 }
2643
ac_build_alloca(struct ac_llvm_context * ac,LLVMTypeRef type,const char * name)2644 LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
2645 {
2646 LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
2647 LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
2648 return ptr;
2649 }
2650
/* Entry-block alloca of val's type, initialized with val at the current
 * insert point.
 */
LLVMValueRef ac_build_alloca_init(struct ac_llvm_context *ac, LLVMValueRef val, const char *name)
{
   LLVMValueRef slot = ac_build_alloca_undef(ac, LLVMTypeOf(val), name);
   LLVMBuildStore(ac->builder, val, slot);
   return slot;
}
2657
/* Bitcast ptr to a pointer to "type", preserving its address space. */
LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMTypeRef type)
{
   unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
   LLVMTypeRef dst_type = LLVMPointerType(type, addr_space);
   return LLVMBuildBitCast(ctx->builder, ptr, dst_type, "");
}
2663
/* Reduce a vector to its first "count" components, or extract a scalar when
 * count == 1.  Returns the value unchanged when it already has exactly
 * "count" components.
 */
LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count)
{
   if (count == ac_get_llvm_num_components(value))
      return value;

   /* At least two indices are always written below, hence the MAX2. */
   LLVMValueRef *const indices = alloca(MAX2(count, 2) * sizeof(LLVMValueRef));
   indices[0] = ctx->i32_0;
   indices[1] = ctx->i32_1;
   for (unsigned i = 2; i < count; i++)
      indices[i] = LLVMConstInt(ctx->i32, i, false);

   if (count == 1)
      return LLVMBuildExtractElement(ctx->builder, value, indices[0], "");

   LLVMValueRef mask = LLVMConstVector(indices, count);
   return LLVMBuildShuffleVector(ctx->builder, value, value, mask, "");
}
2682
2683 /* If param is i64 and bitwidth <= 32, the return value will be i32. */
ac_unpack_param(struct ac_llvm_context * ctx,LLVMValueRef param,unsigned rshift,unsigned bitwidth)2684 LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift,
2685 unsigned bitwidth)
2686 {
2687 LLVMValueRef value = param;
2688 if (rshift)
2689 value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), rshift, false), "");
2690
2691 if (rshift + bitwidth < 32) {
2692 uint64_t mask = (1ull << bitwidth) - 1;
2693 value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), mask, false), "");
2694 }
2695
2696 if (bitwidth <= 32 && LLVMTypeOf(param) == ctx->i64)
2697 value = LLVMBuildTrunc(ctx->builder, value, ctx->i32, "");
2698 return value;
2699 }
2700
/* Read one lane (or the first active lane when lane == NULL) of a value
 * that is at most 32 bits wide.  Optionally applies an optimization
 * barrier to src first.
 */
static LLVMValueRef _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src,
                                       LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   if (with_opt_barrier)
      ac_build_optimization_barrier(ctx, &src, false);

   /* The readlane intrinsics only operate on i32. */
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
   if (lane)
      lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, "");

   const char *intr = lane ? "llvm.amdgcn.readlane" : "llvm.amdgcn.readfirstlane";
   LLVMValueRef result = ac_build_intrinsic(ctx, intr, ctx->i32,
                                            (LLVMValueRef[]){src, lane}, lane ? 2 : 1, 0);

   return LLVMBuildTrunc(ctx->builder, result, orig_type, "");
}
2720
/* Read "lane" (or the first active lane when lane == NULL) of src.
 * Values wider than 32 bits are split into i32 chunks because the
 * readlane/readfirstlane intrinsics only handle i32.
 */
static LLVMValueRef ac_build_readlane_common(struct ac_llvm_context *ctx, LLVMValueRef src,
                                             LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef ret;

   if (bits > 32) {
      assert(bits % 32 == 0);
      /* View the value as a vector of i32 and read each component. */
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      ret = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < bits / 32; i++) {
         LLVMValueRef ret_comp;

         src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");

         ret_comp = _ac_build_readlane(ctx, src, lane, with_opt_barrier);

         ret =
            LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
      }
   } else {
      ret = _ac_build_readlane(ctx, src, lane, with_opt_barrier);
   }

   /* Cast back to the caller's type; pointers need inttoptr, not bitcast. */
   if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
      return LLVMBuildIntToPtr(ctx->builder, ret, src_type, "");
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
2752
/**
 * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic,
 * without applying an optimization barrier to the source value.
 *
 * The optimization barrier is not needed if the value is the same in all lanes
 * or if this is called in the outermost block.
 *
 * @param ctx  the ac LLVM context
 * @param src  the value to read; any size that splits into i32 chunks
 * @param lane - id of the lane or NULL for the first active lane
 * @return value of the lane
 */
LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, LLVMValueRef src,
                                              LLVMValueRef lane)
{
   return ac_build_readlane_common(ctx, src, lane, false);
}
2769
/* Readlane with an optimization barrier applied to src first; see
 * ac_build_readlane_no_opt_barrier for when the barrier can be omitted.
 */
LLVMValueRef ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
{
   return ac_build_readlane_common(ctx, src, lane, true);
}
2774
/* Build the llvm.amdgcn.writelane intrinsic (operand order: value, lane,
 * src); the result is i32.
 */
LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value,
                                LLVMValueRef lane)
{
   LLVMValueRef args[] = {value, lane, src};
   return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32, args, 3, 0);
}
2781
/* Build mbcnt(mask) + add_src using the llvm.amdgcn.mbcnt.{lo,hi}
 * intrinsics (per their semantics, the count of set bits in "mask" for
 * lanes below the current one).
 */
LLVMValueRef ac_build_mbcnt_add(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef add_src)
{
   /* Only fold the addend into the intrinsic on LLVM >= 16; older LLVM
    * needs the separate-add workaround below.
    */
   LLVMValueRef add = LLVM_VERSION_MAJOR >= 16 ? add_src : ctx->i32_0;
   LLVMValueRef val;

   if (ctx->wave_size == 32) {
      /* Wave32: a single mbcnt.lo covers all lanes. */
      if (LLVMTypeOf(mask) == ctx->i64)
         mask = LLVMBuildTrunc(ctx->builder, mask, ctx->i32, "");

      val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
                               (LLVMValueRef[]){mask, add}, 2, 0);
   } else {
      /* Wave64: chain mbcnt.lo and mbcnt.hi over the two mask halves. */
      LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, "");
      LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_0, "");
      LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_1, "");
      val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
                               (LLVMValueRef[]){mask_lo, add}, 2, 0);
      val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, (LLVMValueRef[]){mask_hi, val},
                               2, 0);
   }

   /* Without an addend the result is known to be in [0, wave_size). */
   if (add == ctx->i32_0)
      ac_set_range_metadata(ctx, val, 0, ctx->wave_size);

   if (LLVM_VERSION_MAJOR < 16) {
      /* Bug workaround. LLVM always believes the upper bound of mbcnt to be the wave size,
       * regardless of ac_set_range_metadata. Use an extra add instruction to work around it.
       */
      ac_set_range_metadata(ctx, val, 0, ctx->wave_size);
      val = LLVMBuildAdd(ctx->builder, val, add_src, "");
   }

   return val;
}
2816
/* ac_build_mbcnt_add with a zero addend. */
LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
{
   return ac_build_mbcnt_add(ctx, mask, ctx->i32_0);
}
2821
/* Values for the dpp_ctrl operand of llvm.amdgcn.update.dpp.  Entries with
 * a leading underscore are base encodings that take an argument OR'ed into
 * their low bits; see dpp_quad_perm() and dpp_row_sr() below.
 */
enum dpp_ctrl
{
   _dpp_quad_perm = 0x000,
   _dpp_row_sl = 0x100,
   _dpp_row_sr = 0x110,
   _dpp_row_rr = 0x120,
   dpp_wf_sl1 = 0x130,
   dpp_wf_rl1 = 0x134,
   dpp_wf_sr1 = 0x138,
   dpp_wf_rr1 = 0x13C,
   dpp_row_mirror = 0x140,
   dpp_row_half_mirror = 0x141,
   dpp_row_bcast15 = 0x142,
   dpp_row_bcast31 = 0x143
};
2837
dpp_quad_perm(unsigned lane0,unsigned lane1,unsigned lane2,unsigned lane3)2838 static inline enum dpp_ctrl dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2,
2839 unsigned lane3)
2840 {
2841 assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
2842 return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
2843 }
2844
/* Row shift-right dpp_ctrl encoding; the amount (1..15) goes in the low
 * bits of the base value.
 */
static inline enum dpp_ctrl dpp_row_sr(unsigned amount)
{
   assert(amount > 0 && amount < 16);
   return _dpp_row_sr | amount;
}
2850
/* Single <= 32-bit DPP move via llvm.amdgcn.update.dpp.i32. */
static LLVMValueRef _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                  enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                  bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   /* The intrinsic only operates on i32. */
   old = LLVMBuildZExt(ctx->builder, old, ctx->i32, "");
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef args[] = {
      old,
      src,
      LLVMConstInt(ctx->i32, dpp_ctrl, 0),
      LLVMConstInt(ctx->i32, row_mask, 0),
      LLVMConstInt(ctx->i32, bank_mask, 0),
      LLVMConstInt(ctx->i1, bound_ctrl, 0),
   };
   LLVMValueRef result =
      ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32, args, 6, 0);

   return LLVMBuildTrunc(ctx->builder, result, orig_type, "");
}
2870
/* DPP move for values of any integer/float width; values wider than 32 bits
 * are split into i32 chunks and each chunk is moved separately.
 */
static LLVMValueRef ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                 bool bound_ctrl)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   old = ac_to_integer(ctx, old);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef ret;
   if (bits > 32) {
      assert(bits % 32 == 0);
      /* View both values as vectors of i32 and process per component. */
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      LLVMValueRef old_vector = LLVMBuildBitCast(ctx->builder, old, vec_type, "");
      ret = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < bits / 32; i++) {
         src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
         old = LLVMBuildExtractElement(ctx->builder, old_vector, LLVMConstInt(ctx->i32, i, 0), "");
         LLVMValueRef ret_comp =
            _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
         ret =
            LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
      }
   } else {
      ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   }
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
2899
/* Single <= 32-bit permlane16/permlanex16.  sel holds the two 32-bit lane
 * select operands packed into a 64-bit value.
 */
static LLVMValueRef _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         uint64_t sel, bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   /* The intrinsic only operates on i32. */
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   const char *intr = exchange_rows ? "llvm.amdgcn.permlanex16" : "llvm.amdgcn.permlane16";
   LLVMValueRef args[6] = {
      src,
      src,
      LLVMConstInt(ctx->i32, sel, false),       /* low half of sel */
      LLVMConstInt(ctx->i32, sel >> 32, false), /* high half of sel */
      ctx->i1true,                              /* fi */
      bound_ctrl ? ctx->i1true : ctx->i1false,
   };
   LLVMValueRef result = ac_build_intrinsic(ctx, intr, ctx->i32, args, 6, 0);

   return LLVMBuildTrunc(ctx->builder, result, orig_type, "");
}
2923
/* permlane16/permlanex16 for values of any width; values wider than 32 bits
 * are split into i32 chunks and each chunk is permuted separately.
 */
static LLVMValueRef ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
                                        bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef ret;
   if (bits > 32) {
      assert(bits % 32 == 0);
      /* View the value as a vector of i32 and process per component. */
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      ret = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < bits / 32; i++) {
         src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
         LLVMValueRef ret_comp = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
         ret =
            LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
      }
   } else {
      ret = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
   }
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
2947
/* Pack the ds_swizzle "bit mode" pattern: bits [4:0] = and_mask,
 * [9:5] = or_mask, [14:10] = xor_mask.
 */
static inline unsigned ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
   assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);
   return (xor_mask << 10) | (or_mask << 5) | and_mask;
}
2953
/* Single <= 32-bit llvm.amdgcn.ds.swizzle with the given offset mask. */
static LLVMValueRef _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         unsigned mask)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   /* The intrinsic only operates on i32. */
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef args[] = {src, LLVMConstInt(ctx->i32, mask, 0)};
   LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32, args, 2, 0);

   return LLVMBuildTrunc(ctx->builder, result, orig_type, "");
}
2968
/* ds_swizzle for values of any width; values wider than 32 bits are split
 * into i32 chunks and each chunk is swizzled separately.
 */
LLVMValueRef ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef ret;
   if (bits > 32) {
      assert(bits % 32 == 0);
      /* View the value as a vector of i32 and process per component. */
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      ret = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < bits / 32; i++) {
         src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
         LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src, mask);
         ret =
            LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
      }
   } else {
      ret = _ac_build_ds_swizzle(ctx, src, mask);
   }
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
2991
/* Wrap src in an llvm.amdgcn.<mode>.<type> intrinsic call ("wwm"/"wqm").
 * Values narrower than 32 bits are zero-extended to i32 around the call.
 */
static LLVMValueRef ac_build_mode(struct ac_llvm_context *ctx, LLVMValueRef src, const char *mode)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned bitsize = ac_get_elem_bits(ctx, src_type);
   char name[32], type[8];
   LLVMValueRef ret;

   src = ac_to_integer(ctx, src);

   if (bitsize < 32)
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   /* Mangle the intrinsic name with the operand type, e.g. "llvm.amdgcn.wqm.i32". */
   ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.%s.%s", mode, type);
   ret = ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src}, 1, 0);

   if (bitsize < 32)
      ret = LLVMBuildTrunc(ctx->builder, ret, ac_to_integer_type(ctx, src_type), "");

   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
3013
/* Wrap src in the llvm.amdgcn.wwm ("whole wave mode") intrinsic. */
static LLVMValueRef ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   return ac_build_mode(ctx, src, "wwm");
}
3018
/* Wrap src in the llvm.amdgcn.wqm ("whole quad mode") intrinsic. */
LLVMValueRef ac_build_wqm(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   return ac_build_mode(ctx, src, "wqm");
}
3023
/* Build llvm.amdgcn.set.inactive: src with inactive lanes carrying the
 * "inactive" value.  Values narrower than 32 bits are zero-extended to i32
 * around the call.
 */
static LLVMValueRef ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
                                          LLVMValueRef inactive)
{
   char name[33], type[8];
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned bitsize = ac_get_elem_bits(ctx, src_type);
   src = ac_to_integer(ctx, src);
   inactive = ac_to_integer(ctx, inactive);

   if (bitsize < 32) {
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
      inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, "");
   }

   /* Mangle the intrinsic name, e.g. "llvm.amdgcn.set.inactive.i32". */
   ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
   LLVMValueRef ret =
      ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src, inactive}, 2, 0);
   if (bitsize < 32)
      ret = LLVMBuildTrunc(ctx->builder, ret, src_type, "");

   return ret;
}
3047
/* Return the identity element of the reduction operator "op" for the given
 * element size in bytes (0 means i1).  Used to fill inactive lanes and to
 * seed scans so they don't affect the result.
 */
static LLVMValueRef get_reduction_identity(struct ac_llvm_context *ctx, nir_op op,
                                           unsigned type_size)
{

   /* 1-bit booleans. */
   if (type_size == 0) {
      switch (op) {
      case nir_op_ior:
      case nir_op_ixor:
         return ctx->i1false;
      case nir_op_iand:
         return ctx->i1true;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 1) {
      /* 8-bit integers. */
      switch (op) {
      case nir_op_iadd:
         return ctx->i8_0;
      case nir_op_imul:
         return ctx->i8_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i8, INT8_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i8, UINT8_MAX, 0);
      case nir_op_imax:
         return LLVMConstInt(ctx->i8, INT8_MIN, 0);
      case nir_op_umax:
         return ctx->i8_0;
      case nir_op_iand:
         return LLVMConstInt(ctx->i8, -1, 0);
      case nir_op_ior:
         return ctx->i8_0;
      case nir_op_ixor:
         return ctx->i8_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 2) {
      /* 16-bit integers and half floats. */
      switch (op) {
      case nir_op_iadd:
         return ctx->i16_0;
      case nir_op_fadd:
         return ctx->f16_0;
      case nir_op_imul:
         return ctx->i16_1;
      case nir_op_fmul:
         return ctx->f16_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i16, INT16_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i16, UINT16_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f16, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i16, INT16_MIN, 0);
      case nir_op_umax:
         return ctx->i16_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f16, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i16, -1, 0);
      case nir_op_ior:
         return ctx->i16_0;
      case nir_op_ixor:
         return ctx->i16_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 4) {
      /* 32-bit integers and floats. */
      switch (op) {
      case nir_op_iadd:
         return ctx->i32_0;
      case nir_op_fadd:
         return ctx->f32_0;
      case nir_op_imul:
         return ctx->i32_1;
      case nir_op_fmul:
         return ctx->f32_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i32, INT32_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f32, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i32, INT32_MIN, 0);
      case nir_op_umax:
         return ctx->i32_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f32, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i32, -1, 0);
      case nir_op_ior:
         return ctx->i32_0;
      case nir_op_ixor:
         return ctx->i32_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else { /* type_size == 64bit */
      switch (op) {
      case nir_op_iadd:
         return ctx->i64_0;
      case nir_op_fadd:
         return ctx->f64_0;
      case nir_op_imul:
         return ctx->i64_1;
      case nir_op_fmul:
         return ctx->f64_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i64, INT64_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f64, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i64, INT64_MIN, 0);
      case nir_op_umax:
         return ctx->i64_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f64, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i64, -1, 0);
      case nir_op_ior:
         return ctx->i64_0;
      case nir_op_ixor:
         return ctx->i64_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   }
}
3180
/* Combine lhs and rhs with the reduction operator "op".  Integer min/max
 * are emitted as icmp+select; float min/max use the minnum/maxnum
 * intrinsics, picking the variant matching the operand size.
 */
static LLVMValueRef ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
                                    nir_op op)
{
   bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
   bool _32bit = ac_get_type_size(LLVMTypeOf(lhs)) == 4;
   switch (op) {
   case nir_op_iadd:
      return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
   case nir_op_fadd:
      return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
   case nir_op_imul:
      return LLVMBuildMul(ctx->builder, lhs, rhs, "");
   case nir_op_fmul:
      return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
   case nir_op_imin:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_umin:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_fmin:
      return ac_build_intrinsic(
         ctx, _64bit ? "llvm.minnum.f64" : _32bit ? "llvm.minnum.f32" : "llvm.minnum.f16",
         _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2, 0);
   case nir_op_imax:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_umax:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_fmax:
      return ac_build_intrinsic(
         ctx, _64bit ? "llvm.maxnum.f64" : _32bit ? "llvm.maxnum.f32" : "llvm.maxnum.f16",
         _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2, 0);
   case nir_op_iand:
      return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
   case nir_op_ior:
      return LLVMBuildOr(ctx->builder, lhs, rhs, "");
   case nir_op_ixor:
      return LLVMBuildXor(ctx->builder, lhs, rhs, "");
   default:
      unreachable("bad reduction intrinsic");
   }
}
3225
3226 /**
3227 * \param src The value to shift.
3228 * \param identity The value to use the first lane.
3229 * \param maxprefix specifies that the result only needs to be correct for a
3230 * prefix of this many threads
3231 * \return src, shifted 1 lane up, and identity shifted into lane 0.
3232 */
/* Shift src right by one lane across the whole wavefront, with 'identity'
 * shifted into lane 0. The result only needs to be correct for the first
 * 'maxprefix' lanes. Used to turn an inclusive scan into an exclusive one.
 */
ac_wavefront_shift_right_1(struct ac_llvm_context * ctx,LLVMValueRef src,LLVMValueRef identity,unsigned maxprefix)3233 static LLVMValueRef ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src,
3234 LLVMValueRef identity, unsigned maxprefix)
3235 {
3236 if (ctx->gfx_level >= GFX10) {
3237 /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
3238 LLVMValueRef active, tmp1, tmp2;
3239 LLVMValueRef tid = ac_get_thread_id(ctx);
3240
/* Shift right by 1 within each row of 16 lanes; lane 0 of each row gets
 * 'identity' (the old value passed to ac_build_dpp). */
3241 tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
3242
/* Cross-row source via permlane16 (all-ones selector); used below to patch
 * the lanes at row boundaries, where dpp_row_sr cannot pull from the
 * previous row. */
3243 tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);
3244
3245 if (maxprefix > 32) {
/* Lane 32 must read lane 31, which crosses the 32-lane half-wave
 * boundary; fetch it with readlane instead. */
3246 active =
3247 LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, false), "");
3248
3249 tmp2 = LLVMBuildSelect(ctx->builder, active,
3250 ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, false)),
3251 tmp2, "");
3252
/* Use the cross-row value on lane 32 and on the first lane of each
 * odd row (tid % 32 == 16); everything else keeps the row-shift. */
3253 active = LLVMBuildOr(
3254 ctx->builder, active,
3255 LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3256 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, false), ""),
3257 LLVMConstInt(ctx->i32, 0x10, false), ""),
3258 "");
3259 return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3260 } else if (maxprefix > 16) {
/* Only lane 16 needs the cross-row patch when the prefix fits in 32. */
3261 active =
3262 LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 16, false), "");
3263
3264 return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3265 }
3266 } else if (ctx->gfx_level >= GFX8) {
/* GFX8-9 have the wavefront-wide DPP shift natively. */
3267 return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
3268 }
3269
3270 /* wavefront shift_right by 1 on SI/CI */
/* Build the shift out of ds_swizzle steps, patching successively wider
 * boundary lanes (quad, 8, 16, 32), then pull lane 31 into lane 32 with
 * readlane and finally put 'identity' in lane 0. */
3271 LLVMValueRef active, tmp1, tmp2;
3272 LLVMValueRef tid = ac_get_thread_id(ctx);
3273 tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
3274 tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00));
3275 active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3276 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),
3277 LLVMConstInt(ctx->i32, 0x4, 0), "");
3278 tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3279 tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00));
3280 active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3281 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),
3282 LLVMConstInt(ctx->i32, 0x8, 0), "");
3283 tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3284 tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00));
3285 active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3286 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),
3287 LLVMConstInt(ctx->i32, 0x10, 0), "");
3288 tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3289 tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0));
3290 active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");
3291 tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3292 active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, ctx->i32_0, "");
3293 return LLVMBuildSelect(ctx->builder, active, identity, tmp1, "");
3294 }
3295
3296 /**
3297 * \param maxprefix specifies that the result only needs to be correct for a
3298 * prefix of this many threads
3299 */
/* Build a subgroup scan of 'src' with reduction op 'op'.
 *
 * \param identity  the op's identity value, used to fill lanes that must not
 *                  contribute (and lane 0 for exclusive scans)
 * \param maxprefix the result only needs to be correct for this many lanes
 * \param inclusive if false, shift right by one lane first (exclusive scan)
 */
ac_build_scan(struct ac_llvm_context * ctx,nir_op op,LLVMValueRef src,LLVMValueRef identity,unsigned maxprefix,bool inclusive)3300 static LLVMValueRef ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src,
3301 LLVMValueRef identity, unsigned maxprefix, bool inclusive)
3302 {
3303 LLVMValueRef result, tmp;
3304
3305 if (!inclusive)
3306 src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix);
3307
3308 result = src;
3309
3310 if (ctx->gfx_level <= GFX7) {
/* SI/CI have no DPP: emulate the scan with ds_swizzle shift steps of
 * 1, 2, 4, 8, 16 lanes plus a readlane for the 32-lane step. Lanes that
 * would read across their step boundary take 'identity' instead. */
3311 assert(maxprefix == 64);
3312 LLVMValueRef tid = ac_get_thread_id(ctx);
3313 LLVMValueRef active;
3314 tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00));
3315 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3316 LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), ctx->i32_0, "");
3317 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3318 result = ac_build_alu_op(ctx, result, tmp, op);
3319 tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00));
3320 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3321 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""),
3322 ctx->i32_0, "");
3323 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3324 result = ac_build_alu_op(ctx, result, tmp, op);
3325 tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));
3326 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3327 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""),
3328 ctx->i32_0, "");
3329 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3330 result = ac_build_alu_op(ctx, result, tmp, op);
3331 tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));
3332 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3333 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""),
3334 ctx->i32_0, "");
3335 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3336 result = ac_build_alu_op(ctx, result, tmp, op);
3337 tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));
3338 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3339 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""),
3340 ctx->i32_0, "");
3341 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3342 result = ac_build_alu_op(ctx, result, tmp, op);
3343 tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));
3344 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3345 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""),
3346 ctx->i32_0, "");
3347 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3348 result = ac_build_alu_op(ctx, result, tmp, op);
3349 return result;
3350 }
3351
/* GFX8+: DPP row-shift ladder (shifts by 1, 2, 3, 4, 8 within a row of 16),
 * bailing out as soon as 'maxprefix' lanes are covered. Lanes shifted past
 * the row start receive 'identity' via the DPP old-value operand. */
3352 if (maxprefix <= 1)
3353 return result;
3354 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
3355 result = ac_build_alu_op(ctx, result, tmp, op);
3356 if (maxprefix <= 2)
3357 return result;
3358 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
3359 result = ac_build_alu_op(ctx, result, tmp, op);
3360 if (maxprefix <= 3)
3361 return result;
3362 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
3363 result = ac_build_alu_op(ctx, result, tmp, op);
3364 if (maxprefix <= 4)
3365 return result;
3366 tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
3367 result = ac_build_alu_op(ctx, result, tmp, op);
3368 if (maxprefix <= 8)
3369 return result;
3370 tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
3371 result = ac_build_alu_op(ctx, result, tmp, op);
3372 if (maxprefix <= 16)
3373 return result;
3374
3375 if (ctx->gfx_level >= GFX10) {
/* GFX10+ uses permlane16/readlane for the cross-row steps instead of
 * the row_bcast DPP modes used on GFX8-9 below. */
3376 LLVMValueRef tid = ac_get_thread_id(ctx);
3377 LLVMValueRef active;
3378
3379 tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);
3380
/* Only lanes in the upper row of each 32-lane half (tid & 16) take the
 * other row's partial result; the rest contribute 'identity'. */
3381 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3382 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, false), ""),
3383 ctx->i32_0, "");
3384
3385 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3386
3387 result = ac_build_alu_op(ctx, result, tmp, op);
3388
3389 if (maxprefix <= 32)
3390 return result;
3391
/* Fold the low half-wave's total (lane 31) into lanes 32..63. */
3392 tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
3393
3394 active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, LLVMConstInt(ctx->i32, 32, false), "");
3395
3396 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3397
3398 result = ac_build_alu_op(ctx, result, tmp, op);
3399 return result;
3400 }
3401
/* GFX8-9: broadcast lane 15 into lanes 16..31 and lane 31 into 32..63. */
3402 tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
3403 result = ac_build_alu_op(ctx, result, tmp, op);
3404 if (maxprefix <= 32)
3405 return result;
3406 tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
3407 result = ac_build_alu_op(ctx, result, tmp, op);
3408 return result;
3409 }
3410
/* Build a wave-wide inclusive scan (prefix combine) of 'src' with 'op'.
 * Inactive lanes are filled with the op's identity so they don't affect
 * active lanes, and the result is read back through WWM.
 */
LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   LLVMBuilderRef builder = ctx->builder;

   /* Fast path: an inclusive add of booleans is mbcnt(ballot) + self. */
   if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
      LLVMValueRef bit = LLVMBuildZExt(builder, src, ctx->i32, "");
      LLVMValueRef prefix = ac_build_mbcnt(ctx, ac_build_ballot(ctx, bit));
      return LLVMBuildAdd(builder, prefix, bit, "");
   }

   ac_build_optimization_barrier(ctx, &src, false);

   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   LLVMValueRef value = LLVMBuildBitCast(builder, ac_build_set_inactive(ctx, src, identity),
                                         LLVMTypeOf(identity), "");
   value = ac_build_scan(ctx, op, value, identity, ctx->wave_size, true);

   return ac_build_wwm(ctx, value);
}
3433
/* Build a wave-wide exclusive scan of 'src' with 'op': each lane receives
 * the combination of all lower active lanes (lane 0 gets the identity).
 */
LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   LLVMBuilderRef builder = ctx->builder;

   /* Fast path: an exclusive add of booleans is just mbcnt(ballot). */
   if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
      LLVMValueRef bit = LLVMBuildZExt(builder, src, ctx->i32, "");
      return ac_build_mbcnt(ctx, ac_build_ballot(ctx, bit));
   }

   ac_build_optimization_barrier(ctx, &src, false);

   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   LLVMValueRef value = LLVMBuildBitCast(builder, ac_build_set_inactive(ctx, src, identity),
                                         LLVMTypeOf(identity), "");
   value = ac_build_scan(ctx, op, value, identity, ctx->wave_size, false);

   return ac_build_wwm(ctx, value);
}
3455
/* Build a subgroup reduction of 'src' with 'op' over clusters of
 * 'cluster_size' lanes (1, 2, 4, 8, 16, 32, or the wave size). After each
 * swizzle+combine step, every lane in a cluster of the doubled size holds
 * that cluster's reduction; the function returns as soon as the requested
 * cluster size is reached.
 */
ac_build_reduce(struct ac_llvm_context * ctx,LLVMValueRef src,nir_op op,unsigned cluster_size)3456 LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op,
3457 unsigned cluster_size)
3458 {
3459 if (cluster_size == 1)
3460 return src;
3461 ac_build_optimization_barrier(ctx, &src, false);
3462 LLVMValueRef result, swap;
/* Inactive lanes are set to the identity so they don't corrupt the
 * reduction; WWM reads the lanes back at the end. */
3463 LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
3464 result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
3465 LLVMTypeOf(identity), "");
/* Step to clusters of 2: swap neighbors within each quad. */
3466 swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
3467 result = ac_build_alu_op(ctx, result, swap, op);
3468 if (cluster_size == 2)
3469 return ac_build_wwm(ctx, result);
3470
/* Step to clusters of 4: swap pairs within each quad. */
3471 swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
3472 result = ac_build_alu_op(ctx, result, swap, op);
3473 if (cluster_size == 4)
3474 return ac_build_wwm(ctx, result);
3475
/* Step to clusters of 8: mirror within each half-row (DPP on GFX8+,
 * ds_swizzle XOR pattern on SI/CI). */
3476 if (ctx->gfx_level >= GFX8)
3477 swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
3478 else
3479 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
3480 result = ac_build_alu_op(ctx, result, swap, op);
3481 if (cluster_size == 8)
3482 return ac_build_wwm(ctx, result);
3483
/* Step to clusters of 16: mirror within each row. */
3484 if (ctx->gfx_level >= GFX8)
3485 swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
3486 else
3487 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
3488 result = ac_build_alu_op(ctx, result, swap, op);
3489 if (cluster_size == 16)
3490 return ac_build_wwm(ctx, result);
3491
/* Step to clusters of 32: exchange between the two rows of a half-wave
 * (permlane16 on GFX10+, row_bcast15 on GFX8-9 when finishing the wave,
 * ds_swizzle otherwise). */
3492 if (ctx->gfx_level >= GFX10)
3493 swap = ac_build_permlane16(ctx, result, 0, true, false);
3494 else if (ctx->gfx_level >= GFX8 && cluster_size != 32)
3495 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
3496 else
3497 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
3498 result = ac_build_alu_op(ctx, result, swap, op);
3499 if (cluster_size == 32)
3500 return ac_build_wwm(ctx, result);
3501
3502 if (ctx->gfx_level >= GFX8) {
/* Final step for wave64: combine the two 32-lane halves, then broadcast
 * the total from lane 63 to the whole wave. */
3503 if (ctx->wave_size == 64) {
3504 if (ctx->gfx_level >= GFX10)
3505 swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
3506 else
3507 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
3508 result = ac_build_alu_op(ctx, result, swap, op);
3509 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
3510 }
3511
3512 return ac_build_wwm(ctx, result);
3513 } else {
/* SI/CI: the ds_swizzle steps stop at 32 lanes, so combine the two
 * half-wave results, which are held in lanes 0 and 32. */
3514 swap = ac_build_readlane(ctx, result, ctx->i32_0);
3515 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
3516 result = ac_build_alu_op(ctx, result, swap, op);
3517 return ac_build_wwm(ctx, result);
3518 }
3519 }
3520
/* Swizzle one channel pair of dual-source-blend outputs across adjacent
 * lanes: after the three steps below, even lanes hold arg1's value and odd
 * lanes hold their neighbor's arg0 value as required by the GFX11 dual-source
 * export layout. Both values are rewritten in place as i32 bit patterns.
 */
static void _ac_build_dual_src_blend_swizzle(struct ac_llvm_context *ctx,
                                             LLVMValueRef *arg0, LLVMValueRef *arg1)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef val0 = LLVMBuildBitCast(builder, *arg0, ctx->i32, "");
   LLVMValueRef val1 = LLVMBuildBitCast(builder, *arg1, ctx->i32, "");

   /* swap odd,even lanes of arg_0 */
   LLVMValueRef dpp_args[2] = {
      val0,
      /* DPP8 lane selector encoding the odd/even neighbor swap. */
      LLVMConstInt(ctx->i32, 0xde54c1, 0),
   };
   val0 = ac_build_intrinsic(ctx, "llvm.amdgcn.mov.dpp8.i32", ctx->i32, dpp_args, 2, 0);

   /* swap even lanes between arg_0 and arg_1 */
   LLVMValueRef tid = ac_get_thread_id(ctx);
   LLVMValueRef is_even = LLVMBuildICmp(builder, LLVMIntEQ,
                                        LLVMBuildAnd(builder, tid, ctx->i32_1, ""),
                                        ctx->i32_0, "");
   LLVMValueRef saved0 = val0;
   val0 = LLVMBuildSelect(builder, is_even, val1, val0, "");
   val1 = LLVMBuildSelect(builder, is_even, saved0, val1, "");

   /* swap odd,even lanes again for arg_0 */
   dpp_args[0] = val0;
   dpp_args[1] = LLVMConstInt(ctx->i32, 0xde54c1, 0);
   val0 = ac_build_intrinsic(ctx, "llvm.amdgcn.mov.dpp8.i32", ctx->i32, dpp_args, 2, 0);

   *arg0 = val0;
   *arg1 = val1;
}
3557
ac_build_dual_src_blend_swizzle(struct ac_llvm_context * ctx,struct ac_export_args * mrt0,struct ac_export_args * mrt1)3558 void ac_build_dual_src_blend_swizzle(struct ac_llvm_context *ctx,
3559 struct ac_export_args *mrt0,
3560 struct ac_export_args *mrt1)
3561 {
3562 assert(ctx->gfx_level >= GFX11);
3563 assert(mrt0->enabled_channels == mrt1->enabled_channels);
3564
3565 for (int i = 0; i < 4; i++) {
3566 if (mrt0->enabled_channels & (1 << i) && mrt1->enabled_channels & (1 << i))
3567 _ac_build_dual_src_blend_swizzle(ctx, &mrt0->out[i], &mrt1->out[i]);
3568 }
3569 }
3570
/* Permute 'src' within each quad of lanes: lane i of the quad reads the
 * value of lane 'laneN'. Uses DPP on GFX8+, ds_swizzle on SI/CI.
 */
LLVMValueRef ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned lane0,
                                   unsigned lane1, unsigned lane2, unsigned lane3)
{
   const unsigned perm = dpp_quad_perm(lane0, lane1, lane2, lane3);

   return ctx->gfx_level >= GFX8 ? ac_build_dpp(ctx, src, src, perm, 0xf, 0xf, false)
                                 : ac_build_ds_swizzle(ctx, src, (1 << 15) | perm);
}
3581
/* Read 'src' from the lane given by 'index' using ds_bpermute.
 * The value is widened to i32 for the intrinsic and truncated back to the
 * original type; ds_bpermute addresses lanes in byte units, hence index*4.
 */
LLVMValueRef ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMTypeRef src_type = LLVMTypeOf(src);

   LLVMValueRef byte_index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
   LLVMValueRef wide_src = LLVMBuildZExt(builder, src, ctx->i32, "");

   LLVMValueRef shuffled = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32,
                                              (LLVMValueRef[]){byte_index, wide_src}, 2, 0);
   return LLVMBuildTrunc(builder, shuffled, src_type, "");
}
3594
/* Build llvm.amdgcn.frexp.exp: extract the exponent of a float.
 *
 * \param src0    the floating-point value to decompose
 * \param bitsize source width: 16 or 32; any other value is treated as 64
 * \return the exponent as i16 for 16-bit sources, i32 otherwise
 */
LLVMValueRef ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* const-correct: these point at string literals, which must not be
    * modified through a plain char *. */
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.amdgcn.frexp.exp.i16.f16";
      type = ctx->i16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.frexp.exp.i32.f32";
      type = ctx->i32;
   } else {
      intr = "llvm.amdgcn.frexp.exp.i32.f64";
      type = ctx->i32;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
}
/* Build llvm.amdgcn.frexp.mant: extract the mantissa of a float, returned
 * in the same floating-point width as the source.
 *
 * \param src0    the floating-point value to decompose
 * \param bitsize source width: 16 or 32; any other value is treated as 64
 */
LLVMValueRef ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* const-correct: these point at string literals, which must not be
    * modified through a plain char *. */
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.amdgcn.frexp.mant.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.frexp.mant.f32";
      type = ctx->f32;
   } else {
      intr = "llvm.amdgcn.frexp.mant.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
}
3637
/* Build llvm.canonicalize: return the canonical encoding of a float
 * (e.g. flushing/quieting per the current FP mode), in the source width.
 *
 * \param src0    the floating-point value to canonicalize
 * \param bitsize source width: 16 or 32; any other value is treated as 64
 */
LLVMValueRef ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* const-correct: these point at string literals, which must not be
    * modified through a plain char *. */
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.canonicalize.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.canonicalize.f32";
      type = ctx->f32;
   } else {
      intr = "llvm.canonicalize.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
}
3659
3660 /*
3661 * this takes an I,J coordinate pair,
3662 * and works out the X and Y derivatives.
3663 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
3664 */
/* Given an (I, J) interpolation coordinate pair in 'interp_ij', compute the
 * screen-space derivatives and return them gathered as a 4-vector:
 * DDX(I), DDX(J), DDY(I), DDY(J).
 */
LLVMValueRef ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
{
   LLVMValueRef derivs[4];

   for (unsigned chan = 0; chan < 2; chan++) {
      LLVMValueRef coord = LLVMBuildExtractElement(ctx->builder, interp_ij,
                                                   LLVMConstInt(ctx->i32, chan, false), "");
      derivs[chan] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, coord);     /* DDX */
      derivs[2 + chan] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, coord); /* DDY */
   }

   return ac_build_gather_values(ctx, derivs, 4);
}
3677
ac_build_load_helper_invocation(struct ac_llvm_context * ctx)3678 LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
3679 {
3680 LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.live.mask", ctx->i1, NULL, 0, 0);
3681
3682 return LLVMBuildNot(ctx->builder, result, "");
3683 }
3684
/* Emit a call to 'func' and copy the callee's calling convention onto the
 * call instruction so it matches the function definition.
 */
LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMTypeRef fn_type, LLVMValueRef func, LLVMValueRef *args,
                           unsigned num_args)
{
   LLVMValueRef call = LLVMBuildCall2(ctx->builder, fn_type, func, args, num_args, "");
   LLVMSetInstructionCallConv(call, LLVMGetFunctionCallConv(func));
   return call;
}
3692
/* Fill in 'args' for an MRTZ (depth/stencil/samplemask/mrt0-alpha) export.
 * Any of depth/stencil/samplemask/mrt0_alpha may be NULL (but at least one
 * of the first three must be set). 'is_last' marks the final export of the
 * shader (sets the VALID_MASK and DONE bits).
 */
ac_export_mrt_z(struct ac_llvm_context * ctx,LLVMValueRef depth,LLVMValueRef stencil,LLVMValueRef samplemask,LLVMValueRef mrt0_alpha,bool is_last,struct ac_export_args * args)3693 void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
3694 LLVMValueRef samplemask, LLVMValueRef mrt0_alpha, bool is_last,
3695 struct ac_export_args *args)
3696 {
3697 unsigned mask = 0;
/* The SPI Z export format depends on which outputs are present. */
3698 unsigned format = ac_get_spi_shader_z_format(depth != NULL, stencil != NULL, samplemask != NULL,
3699 mrt0_alpha != NULL);
3700
3701 assert(depth || stencil || samplemask);
3702
3703 memset(args, 0, sizeof(*args));
3704
3705 if (is_last) {
3706 args->valid_mask = 1; /* whether the EXEC mask is valid */
3707 args->done = 1; /* DONE bit */
3708 }
3709
3710 /* Specify the target we are exporting */
3711 args->target = V_008DFC_SQ_EXP_MRTZ;
3712
3713 args->compr = 0; /* COMP flag */
3714 args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */
3715 args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */
3716 args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */
3717 args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */
3718
3719 if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
/* Packed 16-bit export: stencil and samplemask share fewer channels;
 * depth cannot be exported in this format. */
3720 assert(!depth);
3721 args->compr = ctx->gfx_level < GFX11; /* COMPR flag */
3722
3723 if (stencil) {
3724 /* Stencil should be in X[23:16]. */
3725 stencil = ac_to_integer(ctx, stencil);
3726 stencil = LLVMBuildShl(ctx->builder, stencil, LLVMConstInt(ctx->i32, 16, 0), "");
3727 args->out[0] = ac_to_float(ctx, stencil);
/* GFX11 dropped the COMPR paired-channel encoding, so the enabled
 * channel masks differ per generation. */
3728 mask |= ctx->gfx_level >= GFX11 ? 0x1 : 0x3;
3729 }
3730 if (samplemask) {
3731 /* SampleMask should be in Y[15:0]. */
3732 args->out[1] = samplemask;
3733 mask |= ctx->gfx_level >= GFX11 ? 0x2 : 0xc;
3734 }
3735 } else {
/* 32-bit export: one value per channel in R/G/B/A order. */
3736 if (depth) {
3737 args->out[0] = depth;
3738 mask |= 0x1;
3739 }
3740 if (stencil) {
3741 args->out[1] = stencil;
3742 mask |= 0x2;
3743 }
3744 if (samplemask) {
3745 args->out[2] = samplemask;
3746 mask |= 0x4;
3747 }
3748 if (mrt0_alpha) {
3749 args->out[3] = mrt0_alpha;
3750 mask |= 0x8;
3751 }
3752 }
3753
3754 /* GFX6 (except OLAND and HAINAN) has a bug that it only looks
3755 * at the X writemask component. */
3756 if (ctx->gfx_level == GFX6 &&
3757 ctx->info->family != CHIP_OLAND &&
3758 ctx->info->family != CHIP_HAINAN)
3759 mask |= 0x1;
3760
3761 /* Specify which components to enable */
3762 args->enabled_channels = mask;
3763 }
3764
arg_llvm_type(enum ac_arg_type type,unsigned size,struct ac_llvm_context * ctx)3765 static LLVMTypeRef arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx)
3766 {
3767 LLVMTypeRef base;
3768 switch (type) {
3769 case AC_ARG_FLOAT:
3770 return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size);
3771 case AC_ARG_INT:
3772 return size == 1 ? ctx->i32 : LLVMVectorType(ctx->i32, size);
3773 case AC_ARG_CONST_PTR:
3774 base = ctx->i8;
3775 break;
3776 case AC_ARG_CONST_FLOAT_PTR:
3777 base = ctx->f32;
3778 break;
3779 case AC_ARG_CONST_PTR_PTR:
3780 base = ac_array_in_const32_addr_space(ctx->i8);
3781 break;
3782 case AC_ARG_CONST_DESC_PTR:
3783 base = ctx->v4i32;
3784 break;
3785 case AC_ARG_CONST_IMAGE_PTR:
3786 base = ctx->v8i32;
3787 break;
3788 default:
3789 assert(false);
3790 return NULL;
3791 }
3792
3793 assert(base);
3794 if (size == 1) {
3795 return ac_array_in_const32_addr_space(base);
3796 } else {
3797 assert(size == 2);
3798 return ac_array_in_const_addr_space(base);
3799 }
3800 }
3801
/* Create the shader's main LLVM function from the ac_shader_args layout:
 * builds the parameter list, applies SGPR/pointer attributes, positions the
 * builder at the entry block, and sets FP-denormal and PS export attributes.
 * Returns the function and its type (also stored in ctx->main_function).
 */
ac_build_main(const struct ac_shader_args * args,struct ac_llvm_context * ctx,enum ac_llvm_calling_convention convention,const char * name,LLVMTypeRef ret_type,LLVMModuleRef module)3802 struct ac_llvm_pointer ac_build_main(const struct ac_shader_args *args, struct ac_llvm_context *ctx,
3803 enum ac_llvm_calling_convention convention, const char *name,
3804 LLVMTypeRef ret_type, LLVMModuleRef module)
3805 {
3806 LLVMTypeRef arg_types[AC_MAX_ARGS];
3807 enum ac_arg_regfile arg_regfiles[AC_MAX_ARGS];
3808
3809 /* ring_offsets doesn't have a corresponding function parameter because LLVM can allocate it
3810 * itself for scratch memory purposes and gives us access through llvm.amdgcn.implicit.buffer.ptr
3811 */
3812 unsigned arg_count = 0;
3813 for (unsigned i = 0; i < args->arg_count; i++) {
3814 if (args->ring_offsets.used && i == args->ring_offsets.arg_index) {
3815 ctx->ring_offsets_index = i;
3816 continue;
3817 }
3818 arg_regfiles[arg_count] = args->args[i].file;
3819 arg_types[arg_count++] = arg_llvm_type(args->args[i].type, args->args[i].size, ctx);
3820 }
3821
3822 LLVMTypeRef main_function_type = LLVMFunctionType(ret_type, arg_types, arg_count, 0);
3823
3824 LLVMValueRef main_function = LLVMAddFunction(module, name, main_function_type);
3825 LLVMBasicBlockRef main_function_body =
3826 LLVMAppendBasicBlockInContext(ctx->context, main_function, "main_body");
/* Subsequent IR emission goes into the entry block. */
3827 LLVMPositionBuilderAtEnd(ctx->builder, main_function_body);
3828
3829 LLVMSetFunctionCallConv(main_function, convention);
/* Mark SGPR arguments "inreg"; parameter attribute indices are 1-based
 * (index 0 is the return value). */
3830 for (unsigned i = 0; i < arg_count; ++i) {
3831 LLVMValueRef P = LLVMGetParam(main_function, i);
3832
3833 if (arg_regfiles[i] != AC_ARG_SGPR)
3834 continue;
3835
3836 ac_add_function_attr(ctx->context, main_function, i + 1, "inreg");
3837
3838 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
3839 ac_add_function_attr(ctx->context, main_function, i + 1, "noalias");
3840 ac_add_attr_dereferenceable(P, UINT64_MAX);
3841 ac_add_attr_alignment(P, 4);
3842 }
3843 }
3844
3845 if (args->ring_offsets.used) {
/* Fetch the ring-offsets descriptor array via the implicit buffer
 * pointer instead of a real parameter (see comment above). */
3846 ctx->ring_offsets =
3847 ac_build_intrinsic(ctx, "llvm.amdgcn.implicit.buffer.ptr",
3848 LLVMPointerType(ctx->i8, AC_ADDR_SPACE_CONST), NULL, 0, 0);
3849 ctx->ring_offsets = LLVMBuildBitCast(ctx->builder, ctx->ring_offsets,
3850 ac_array_in_const_addr_space(ctx->v4i32), "");
3851 }
3852
3853 ctx->main_function = (struct ac_llvm_pointer) {
3854 .value = main_function,
3855 .pointee_type = main_function_type
3856 };
3857
3858 /* Enable denormals for FP16 and FP64: */
3859 LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math", "ieee,ieee");
3860 /* Disable denormals for FP32: */
3861 LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math-f32",
3862 "preserve-sign,preserve-sign");
3863
3864 if (convention == AC_LLVM_AMDGPU_PS) {
3865 LLVMAddTargetDependentFunctionAttr(main_function, "amdgpu-depth-export",
3866 ctx->exports_mrtz ? "1" : "0");
3867 LLVMAddTargetDependentFunctionAttr(main_function, "amdgpu-color-export",
3868 ctx->exports_color_null ? "1" : "0");
3869 }
3870
3871 return ctx->main_function;
3872 }
3873
ac_build_s_endpgm(struct ac_llvm_context * ctx)3874 void ac_build_s_endpgm(struct ac_llvm_context *ctx)
3875 {
3876 LLVMTypeRef calltype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3877 LLVMValueRef code = LLVMConstInlineAsm(calltype, "s_endpgm", "", true, false);
3878 LLVMBuildCall2(ctx->builder, calltype, code, NULL, 0, "");
3879 }
3880
/* Return i1 true if the f32 value 'a' is +/-infinity or any NaN, using
 * the amdgcn float-class intrinsic.
 */
LLVMValueRef ac_build_is_inf_or_nan(struct ac_llvm_context *ctx, LLVMValueRef a)
{
   const unsigned class_mask = S_NAN | Q_NAN | N_INFINITY | P_INFINITY;
   LLVMValueRef params[2] = {
      a,
      LLVMConstInt(ctx->i32, class_mask, 0),
   };

   return ac_build_intrinsic(ctx, "llvm.amdgcn.class.f32", ctx->i1, params, 2, 0);
}
3889