• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2014 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
7 #include "ac_llvm_build.h"
8 #include "ac_gpu_info.h"
9 #include "ac_nir.h"
10 #include "ac_llvm_util.h"
11 #include "ac_shader_util.h"
12 #include "c11/threads.h"
13 #include "shader_enums.h"
14 #include "sid.h"
15 #include "util/bitscan.h"
16 #include "util/macros.h"
17 #include "util/u_atomic.h"
18 #include "util/u_math.h"
19 #include <llvm-c/Core.h>
20 #include <llvm/Config/llvm-config.h>
21 
22 #include <assert.h>
23 #include <stdio.h>
24 
25 #define AC_LLVM_INITIAL_CF_DEPTH 4
26 
27 /* Data for if/else/endif and bgnloop/endloop control flow structures.
28  */
29 struct ac_llvm_flow {
30    /* Loop exit or next part of if/else/endif. */
31    LLVMBasicBlockRef next_block;
32    LLVMBasicBlockRef loop_entry_block;
33 };
34 
35 /* Initialize module-independent parts of the context.
36  *
37  * The caller is responsible for initializing ctx::module and ctx::builder.
38  */
ac_llvm_context_init(struct ac_llvm_context * ctx,struct ac_llvm_compiler * compiler,const struct radeon_info * info,enum ac_float_mode float_mode,unsigned wave_size,unsigned ballot_mask_bits,bool exports_color_null,bool exports_mrtz)39 void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler,
40                           const struct radeon_info *info, enum ac_float_mode float_mode,
41                           unsigned wave_size, unsigned ballot_mask_bits, bool exports_color_null,
42                           bool exports_mrtz)
43 {
44    ctx->context = LLVMContextCreate();
45 
46    ctx->info = info;
47    ctx->gfx_level = info->gfx_level;
48    ctx->wave_size = wave_size;
49    ctx->ballot_mask_bits = ballot_mask_bits;
50    ctx->float_mode = float_mode;
51    ctx->exports_color_null = exports_color_null;
52    ctx->exports_mrtz = exports_mrtz;
53    ctx->module = ac_create_module(compiler->tm, ctx->context);
54    ctx->builder = ac_create_builder(ctx->context, float_mode);
55 
56    ctx->voidt = LLVMVoidTypeInContext(ctx->context);
57    ctx->i1 = LLVMInt1TypeInContext(ctx->context);
58    ctx->i8 = LLVMInt8TypeInContext(ctx->context);
59    ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
60    ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
61    ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
62    ctx->i128 = LLVMIntTypeInContext(ctx->context, 128);
63    ctx->intptr = ctx->i32;
64    ctx->f16 = LLVMHalfTypeInContext(ctx->context);
65    ctx->f32 = LLVMFloatTypeInContext(ctx->context);
66    ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
67    ctx->v4i8 = LLVMVectorType(ctx->i8, 4);
68    ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
69    ctx->v4i16 = LLVMVectorType(ctx->i16, 4);
70    ctx->v2f16 = LLVMVectorType(ctx->f16, 2);
71    ctx->v4f16 = LLVMVectorType(ctx->f16, 4);
72    ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
73    ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
74    ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
75    ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
76    ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
77    ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
78    ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
79    ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
80    ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);
81 
82    ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
83    ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
84    ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
85    ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
86    ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
87    ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
88    ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
89    ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
90    ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false);
91    ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false);
92    ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
93    ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
94    ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
95    ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
96    ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
97    ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);
98 
99    ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
100    ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
101 
102    ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5);
103    ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14);
104    ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);
105    ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6);
106 
107    ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
108 
109    LLVMValueRef three = LLVMConstReal(ctx->f32, 3);
110    ctx->three_md = LLVMMDNodeInContext(ctx->context, &three, 1);
111 
112    ctx->flow = calloc(1, sizeof(*ctx->flow));
113 
114    ctx->ring_offsets_index = INT32_MAX;
115 }
116 
ac_llvm_context_dispose(struct ac_llvm_context * ctx)117 void ac_llvm_context_dispose(struct ac_llvm_context *ctx)
118 {
119    free(ctx->flow->stack);
120    free(ctx->flow);
121    ctx->flow = NULL;
122 }
123 
/* Return the number of vector elements of \p value, or 1 if its type is not a
 * vector.
 */
int ac_get_llvm_num_components(LLVMValueRef value)
{
   LLVMTypeRef value_type = LLVMTypeOf(value);

   if (LLVMGetTypeKind(value_type) != LLVMVectorTypeKind)
      return 1;

   return LLVMGetVectorSize(value_type);
}
131 
/* Extract element \p index from \p value.
 *
 * A non-vector value is returned unchanged; \p index must be 0 in that case.
 */
LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index)
{
   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
      LLVMValueRef lane = LLVMConstInt(ac->i32, index, false);

      return LLVMBuildExtractElement(ac->builder, value, lane, "");
   }

   assert(index == 0);
   return value;
}
141 
/* Return the bit width of \p type, or of its element type if it is a vector.
 *
 * Handles integers, LDS pointers (32 bits) and the f16/f32/f64 types of the
 * context; anything else is unreachable.
 */
int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
{
   LLVMTypeRef scalar = type;

   if (LLVMGetTypeKind(scalar) == LLVMVectorTypeKind)
      scalar = LLVMGetElementType(scalar);

   switch (LLVMGetTypeKind(scalar)) {
   case LLVMIntegerTypeKind:
      return LLVMGetIntTypeWidth(scalar);
   case LLVMPointerTypeKind:
      if (LLVMGetPointerAddressSpace(scalar) == AC_ADDR_SPACE_LDS)
         return 32;
      break;
   default:
      break;
   }

   if (scalar == ctx->f16)
      return 16;
   if (scalar == ctx->f32)
      return 32;
   if (scalar == ctx->f64)
      return 64;

   unreachable("Unhandled type kind in get_elem_bits");
}
164 
/* Return the size of \p type in bytes.
 *
 * Vectors and arrays are the element size times the element count.
 * Pointers in the 32-bit constant and LDS address spaces are 4 bytes, which
 * matches how ac_get_elem_bits() and ac_to_integer_type() treat them; all
 * other pointers are 8 bytes. Unsupported type kinds assert and return 0.
 */
unsigned ac_get_type_size(LLVMTypeRef type)
{
   switch (LLVMGetTypeKind(type)) {
   case LLVMIntegerTypeKind:
      return LLVMGetIntTypeWidth(type) / 8;
   case LLVMHalfTypeKind:
      return 2;
   case LLVMFloatTypeKind:
      return 4;
   case LLVMDoubleTypeKind:
      return 8;
   case LLVMPointerTypeKind: {
      unsigned addr_space = LLVMGetPointerAddressSpace(type);

      if (addr_space == AC_ADDR_SPACE_CONST_32BIT || addr_space == AC_ADDR_SPACE_LDS)
         return 4;
      return 8;
   }
   case LLVMVectorTypeKind:
      return LLVMGetVectorSize(type) * ac_get_type_size(LLVMGetElementType(type));
   case LLVMArrayTypeKind:
      return LLVMGetArrayLength(type) * ac_get_type_size(LLVMGetElementType(type));
   default:
      assert(0);
      return 0;
   }
}
191 
/* Map a scalar type to the integer type of the same width.
 * Integer types map to themselves; unknown types are unreachable.
 */
static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i1)
      return ctx->i1;
   if (t == ctx->i8)
      return ctx->i8;
   if (t == ctx->i16 || t == ctx->f16)
      return ctx->i16;
   if (t == ctx->i32 || t == ctx->f32)
      return ctx->i32;
   if (t == ctx->i64 || t == ctx->f64)
      return ctx->i64;

   unreachable("Unhandled integer size");
}
207 
/* Return the integer type corresponding to \p t.
 *
 * Vectors keep their element count, pointers map to i64 (global, const) or
 * i32 (32-bit const, LDS), and scalars map to the integer of the same width.
 */
LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   const LLVMTypeKind kind = LLVMGetTypeKind(t);

   if (kind == LLVMVectorTypeKind) {
      LLVMTypeRef int_elem = to_integer_type_scalar(ctx, LLVMGetElementType(t));

      return LLVMVectorType(int_elem, LLVMGetVectorSize(t));
   }

   if (kind != LLVMPointerTypeKind)
      return to_integer_type_scalar(ctx, t);

   switch (LLVMGetPointerAddressSpace(t)) {
   case AC_ADDR_SPACE_GLOBAL:
   case AC_ADDR_SPACE_CONST:
      return ctx->i64;
   case AC_ADDR_SPACE_CONST_32BIT:
   case AC_ADDR_SPACE_LDS:
      return ctx->i32;
   default:
      unreachable("unhandled address space");
   }
}
228 
/* Convert \p v to the matching integer type: ptrtoint for pointers, bitcast
 * for everything else.
 */
LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef src_type = LLVMTypeOf(v);
   LLVMTypeRef int_type = ac_to_integer_type(ctx, src_type);

   if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
      return LLVMBuildPtrToInt(ctx->builder, v, int_type, "");

   return LLVMBuildBitCast(ctx->builder, v, int_type, "");
}
237 
/* Like ac_to_integer(), except that pointer values are returned unchanged. */
LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   const bool is_pointer = LLVMGetTypeKind(LLVMTypeOf(v)) == LLVMPointerTypeKind;

   return is_pointer ? v : ac_to_integer(ctx, v);
}
245 
/* Map a scalar type to the floating-point type of the same width.
 * i8 is returned as-is; unknown types are unreachable.
 */
static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i8)
      return ctx->i8;
   if (t == ctx->f16 || t == ctx->i16)
      return ctx->f16;
   if (t == ctx->f32 || t == ctx->i32)
      return ctx->f32;
   if (t == ctx->f64 || t == ctx->i64)
      return ctx->f64;

   unreachable("Unhandled float size");
}
259 
/* Return the floating-point type corresponding to \p t; vectors keep their
 * element count.
 */
LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (LLVMGetTypeKind(t) != LLVMVectorTypeKind)
      return to_float_type_scalar(ctx, t);

   LLVMTypeRef float_elem = to_float_type_scalar(ctx, LLVMGetElementType(t));

   return LLVMVectorType(float_elem, LLVMGetVectorSize(t));
}
268 
/* Bitcast \p v to the floating-point type of the same shape. */
LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef float_type = ac_to_float_type(ctx, LLVMTypeOf(v));

   return LLVMBuildBitCast(ctx->builder, v, float_type, "");
}
274 
/* Emit a call to the function \p name, declaring it in the module first if it
 * does not exist yet.
 *
 * The function type is derived from \p return_type and the types of the
 * \p param_count values in \p params (at most 32, none of them NULL).
 * \p attrib_mask may contain AC_ATTR_INVARIANT_LOAD and AC_ATTR_CONVERGENT;
 * "nounwind" is always added to the call site.
 */
LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
                                LLVMTypeRef return_type, LLVMValueRef *params, unsigned param_count,
                                unsigned attrib_mask)
{
   LLVMTypeRef arg_types[32];

   assert(param_count <= 32);
   for (unsigned idx = 0; idx < param_count; idx++) {
      assert(params[idx]);
      arg_types[idx] = LLVMTypeOf(params[idx]);
   }

   LLVMTypeRef fn_type = LLVMFunctionType(return_type, arg_types, param_count, 0);
   LLVMValueRef fn = LLVMGetNamedFunction(ctx->module, name);

   if (fn == NULL) {
      /* First use in this module: add the declaration. */
      fn = LLVMAddFunction(ctx->module, name, fn_type);

      LLVMSetFunctionCallConv(fn, LLVMCCallConv);
      LLVMSetLinkage(fn, LLVMExternalLinkage);
   }

   LLVMValueRef call = LLVMBuildCall2(ctx->builder, fn_type, fn, params, param_count, "");

   if (attrib_mask & AC_ATTR_INVARIANT_LOAD)
      LLVMSetMetadata(call, ctx->invariant_load_md_kind, ctx->empty_md);

   if (attrib_mask & AC_ATTR_CONVERGENT)
      LLVMAddCallSiteAttribute(call, -1, ac_get_llvm_attribute(ctx->context, "convergent"));

   LLVMAddCallSiteAttribute(call, -1, ac_get_llvm_attribute(ctx->context, "nounwind"));
   return call;
}
309 
310 /**
311  * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
312  * intrinsic names).
313  */
ac_build_type_name_for_intr(LLVMTypeRef type,char * buf,unsigned bufsize)314 void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
315 {
316    LLVMTypeRef elem_type = type;
317 
318    if (LLVMGetTypeKind(type) == LLVMStructTypeKind) {
319       unsigned count = LLVMCountStructElementTypes(type);
320       int ret = snprintf(buf, bufsize, "sl_");
321       buf += ret;
322       bufsize -= ret;
323 
324       LLVMTypeRef *elems = alloca(count * sizeof(LLVMTypeRef));
325       LLVMGetStructElementTypes(type, elems);
326 
327       for (unsigned i = 0; i < count; i++) {
328          ac_build_type_name_for_intr(elems[i], buf, bufsize);
329          ret = strlen(buf);
330          buf += ret;
331          bufsize -= ret;
332       }
333 
334       snprintf(buf, bufsize, "s");
335       return;
336    }
337 
338    assert(bufsize >= 8);
339    if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
340       int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type));
341       if (ret < 0) {
342          char *type_name = LLVMPrintTypeToString(type);
343          fprintf(stderr, "Error building type name for: %s\n", type_name);
344          LLVMDisposeMessage(type_name);
345          return;
346       }
347       elem_type = LLVMGetElementType(type);
348       buf += ret;
349       bufsize -= ret;
350    }
351    switch (LLVMGetTypeKind(elem_type)) {
352    default:
353       break;
354    case LLVMIntegerTypeKind:
355       snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
356       break;
357    case LLVMHalfTypeKind:
358       snprintf(buf, bufsize, "f16");
359       break;
360    case LLVMFloatTypeKind:
361       snprintf(buf, bufsize, "f32");
362       break;
363    case LLVMDoubleTypeKind:
364       snprintf(buf, bufsize, "f64");
365       break;
366    }
367 }
368 
369 /**
370  * Helper function that builds an LLVM IR PHI node and immediately adds
371  * incoming edges.
372  */
ac_build_phi(struct ac_llvm_context * ctx,LLVMTypeRef type,unsigned count_incoming,LLVMValueRef * values,LLVMBasicBlockRef * blocks)373 LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming,
374                           LLVMValueRef *values, LLVMBasicBlockRef *blocks)
375 {
376    LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
377    LLVMAddIncoming(phi, values, blocks, count_incoming);
378    return phi;
379 }
380 
/* Emit llvm.amdgcn.s.barrier, unless it is known to be unnecessary. */
void ac_build_s_barrier(struct ac_llvm_context *ctx, gl_shader_stage stage)
{
   /* On GFX6 a bug workaround disallows multi-wave HS workgroups, so an entire
    * patch always fits into a single wave and TCS needs no s_barrier.
    */
   const bool barrier_not_needed = ctx->gfx_level == GFX6 && stage == MESA_SHADER_TESS_CTRL;

   if (!barrier_not_needed)
      ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, 0);
}
391 
392 /* Prevent optimizations (at least of memory accesses) across the current
393  * point in the program by emitting empty inline assembly that is marked as
394  * having side effects.
395  *
396  * Optionally, a value can be passed through the inline assembly to prevent
397  * LLVM from hoisting calls to ReadNone functions.
398  */
ac_build_optimization_barrier(struct ac_llvm_context * ctx,LLVMValueRef * pgpr,bool sgpr)399 void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pgpr, bool sgpr)
400 {
401    static int counter = 0;
402 
403    LLVMBuilderRef builder = ctx->builder;
404    char code[16];
405    const char *constraint = sgpr ? "=s,0" : "=v,0";
406 
407    snprintf(code, sizeof(code), "; %d", (int)p_atomic_inc_return(&counter));
408 
409    if (!pgpr) {
410       LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
411       LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
412       LLVMBuildCall2(builder, ftype, inlineasm, NULL, 0, "");
413    } else if (LLVMTypeOf(*pgpr) == ctx->i32) {
414       /* Simple version for i32 that allows the caller to set LLVM metadata on the call
415        * instruction. */
416       LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
417       LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);
418 
419       *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");
420    } else if (LLVMTypeOf(*pgpr) == ctx->i16) {
421       /* Simple version for i16 that allows the caller to set LLVM metadata on the call
422        * instruction. */
423       LLVMTypeRef ftype = LLVMFunctionType(ctx->i16, &ctx->i16, 1, false);
424       LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);
425 
426       *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");
427    } else if (LLVMGetTypeKind(LLVMTypeOf(*pgpr)) == LLVMPointerTypeKind) {
428       LLVMTypeRef type = LLVMTypeOf(*pgpr);
429       LLVMTypeRef ftype = LLVMFunctionType(type, &type, 1, false);
430       LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);
431 
432       *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");
433    } else {
434       LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
435       LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);
436       LLVMTypeRef type = LLVMTypeOf(*pgpr);
437       unsigned bitsize = ac_get_elem_bits(ctx, type);
438       LLVMValueRef vgpr = *pgpr;
439       LLVMTypeRef vgpr_type;
440       unsigned vgpr_size;
441       LLVMValueRef vgpr0;
442 
443       if (bitsize < 32)
444          vgpr = LLVMBuildZExt(ctx->builder, vgpr, ctx->i32, "");
445 
446       vgpr_type = LLVMTypeOf(vgpr);
447       vgpr_size = ac_get_type_size(vgpr_type);
448 
449       assert(vgpr_size % 4 == 0);
450 
451       vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
452       vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
453       vgpr0 = LLVMBuildCall2(builder, ftype, inlineasm, &vgpr0, 1, "");
454       vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
455       vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
456 
457       if (bitsize < 32)
458          vgpr = LLVMBuildTrunc(builder, vgpr, type, "");
459 
460       *pgpr = vgpr;
461    }
462 }
463 
ac_build_shader_clock(struct ac_llvm_context * ctx,mesa_scope scope)464 LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, mesa_scope scope)
465 {
466    if (ctx->gfx_level >= GFX11 && scope == SCOPE_DEVICE) {
467       const char *name = "llvm.amdgcn.s.sendmsg.rtn.i64";
468       LLVMValueRef arg = LLVMConstInt(ctx->i32, 0x83 /* realtime */, 0);
469       LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, &arg, 1, 0);
470       return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
471    }
472 
473    const char *subgroup = "llvm.readcyclecounter";
474    const char *name = scope == SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : subgroup;
475 
476    LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0);
477    return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
478 }
479 
/* Build a wave-wide mask (type iN_wavemask) from the per-lane comparison
 * "value != 0", using llvm.amdgcn.icmp. An i1 \p value is zero-extended to
 * i32 first.
 */
LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *intr_name =
      ctx->wave_size == 64 ? "llvm.amdgcn.icmp.i64.i32" : "llvm.amdgcn.icmp.i32.i32";
   LLVMValueRef src = value;

   if (LLVMTypeOf(src) == ctx->i1)
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef icmp_args[3] = {src, ctx->i32_0, LLVMConstInt(ctx->i32, LLVMIntNE, 0)};

   /* There is currently no other way to keep LLVM from lifting the icmp
    * call to a dominating basic block.
    */
   ac_build_optimization_barrier(ctx, &icmp_args[0], false);

   icmp_args[0] = ac_to_integer(ctx, icmp_args[0]);

   return ac_build_intrinsic(ctx, intr_name, ctx->iN_wavemask, icmp_args, 3, 0);
}
503 
/* Build a wave-wide mask (type iN_wavemask) from the per-lane comparison
 * "value != false" of an i1 \p value, using llvm.amdgcn.icmp.
 */
LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *intr_name =
      ctx->wave_size == 64 ? "llvm.amdgcn.icmp.i64.i1" : "llvm.amdgcn.icmp.i32.i1";

   LLVMValueRef icmp_args[3];
   icmp_args[0] = value;
   icmp_args[1] = ctx->i1false;
   icmp_args[2] = LLVMConstInt(ctx->i32, LLVMIntNE, 0);

   return ac_build_intrinsic(ctx, intr_name, ctx->iN_wavemask, icmp_args, 3, 0);
}
521 
/* Return an i1 that is true when the ballot of \p value equals the ballot of
 * the constant 1.
 */
LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef all_lanes = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef voting_lanes = ac_build_ballot(ctx, value);

   return LLVMBuildICmp(ctx->builder, LLVMIntEQ, voting_lanes, all_lanes, "");
}
528 
/* Return an i1 that is true when the ballot of \p value is non-zero. */
LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef voting_lanes = ac_build_ballot(ctx, value);
   LLVMValueRef no_lanes = LLVMConstInt(ctx->iN_wavemask, 0, 0);

   return LLVMBuildICmp(ctx->builder, LLVMIntNE, voting_lanes, no_lanes, "");
}
535 
/* Return an i1 that is true when the ballot of \p value is either equal to
 * the ballot of the constant 1 or zero.
 */
LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef all_lanes = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef voting_lanes = ac_build_ballot(ctx, value);

   LLVMValueRef is_all = LLVMBuildICmp(builder, LLVMIntEQ, voting_lanes, all_lanes, "");
   LLVMValueRef no_lanes = LLVMConstInt(ctx->iN_wavemask, 0, 0);
   LLVMValueRef is_none = LLVMBuildICmp(builder, LLVMIntEQ, voting_lanes, no_lanes, "");

   return LLVMBuildOr(builder, is_all, is_none, "");
}
546 
/* Gather values[component .. component + value_count) into one vector.
 *
 * A single value is returned as-is instead of a 1-element vector;
 * value_count must not be 0.
 */
LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                            unsigned value_count, unsigned component)
{
   if (!value_count)
      unreachable("value_count is 0");

   LLVMValueRef *first = values + component;

   if (value_count == 1)
      return first[0];

   /* The element type of the result is taken from the first gathered value. */
   LLVMValueRef result = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(first[0]), value_count));

   for (unsigned lane = 0; lane < value_count; lane++) {
      LLVMValueRef lane_idx = LLVMConstInt(ctx->i32, lane, false);

      result = LLVMBuildInsertElement(ctx->builder, result, first[lane], lane_idx, "");
   }
   return result;
}
567 
/* Gather value_count values, read from \p values with a stride of
 * \p value_stride entries, into one vector.
 *
 * A single value is returned as-is unless \p always_vector is set;
 * value_count must not be 0.
 */
LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                             unsigned value_count, unsigned value_stride,
                                             bool always_vector)
{
   if (!value_count)
      unreachable("value_count is 0");

   if (value_count == 1 && !always_vector)
      return values[0];

   /* The element type of the result is taken from the first value. */
   LLVMValueRef result = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(values[0]), value_count));

   for (unsigned lane = 0; lane < value_count; lane++) {
      LLVMValueRef lane_idx = LLVMConstInt(ctx->i32, lane, false);

      result = LLVMBuildInsertElement(ctx->builder, result, values[lane * value_stride],
                                      lane_idx, "");
   }
   return result;
}
591 
/* Gather value_count consecutive values into a vector; a single value is
 * returned as a scalar.
 */
LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                    unsigned value_count)
{
   const unsigned stride = 1;
   const bool always_vector = false;

   return ac_build_gather_values_extended(ctx, values, value_count, stride, always_vector);
}
597 
/* Concatenate the components of \p a and \p b into one vector.
 * If \p a is NULL, \p b is returned unchanged.
 */
LLVMValueRef ac_build_concat(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   if (a == NULL)
      return b;

   const unsigned num_a = ac_get_llvm_num_components(a);
   const unsigned num_b = ac_get_llvm_num_components(b);
   const unsigned total = num_a + num_b;
   LLVMValueRef *parts = alloca(total * sizeof(LLVMValueRef));

   /* Components of a first, then those of b. */
   for (unsigned i = 0; i < total; i++) {
      if (i < num_a)
         parts[i] = ac_llvm_extract_elem(ctx, a, i);
      else
         parts[i] = ac_llvm_extract_elem(ctx, b, i - num_a);
   }

   return ac_build_gather_values(ctx, parts, total);
}
614 
615 /* Expand a scalar or vector to <dst_channels x type> by filling the remaining
616  * channels with undef. Extract at most src_channels components from the input.
617  */
ac_build_expand(struct ac_llvm_context * ctx,LLVMValueRef value,unsigned src_channels,unsigned dst_channels)618 LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,
619                              unsigned src_channels, unsigned dst_channels)
620 {
621    LLVMTypeRef elemtype;
622    LLVMValueRef *const chan = alloca(dst_channels * sizeof(LLVMValueRef));
623 
624    if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
625       unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
626 
627       if (src_channels == dst_channels && vec_size == dst_channels)
628          return value;
629 
630       src_channels = MIN2(src_channels, vec_size);
631 
632       for (unsigned i = 0; i < src_channels; i++)
633          chan[i] = ac_llvm_extract_elem(ctx, value, i);
634 
635       elemtype = LLVMGetElementType(LLVMTypeOf(value));
636    } else {
637       if (src_channels) {
638          assert(src_channels == 1);
639          chan[0] = value;
640       }
641       elemtype = LLVMTypeOf(value);
642    }
643 
644    for (unsigned i = src_channels; i < dst_channels; i++)
645       chan[i] = LLVMGetUndef(elemtype);
646 
647    return ac_build_gather_values(ctx, chan, dst_channels);
648 }
649 
650 /* Extract components [start, start + channels) from a vector.
651  */
ac_extract_components(struct ac_llvm_context * ctx,LLVMValueRef value,unsigned start,unsigned channels)652 LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start,
653                                    unsigned channels)
654 {
655    LLVMValueRef *const chan = alloca(channels * sizeof(LLVMValueRef));
656 
657    for (unsigned i = 0; i < channels; i++)
658       chan[i] = ac_llvm_extract_elem(ctx, value, i + start);
659 
660    return ac_build_gather_values(ctx, chan, channels);
661 }
662 
663 /* Expand a scalar or vector to <4 x type> by filling the remaining channels
664  * with undef. Extract at most num_channels components from the input.
665  */
ac_build_expand_to_vec4(struct ac_llvm_context * ctx,LLVMValueRef value,unsigned num_channels)666 LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value,
667                                      unsigned num_channels)
668 {
669    return ac_build_expand(ctx, value, num_channels, 4);
670 }
671 
/* Emit llvm.rint for \p value. The f16/f32/f64 variant is chosen from the
 * byte size of the value's type (2, 4, anything else).
 */
LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMTypeRef type = LLVMTypeOf(value);
   const char *intr_name;

   switch (ac_get_type_size(type)) {
   case 2:
      intr_name = "llvm.rint.f16";
      break;
   case 4:
      intr_name = "llvm.rint.f32";
      break;
   default:
      intr_name = "llvm.rint.f64";
      break;
   }

   return ac_build_intrinsic(ctx, intr_name, type, &value, 1, 0);
}
686 
/* Compute num / den as num * rcp(den), using llvm.amdgcn.rcp. The
 * f16/f32/f64 variant is chosen from the byte size of den's type
 * (2, 4, anything else).
 */
LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den)
{
   LLVMTypeRef den_type = LLVMTypeOf(den);
   const char *intr_name;

   switch (ac_get_type_size(den_type)) {
   case 2:
      intr_name = "llvm.amdgcn.rcp.f16";
      break;
   case 4:
      intr_name = "llvm.amdgcn.rcp.f32";
      break;
   default:
      intr_name = "llvm.amdgcn.rcp.f64";
      break;
   }

   LLVMValueRef reciprocal = ac_build_intrinsic(ctx, intr_name, den_type, &den, 1, 0);

   return LLVMBuildFMul(ctx->builder, num, reciprocal, "");
}
704 
705 /* See fast_idiv_by_const.h. */
706 /* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
ac_build_fast_udiv(struct ac_llvm_context * ctx,LLVMValueRef num,LLVMValueRef multiplier,LLVMValueRef pre_shift,LLVMValueRef post_shift,LLVMValueRef increment)707 LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, LLVMValueRef num,
708                                 LLVMValueRef multiplier, LLVMValueRef pre_shift,
709                                 LLVMValueRef post_shift, LLVMValueRef increment)
710 {
711    LLVMBuilderRef builder = ctx->builder;
712 
713    num = LLVMBuildLShr(builder, num, pre_shift, "");
714    num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
715                       LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
716    num = LLVMBuildAdd(builder, num, LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
717    num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
718    num = LLVMBuildTrunc(builder, num, ctx->i32, "");
719    return LLVMBuildLShr(builder, num, post_shift, "");
720 }
721 
722 /* See fast_idiv_by_const.h. */
723 /* If num != UINT_MAX, this more efficient version can be used. */
724 /* Set: increment = util_fast_udiv_info::increment; */
ac_build_fast_udiv_nuw(struct ac_llvm_context * ctx,LLVMValueRef num,LLVMValueRef multiplier,LLVMValueRef pre_shift,LLVMValueRef post_shift,LLVMValueRef increment)725 LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, LLVMValueRef num,
726                                     LLVMValueRef multiplier, LLVMValueRef pre_shift,
727                                     LLVMValueRef post_shift, LLVMValueRef increment)
728 {
729    LLVMBuilderRef builder = ctx->builder;
730 
731    num = LLVMBuildLShr(builder, num, pre_shift, "");
732    num = LLVMBuildNUWAdd(builder, num, increment, "");
733    num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
734                       LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
735    num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
736    num = LLVMBuildTrunc(builder, num, ctx->i32, "");
737    return LLVMBuildLShr(builder, num, post_shift, "");
738 }
739 
740 /* See fast_idiv_by_const.h. */
741 /* Both operands must fit in 31 bits and the divisor must not be 1. */
ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context * ctx,LLVMValueRef num,LLVMValueRef multiplier,LLVMValueRef post_shift)742 LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, LLVMValueRef num,
743                                               LLVMValueRef multiplier, LLVMValueRef post_shift)
744 {
745    LLVMBuilderRef builder = ctx->builder;
746 
747    num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
748                       LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
749    num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
750    num = LLVMBuildTrunc(builder, num, ctx->i32, "");
751    return LLVMBuildLShr(builder, num, post_shift, "");
752 }
753 
/* Emit the intrinsic sequence that interpolates one f32 channel of a
 * fragment shader input at the barycentric coordinates (i, j).
 *
 * GFX11 and newer: llvm.amdgcn.lds.param.load followed by
 * interp.inreg.p10 and interp.inreg.p2.
 * Older chips: interp.p1 followed by interp.p2.
 */
LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                LLVMValueRef j)
{
   if (ctx->gfx_level < GFX11) {
      LLVMValueRef p1_args[4] = {i, llvm_chan, attr_number, params};
      LLVMValueRef p1 =
         ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", ctx->f32, p1_args, 4, 0);

      LLVMValueRef p2_args[5] = {p1, j, llvm_chan, attr_number, params};
      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", ctx->f32, p2_args, 5, 0);
   }

   LLVMValueRef load_args[3] = {llvm_chan, attr_number, params};
   LLVMValueRef p =
      ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load", ctx->f32, load_args, 3, 0);

   /* The loaded parameter is passed as both the first and third operand. */
   LLVMValueRef p10_args[3] = {p, i, p};
   LLVMValueRef p10 =
      ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p10", ctx->f32, p10_args, 3, 0);

   LLVMValueRef p2_args[3] = {p, j, p10};
   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p2", ctx->f32, p2_args, 3, 0);
}
806 
/* 16-bit counterpart of ac_build_fs_interp(): interpolates one channel at
 * (i, j) and returns an f16 value.
 *
 * \param high_16bits  Forwarded to the intrinsics as an i1 constant
 *                     (presumably selects the upper half of the 32-bit
 *                     attribute dword - confirm against the LLVM docs).
 */
LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                    LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                    LLVMValueRef j, bool high_16bits)
{
   LLVMValueRef high = high_16bits ? ctx->i1true : ctx->i1false;

   if (ctx->gfx_level < GFX11) {
      LLVMValueRef p1_args[5] = {i, llvm_chan, attr_number, high, params};
      LLVMValueRef p1 =
         ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", ctx->f32, p1_args, 5, 0);

      LLVMValueRef p2_args[6] = {p1, j, llvm_chan, attr_number, high, params};
      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", ctx->f16, p2_args, 6, 0);
   }

   LLVMValueRef load_args[3] = {llvm_chan, attr_number, params};
   LLVMValueRef p =
      ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load", ctx->f32, load_args, 3, 0);

   /* The intermediate p10 result is still f32; only the final step is f16. */
   LLVMValueRef p10_args[4] = {p, i, p, high};
   LLVMValueRef p10 =
      ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p10.f16", ctx->f32, p10_args, 4, 0);

   LLVMValueRef p2_args[4] = {p, j, p10, high};
   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p2.f16", ctx->f16, p2_args, 4, 0);
}
863 
/* Fetch a fragment shader input channel without interpolation.
 *
 * \param parameter  On GFX11+ it is used as the quad-swizzle lane selector
 *                   for all four lanes; on older chips it is remapped to
 *                   (parameter + 2) % 3 and passed to llvm.amdgcn.interp.mov.
 */
LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, unsigned parameter,
                                    LLVMValueRef llvm_chan, LLVMValueRef attr_number,
                                    LLVMValueRef params)
{
   if (ctx->gfx_level < GFX11) {
      LLVMValueRef mov_args[4] = {
         LLVMConstInt(ctx->i32, (parameter + 2) % 3, 0),
         llvm_chan,
         attr_number,
         params,
      };

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", ctx->f32, mov_args, 4, 0);
   }

   LLVMValueRef load_args[3] = {llvm_chan, attr_number, params};
   LLVMValueRef value =
      ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load", ctx->f32, load_args, 3, 0);

   /* WQM is applied both before and after the swizzle. */
   value = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32, &value, 1, 0);
   value = ac_build_quad_swizzle(ctx, value, parameter, parameter, parameter, parameter);
   return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32, &value, 1, 0);
}
891 
/* Build a single-index GEP: the address of element "index" of an array of
 * "type" starting at base_ptr.
 */
LLVMValueRef ac_build_gep_ptr(struct ac_llvm_context *ctx, LLVMTypeRef type, LLVMValueRef base_ptr,
                              LLVMValueRef index)
{
   LLVMValueRef indices[1] = {index};

   return LLVMBuildGEP2(ctx->builder, type, base_ptr, indices, 1, "");
}
897 
/* Return the type that a "gep0" (GEP with indices {0, index}) into
 * pointee_type points to.
 *
 * Only pointer, array and struct types are expected; anything else asserts
 * and returns NULL. For structs, index must be a constant integer because it
 * selects the field.
 */
LLVMTypeRef ac_build_gep0_type(LLVMTypeRef pointee_type, LLVMValueRef index)
{
   const LLVMTypeKind kind = LLVMGetTypeKind(pointee_type);

   if (kind == LLVMPointerTypeKind) {
      return pointee_type;
   }

   if (kind == LLVMArrayTypeKind) {
      /* A GEP into a pointer-to-array yields a pointer to the array's
       * element type.
       */
      return LLVMGetElementType(pointee_type);
   }

   if (kind == LLVMStructTypeKind) {
      /* A GEP into a pointer-to-struct yields a pointer to the index-th
       * field, so return that field's type.
       */
      return LLVMStructGetTypeAtIndex(pointee_type, LLVMConstIntGetZExtValue(index));
   }

   /* gep0 shouldn't receive any other types. */
   assert(false);
   return NULL;
}
919 
/* Build a two-index GEP {0, index} into the aggregate that ptr points to,
 * using ptr.t as the pointee type and ptr.v as the base address.
 */
LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index)
{
   /* The leading zero steps through the pointer itself. */
   LLVMValueRef gep_indices[2] = {ctx->i32_0, index};

   return LLVMBuildGEP2(ctx->builder, ptr.t, ptr.v, gep_indices, 2, "");
}
929 
/* Advance ptr by "index" elements of "type" using a single-index GEP. */
LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMTypeRef type, LLVMValueRef ptr, LLVMValueRef index)
{
   LLVMValueRef indices[1] = {index};

   return LLVMBuildGEP2(ctx->builder, type, ptr, indices, 1, "");
}
934 
/* Store value into element "index" of the aggregate ptr points to; the
 * address is computed with ac_build_gep0().
 */
void ac_build_indexed_store(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index,
                            LLVMValueRef value)
{
   LLVMValueRef element_ptr = ac_build_gep0(ctx, ptr, index);

   LLVMBuildStore(ctx->builder, value, element_ptr);
}
940 
941 /**
942  * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
943  * It's equivalent to doing a load from &base_ptr[index].
944  *
945  * \param base_ptr  Where the array starts.
946  * \param index     The element index into the array.
947  * \param uniform   Whether the base_ptr and index can be assumed to be
948  *                  dynamically uniform (i.e. load to an SGPR)
949  * \param invariant Whether the load is invariant (no other opcodes affect it)
950  * \param no_unsigned_wraparound
951  *    For all possible re-associations and re-distributions of an expression
952  *    "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
953  *    without inbounds in base_ptr), this parameter is true if "addr + offset"
954  *    does not result in an unsigned integer wraparound. This is used for
955  *    optimal code generation of 32-bit pointer arithmetic.
956  *
957  *    For example, a 32-bit immediate offset that causes a 32-bit unsigned
958  *    integer wraparound can't be an imm offset in s_load_dword, because
959  *    the instruction performs "addr + offset" in 64 bits.
960  *
961  *    Expected usage for bindless textures by chaining GEPs:
962  *      // possible unsigned wraparound, don't use InBounds:
963  *      ptr1 = LLVMBuildGEP(base_ptr, index);
964  *      image = load(ptr1); // becomes "s_load ptr1, 0"
965  *
966  *      ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
967  *      sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
968  */
ac_build_load_custom(struct ac_llvm_context * ctx,LLVMTypeRef type,LLVMValueRef base_ptr,LLVMValueRef index,bool uniform,bool invariant,bool no_unsigned_wraparound)969 static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMTypeRef type,
970                                          LLVMValueRef base_ptr, LLVMValueRef index,
971                                          bool uniform, bool invariant, bool no_unsigned_wraparound)
972 {
973    LLVMValueRef pointer, result;
974 
975    if (no_unsigned_wraparound &&
976        LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
977       pointer = LLVMBuildInBoundsGEP2(ctx->builder, type, base_ptr, &index, 1, "");
978    else
979       pointer = LLVMBuildGEP2(ctx->builder, type, base_ptr, &index, 1, "");
980 
981    if (uniform)
982       LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
983    result = LLVMBuildLoad2(ctx->builder, type, pointer, "");
984    if (invariant)
985       LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
986    LLVMSetAlignment(result, 4);
987    return result;
988 }
989 
/* Plain indexed load of element "index" from ptr: not marked uniform, not
 * marked invariant, and with no wraparound guarantee.
 */
LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index)
{
   const bool uniform = false;
   const bool invariant = false;
   const bool no_unsigned_wraparound = false;

   return ac_build_load_custom(ctx, ptr.t, ptr.v, index, uniform, invariant,
                               no_unsigned_wraparound);
}
994 
/* Indexed load of element "index" from ptr that is marked invariant, but
 * neither uniform nor free of unsigned wraparound.
 */
LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr,
                                     LLVMValueRef index)
{
   const bool uniform = false;
   const bool invariant = true;
   const bool no_unsigned_wraparound = false;

   return ac_build_load_custom(ctx, ptr.t, ptr.v, index, uniform, invariant,
                               no_unsigned_wraparound);
}
1000 
1001 /* This assumes that there is no unsigned integer wraparound during the address
1002  * computation, excluding all GEPs within base_ptr. */
ac_build_load_to_sgpr(struct ac_llvm_context * ctx,struct ac_llvm_pointer ptr,LLVMValueRef index)1003 LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr,
1004                                    LLVMValueRef index)
1005 {
1006    return ac_build_load_custom(ctx, ptr.t, ptr.v, index, true, true, true);
1007 }
1008 
1009 /* See ac_build_load_custom() documentation. */
ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context * ctx,struct ac_llvm_pointer ptr,LLVMValueRef index)1010 LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index)
1011 {
1012    return ac_build_load_custom(ctx, ptr.t, ptr.v, index, true, true, false);
1013 }
1014 
get_cache_flags(struct ac_llvm_context * ctx,enum gl_access_qualifier access)1015 static unsigned get_cache_flags(struct ac_llvm_context *ctx, enum gl_access_qualifier access)
1016 {
1017    return ac_get_hw_cache_flags(ctx->info, access).value;
1018 }
1019 
/* Emit an llvm.amdgcn.{raw,struct}.buffer.store[.format].<type> call.
 *
 * \param rsrc        Buffer descriptor; bitcast to v4i32 before use.
 * \param data        Value to store; its LLVM type selects the intrinsic
 *                    type suffix.
 * \param vindex      Optional index. When non-NULL the "struct" flavor is
 *                    used and vindex is passed as an extra operand;
 *                    when NULL the "raw" flavor is used.
 * \param voffset     Optional offset operand, NULL means 0.
 * \param soffset     Optional offset operand, NULL means 0.
 * \param access      Access qualifiers; ACCESS_TYPE_STORE is OR'ed in
 *                    before they are converted to cache flags.
 * \param use_format  Use the ".format" variant of the store.
 */
static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef data, LLVMValueRef vindex,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         enum gl_access_qualifier access, bool use_format)
{
   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = data;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   /* The index operand only exists in the "struct" flavor. */
   if (vindex)
      args[idx++] = vindex;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_STORE), 0);
   const char *indexing_kind = vindex ? "struct" : "raw";
   char name[256], type_name[8];

   ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", indexing_kind, type_name);
   }

   ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, 0);
}
1048 
/* Formatted buffer store of "data"; forwards to the common store helper with
 * no scalar offset and the ".format" intrinsic variant selected.
 */
void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
                                  LLVMValueRef vindex, LLVMValueRef voffset, enum gl_access_qualifier access)
{
   LLVMValueRef no_soffset = NULL;
   const bool use_format = true;

   ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, no_soffset, access, use_format);
}
1054 
1055 /* buffer_store_dword(,x2,x3,x4) <- the suffix is selected by the type of vdata. */
ac_build_buffer_store_dword(struct ac_llvm_context * ctx,LLVMValueRef rsrc,LLVMValueRef vdata,LLVMValueRef vindex,LLVMValueRef voffset,LLVMValueRef soffset,enum gl_access_qualifier access)1056 void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
1057                                  LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
1058                                  enum gl_access_qualifier access)
1059 {
1060    unsigned num_channels = ac_get_llvm_num_components(vdata);
1061 
1062    /* Split 3 channel stores if unsupported. */
1063    if (num_channels == 3 && !ac_has_vec3_support(ctx->gfx_level, false)) {
1064       LLVMValueRef v[3], v01, voffset2;
1065 
1066       for (int i = 0; i < 3; i++) {
1067          v[i] = LLVMBuildExtractElement(ctx->builder, vdata, LLVMConstInt(ctx->i32, i, 0), "");
1068       }
1069       v01 = ac_build_gather_values(ctx, v, 2);
1070 
1071       voffset2 = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0,
1072                               LLVMConstInt(ctx->i32, 8, 0), "");
1073 
1074       ac_build_buffer_store_dword(ctx, rsrc, v01, vindex, voffset, soffset, access);
1075       ac_build_buffer_store_dword(ctx, rsrc, v[2], vindex, voffset2, soffset, access);
1076       return;
1077    }
1078 
1079    ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), vindex, voffset, soffset,
1080                                 access, false);
1081 }
1082 
/* Emit an llvm.amdgcn.{raw,struct}.buffer.load[.format].<type> call.
 *
 * \param rsrc           Buffer descriptor; bitcast to v4i32 before use.
 * \param vindex         Optional index. Non-NULL selects the "struct"
 *                       flavor, NULL selects "raw".
 * \param voffset        Optional offset operand, NULL means 0.
 * \param soffset        Optional offset operand, NULL means 0.
 * \param num_channels   Number of channels in the returned value.
 * \param channel_type   LLVM type of one channel.
 * \param access         Access qualifiers; ACCESS_TYPE_LOAD is OR'ed in.
 * \param can_speculate  Adds AC_ATTR_INVARIANT_LOAD to the intrinsic call.
 * \param use_format     Use the ".format" variant of the load.
 *
 * When 3 channels are requested but ac_has_vec3_support() reports no vec3
 * support, 4 channels are loaded and the result is trimmed back to 3.
 */
static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                                LLVMValueRef vindex, LLVMValueRef voffset,
                                                LLVMValueRef soffset, unsigned num_channels,
                                                LLVMTypeRef channel_type, enum gl_access_qualifier access,
                                                bool can_speculate, bool use_format)
{
   LLVMValueRef operands[5];
   int count = 0;

   operands[count++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (vindex) {
      operands[count++] = vindex;
   }
   operands[count++] = voffset ? voffset : ctx->i32_0;
   operands[count++] = soffset ? soffset : ctx->i32_0;
   operands[count++] =
      LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD), 0);

   /* Number of channels actually fetched by the intrinsic. */
   const bool widen_vec3 = !ac_has_vec3_support(ctx->gfx_level, use_format) && num_channels == 3;
   const unsigned fetched_channels = widen_vec3 ? 4 : num_channels;

   /* D16 is only supported on gfx8+ */
   assert(!use_format || (channel_type != ctx->f16 && channel_type != ctx->i16) ||
          ctx->gfx_level >= GFX8);

   LLVMTypeRef fetch_type =
      fetched_channels > 1 ? LLVMVectorType(channel_type, fetched_channels) : channel_type;

   char type_name[8];
   ac_build_type_name_for_intr(fetch_type, type_name, sizeof(type_name));

   const char *flavor = vindex ? "struct" : "raw";
   char intr_name[256];
   if (use_format) {
      snprintf(intr_name, sizeof(intr_name), "llvm.amdgcn.%s.buffer.load.format.%s", flavor,
               type_name);
   } else {
      snprintf(intr_name, sizeof(intr_name), "llvm.amdgcn.%s.buffer.load.%s", flavor, type_name);
   }

   LLVMValueRef loaded = ac_build_intrinsic(ctx, intr_name, fetch_type, operands, count,
                                            can_speculate ? AC_ATTR_INVARIANT_LOAD : 0);

   /* Drop the extra channel that was only fetched to avoid a vec3 load. */
   if (fetched_channels > num_channels) {
      loaded = ac_trim_vector(ctx, loaded, num_channels);
   }
   return loaded;
}
1122 
/* Load num_channels values of channel_type from a buffer.
 *
 * \param rsrc           Buffer descriptor.
 * \param num_channels   Number of channels to load.
 * \param vindex         Optional index; must be NULL on the SMEM path.
 * \param voffset        Optional offset, NULL means 0 on both paths.
 * \param soffset        Optional offset, NULL means 0.
 * \param channel_type   LLVM type of one channel.
 * \param access         Access qualifiers used to derive cache flags.
 * \param can_speculate  Forwarded to the VMEM load helper.
 * \param allow_smem     Permit llvm.amdgcn.s.buffer.load; it is used unless
 *                       the access is ACCESS_COHERENT on a chip older than
 *                       GFX8.
 *
 * The SMEM path issues one scalar load per channel and gathers the results.
 * The VMEM path splits the request into loads of at most 4 channels and
 * concatenates them.
 */
LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
                                  LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                  LLVMTypeRef channel_type, enum gl_access_qualifier access,
                                  bool can_speculate, bool allow_smem)
{
   /* Treat a missing voffset as 0 on every path, matching the other buffer
    * helpers; LLVMBuildAdd must not be handed a NULL operand.
    */
   LLVMValueRef base_voffset = voffset ? voffset : ctx->i32_0;

   if (allow_smem && (!(access & ACCESS_COHERENT) || ctx->gfx_level >= GFX8)) {
      assert(vindex == NULL);

      LLVMValueRef result[32];

      /* One entry of result[] is written per channel. */
      assert(num_channels <= (int)(sizeof(result) / sizeof(result[0])));

      LLVMValueRef offset = base_voffset;
      if (soffset)
         offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");

      char name[256], type_name[8];
      ac_build_type_name_for_intr(channel_type, type_name, sizeof(type_name));
      snprintf(name, sizeof(name), "llvm.amdgcn.s.buffer.load.%s", type_name);

      LLVMValueRef channel_size = LLVMConstInt(ctx->i32, ac_get_type_size(channel_type), 0);

      for (int i = 0; i < num_channels; i++) {
         /* Every channel after the first is one channel size further. */
         if (i) {
            offset = LLVMBuildAdd(ctx->builder, offset, channel_size, "");
         }
         LLVMValueRef args[3] = {
            rsrc,
            offset,
            LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD |
                                                        ACCESS_TYPE_SMEM), 0),
         };
         result[i] = ac_build_intrinsic(ctx, name, channel_type, args, 3, AC_ATTR_INVARIANT_LOAD);
      }
      if (num_channels == 1)
         return result[0];

      return ac_build_gather_values(ctx, result, num_channels);
   }

   /* LLVM is unable to select instructions for num_channels > 4, so we
    * workaround that by manually splitting larger buffer loads.
    */
   LLVMValueRef result = NULL;
   for (unsigned i = 0, fetch_num_channels; i < num_channels; i += fetch_num_channels) {
      fetch_num_channels = MIN2(4, num_channels - i);
      LLVMValueRef fetch_voffset =
            LLVMBuildAdd(ctx->builder, base_voffset,
                         LLVMConstInt(ctx->i32, i * ac_get_type_size(channel_type), 0), "");
      LLVMValueRef item =
         ac_build_buffer_load_common(ctx, rsrc, vindex, fetch_voffset, soffset, fetch_num_channels,
                                     channel_type, access, can_speculate, false);
      result = ac_build_concat(ctx, result, item);
   }

   return result;
}
1178 
/* Formatted buffer load of num_channels channels.
 *
 * \param d16  Load f16 channels instead of f32. Not allowed together with tfe.
 * \param tfe  Emit the load as inline assembly with the "tfe" modifier. The
 *             result is then the first num_channels channels with element 4
 *             of the 5-dword asm result appended (presumably the TFE status
 *             dword - confirm against the ISA docs).
 *
 * Without tfe this forwards to the common load helper with a zero soffset
 * and the ".format" intrinsic variant.
 */
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef vindex, LLVMValueRef voffset,
                                         unsigned num_channels, enum gl_access_qualifier access,
                                         bool can_speculate, bool d16, bool tfe)
{
   if (!tfe) {
      return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
                                         num_channels, d16 ? ctx->f16 : ctx->f32, access,
                                         can_speculate, true);
   }

   assert(!d16);

   const unsigned cache_flags = get_cache_flags(ctx, access | ACCESS_TYPE_LOAD);
   const char *glc = cache_flags & ac_glc ? "glc" : "";
   const char *slc = cache_flags & ac_slc ? "slc" : "";
   const char *dlc = cache_flags & ac_dlc ? "dlc" : "";

   /* The definition in the assembly and the one in the constraint string
    * differs because of an assembler bug.
    */
   char asm_text[256];
   snprintf(asm_text, sizeof(asm_text),
            "v_mov_b32 v0, 0\n"
            "v_mov_b32 v1, 0\n"
            "v_mov_b32 v2, 0\n"
            "v_mov_b32 v3, 0\n"
            "v_mov_b32 v4, 0\n"
            "buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe %s\n"
            "s_waitcnt vmcnt(0)",
            glc, slc, dlc);

   /* Signature of the asm blob: <5 x f32> (<2 x i32> address, <4 x i32> rsrc). */
   LLVMTypeRef asm_param_types[] = {ctx->v2i32, ctx->v4i32};
   LLVMTypeRef asm_type =
      LLVMFunctionType(LLVMVectorType(ctx->f32, 5), asm_param_types, 2, false);
   LLVMValueRef asm_fn = LLVMConstInlineAsm(asm_type, asm_text, "=&{v[0:4]},v,s", false, false);

   /* The asm always uses idxen + offen, so missing components become 0. */
   LLVMValueRef address_parts[2] = {vindex ? vindex : ctx->i32_0,
                                    voffset ? voffset : ctx->i32_0};
   LLVMValueRef address = ac_build_gather_values(ctx, address_parts, 2);
   LLVMValueRef descriptor = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");

   LLVMValueRef call_args[] = {address, descriptor};
   LLVMValueRef raw = LLVMBuildCall2(ctx->builder, asm_type, asm_fn, call_args, 2, "");

   return ac_build_concat(ctx, ac_trim_vector(ctx, raw, num_channels),
                          ac_llvm_extract_elem(ctx, raw, 4));
}
1224 
/* Emit an llvm.amdgcn.{raw,struct}.tbuffer.load.<type> call.
 *
 * \param rsrc            Buffer descriptor; bitcast to v4i32 before use.
 * \param vindex          Optional index. Non-NULL selects the "struct"
 *                        flavor, NULL selects "raw".
 * \param voffset         Optional offset operand, NULL means 0.
 * \param soffset         Optional offset operand, NULL means 0.
 * \param num_channels    Number of channels in the returned value.
 * \param tbuffer_format  Format value passed to the intrinsic as an i32.
 * \param channel_type    LLVM type of one channel.
 * \param access          Access qualifiers; ACCESS_TYPE_LOAD is OR'ed in.
 * \param can_speculate   Adds AC_ATTR_INVARIANT_LOAD to the intrinsic call.
 */
static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, unsigned num_channels,
                                          unsigned tbuffer_format, LLVMTypeRef channel_type,
                                          enum gl_access_qualifier access, bool can_speculate)
{
   LLVMValueRef operands[6];
   int count = 0;

   operands[count++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (vindex) {
      operands[count++] = vindex;
   }
   operands[count++] = voffset ? voffset : ctx->i32_0;
   operands[count++] = soffset ? soffset : ctx->i32_0;
   operands[count++] = LLVMConstInt(ctx->i32, tbuffer_format, 0);
   operands[count++] =
      LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD), 0);

   LLVMTypeRef result_type = channel_type;
   if (num_channels > 1) {
      result_type = LLVMVectorType(channel_type, num_channels);
   }

   char type_name[8];
   ac_build_type_name_for_intr(result_type, type_name, sizeof(type_name));

   char intr_name[256];
   snprintf(intr_name, sizeof(intr_name), "llvm.amdgcn.%s.tbuffer.load.%s",
            vindex ? "struct" : "raw", type_name);

   return ac_build_intrinsic(ctx, intr_name, result_type, operands, count,
                             can_speculate ? AC_ATTR_INVARIANT_LOAD : 0);
}
1251 
/* Typed buffer load that is split into several MTBUF instructions so that
 * each one only fetches as much as ac_get_safe_fetch_size() allows for the
 * known alignment.
 *
 * \param rsrc              Buffer descriptor.
 * \param vidx              Optional index, forwarded to each load.
 * \param base_voffset      Base offset; const_offset is added to it.
 * \param soffset           Optional offset, forwarded to each load.
 * \param format            Vertex format; selects the per-channel-count
 *                          hardware formats and the channel byte size.
 * \param channel_bit_size  When 16, each loaded 32-bit channel is narrowed
 *                          to 16 bits after the load.
 * \param const_offset      Constant byte offset of the first channel.
 * \param align_offset      Together with align_mul, describes the known
 * \param align_mul         alignment of the address; must be non-zero
 *                          because it is used as a modulus.
 * \param num_channels      Number of channels to load.
 * \param access            Access qualifiers used to derive cache flags.
 * \param can_speculate     Forwarded to each load.
 */
LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                        LLVMValueRef vidx, LLVMValueRef base_voffset,
                                        LLVMValueRef soffset,
                                        const enum pipe_format format,
                                        unsigned channel_bit_size,
                                        unsigned const_offset,
                                        unsigned align_offset,
                                        unsigned align_mul,
                                        unsigned num_channels,
                                        enum gl_access_qualifier access,
                                        bool can_speculate)
{
   const struct ac_vtx_format_info *vtx_info = ac_get_vtx_format_info(ctx->gfx_level, ctx->info->family, format);
   const unsigned max_channels = vtx_info->num_channels;
   LLVMValueRef voffset_plus_const =
      LLVMBuildAdd(ctx->builder, base_voffset, LLVMConstInt(ctx->i32, const_offset, 0), "");

   /* Split the specified load into several MTBUF instructions,
    * according to a safe fetch size determined by alignment information.
    */
   LLVMValueRef result = NULL;
   for (unsigned i = 0, fetch_num_channels; i < num_channels; i += fetch_num_channels) {
      /* Packed formats (determined here by chan_byte_size == 0) should never be split. */
      assert(i == 0 || vtx_info->chan_byte_size);

      const unsigned fetch_const_offset = const_offset + i * vtx_info->chan_byte_size;
      const unsigned fetch_align_offset = (align_offset + i * vtx_info->chan_byte_size) % align_mul;
      /* Lowest set bit of the misalignment, or the full alignment if there
       * is none. The shift is done on an unsigned 1 so that bit 31 is well
       * defined.
       */
      const unsigned fetch_alignment =
         fetch_align_offset ? 1u << (ffs(fetch_align_offset) - 1) : align_mul;

      fetch_num_channels =
         ac_get_safe_fetch_size(ctx->gfx_level, vtx_info, fetch_const_offset,
                                max_channels - i, fetch_alignment, num_channels - i);
      const unsigned fetch_format = vtx_info->hw_format[fetch_num_channels - 1];
      LLVMValueRef fetch_voffset =
            LLVMBuildAdd(ctx->builder, voffset_plus_const,
                         LLVMConstInt(ctx->i32, i * vtx_info->chan_byte_size, 0), "");
      LLVMValueRef item =
         ac_build_tbuffer_load(ctx, rsrc, vidx, fetch_voffset, soffset,
                               fetch_num_channels, fetch_format, ctx->i32,
                               access, can_speculate);
      result = ac_build_concat(ctx, result, item);
   }

   /*
    * LLVM is not able to select 16-bit typed loads. Load 32-bit values instead and
    * manually truncate them to the required size.
    * TODO: Do this in NIR instead.
    */
   const struct util_format_description *desc = util_format_description(format);
   bool is_float = !desc->channel[0].pure_integer;

   if (channel_bit_size == 16) {
      LLVMValueRef channels[4];

      /* One entry of channels[] is written per loaded channel. */
      assert(num_channels <= sizeof(channels) / sizeof(channels[0]));

      for (unsigned i = 0; i < num_channels; i++) {
         LLVMValueRef channel = result;
         if (num_channels > 1)
            channel = LLVMBuildExtractElement(ctx->builder, result, LLVMConstInt(ctx->i32, i, false), "");

         if (is_float) {
            /* Reinterpret as f32, convert to f16, and return the raw bits. */
            channel = LLVMBuildBitCast(ctx->builder, channel, ctx->f32, "");
            channel = LLVMBuildFPTrunc(ctx->builder, channel, ctx->f16, "");
            channel = LLVMBuildBitCast(ctx->builder, channel, ctx->i16, "");
         } else {
            channel = LLVMBuildTrunc(ctx->builder, channel, ctx->i16, "");
         }
         channels[i] = channel;
      }
      result = ac_build_gather_values(ctx, channels, num_channels);
   }

   return result;
}
1324 
1325 
/* Load a single i16 from a buffer using the "raw" (no vindex), non-format,
 * non-speculatable variant of the common load helper.
 */
LLVMValueRef ac_build_buffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                        LLVMValueRef voffset, LLVMValueRef soffset,
                                        enum gl_access_qualifier access)
{
   const unsigned num_channels = 1;
   const bool can_speculate = false;
   const bool use_format = false;

   return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, num_channels, ctx->i16,
                                      access, can_speculate, use_format);
}
1333 
/* Load a single i8 from a buffer using the "raw" (no vindex), non-format,
 * non-speculatable variant of the common load helper.
 */
LLVMValueRef ac_build_buffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                       LLVMValueRef voffset, LLVMValueRef soffset,
                                       enum gl_access_qualifier access)
{
   const unsigned num_channels = 1;
   const bool can_speculate = false;
   const bool use_format = false;

   return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, num_channels, ctx->i8,
                                      access, can_speculate, use_format);
}
1341 
/* Store a 16-bit value to a buffer. vdata is bitcast to i16 first and then
 * written with the "raw" (no vindex), non-format store.
 */
void ac_build_buffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                 LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
                                 enum gl_access_qualifier access)
{
   LLVMValueRef as_i16 = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
   const bool use_format = false;

   ac_build_buffer_store_common(ctx, rsrc, as_i16, NULL, voffset, soffset, access, use_format);
}
1350 
/* Store an 8-bit value to a buffer. vdata is bitcast to i8 first and then
 * written with the "raw" (no vindex), non-format store.
 */
void ac_build_buffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                LLVMValueRef voffset, LLVMValueRef soffset, enum gl_access_qualifier access)
{
   LLVMValueRef as_i8 = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
   const bool use_format = false;

   ac_build_buffer_store_common(ctx, rsrc, as_i8, NULL, voffset, soffset, access, use_format);
}
1358 
1359 /**
1360  * Set range metadata on an instruction.  This can only be used on load and
1361  * call instructions.  If you know an instruction can only produce the values
1362  * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1363  * \p lo is the minimum value inclusive.
1364  * \p hi is the maximum value exclusive.
1365  */
ac_set_range_metadata(struct ac_llvm_context * ctx,LLVMValueRef value,unsigned lo,unsigned hi)1366 void ac_set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,
1367                            unsigned hi)
1368 {
1369    LLVMValueRef range_md, md_args[2];
1370    LLVMTypeRef type = LLVMTypeOf(value);
1371    LLVMContextRef context = LLVMGetTypeContext(type);
1372 
1373    md_args[0] = LLVMConstInt(type, lo, false);
1374    md_args[1] = LLVMConstInt(type, hi, false);
1375    range_md = LLVMMDNodeInContext(context, md_args, 2);
1376    LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1377 }
1378 
ac_get_thread_id(struct ac_llvm_context * ctx)1379 LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx)
1380 {
1381    return ac_build_mbcnt(ctx, LLVMConstInt(ctx->iN_wavemask, ~0ull, 0));
1382 }
1383 
1384 /*
1385  * AMD GCN implements derivatives using the local data store (LDS)
1386  * All writes to the LDS happen in all executing threads at
1387  * the same time. TID is the Thread ID for the current
1388  * thread and is a value between 0 and 63, representing
1389  * the thread's position in the wavefront.
1390  *
1391  * For the pixel shader threads are grouped into quads of four pixels.
1392  * The TIDs of the pixels of a quad are:
1393  *
1394  *  +------+------+
1395  *  |4n + 0|4n + 1|
1396  *  +------+------+
1397  *  |4n + 2|4n + 3|
1398  *  +------+------+
1399  *
1400  * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
1401  * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
1402  * the current pixel's column, and masking with 0xfffffffe yields the TID
1403  * of the left pixel of the current pixel's row.
1404  *
1405  * Adding 1 yields the TID of the pixel to the right of the left pixel, and
1406  * adding 2 yields the TID of the pixel below the top pixel.
1407  */
ac_build_ddxy(struct ac_llvm_context * ctx,uint32_t mask,int idx,LLVMValueRef val)1408 LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int idx, LLVMValueRef val)
1409 {
1410    unsigned tl_lanes[4], trbl_lanes[4];
1411    char name[32], type[8];
1412    LLVMValueRef tl, trbl;
1413    LLVMTypeRef result_type;
1414    LLVMValueRef result;
1415 
1416    result_type = ac_to_float_type(ctx, LLVMTypeOf(val));
1417 
1418    if (result_type == ctx->f16)
1419       val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
1420    else if (result_type == ctx->v2f16)
1421       val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, "");
1422 
1423    for (unsigned i = 0; i < 4; ++i) {
1424       tl_lanes[i] = i & mask;
1425       trbl_lanes[i] = (i & mask) + idx;
1426    }
1427 
1428    tl = ac_build_quad_swizzle(ctx, val, tl_lanes[0], tl_lanes[1], tl_lanes[2], tl_lanes[3]);
1429    trbl =
1430       ac_build_quad_swizzle(ctx, val, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3]);
1431 
1432    if (result_type == ctx->f16) {
1433       tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
1434       trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
1435    }
1436 
1437    tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
1438    trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
1439    result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
1440 
1441    ac_build_type_name_for_intr(result_type, type, sizeof(type));
1442    snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);
1443 
1444    return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
1445 }
1446 
/* Emit llvm.amdgcn.s.sendmsg with \p imm as an i32 constant followed by
 * \p m0_content.
 */
void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t imm, LLVMValueRef m0_content)
{
   LLVMValueRef operands[2];

   operands[0] = LLVMConstInt(ctx->i32, imm, false);
   operands[1] = m0_content;

   ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, operands, 2, 0);
}
1454 
/* Signed find-MSB built on llvm.amdgcn.sffbh.i32.
 *
 * \p dst_type is used as the return type of the intrinsic call. The result
 * is -1 when \p arg is 0 or -1, otherwise 31 minus the intrinsic's result.
 */
LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
{
   LLVMBuilderRef b = ctx->builder;
   LLVMValueRef from_msb =
      ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, 0);

   /* The HW counts from the MSB, but NIR/TGSI wants the index counted from
    * the LSB, hence "31 - result". */
   LLVMValueRef from_lsb = LLVMBuildSub(b, LLVMConstInt(ctx->i32, 31, false), from_msb, "");

   LLVMValueRef minus_one = LLVMConstInt(ctx->i32, -1, true);
   LLVMValueRef is_zero = LLVMBuildICmp(b, LLVMIntEQ, arg, ctx->i32_0, "");
   LLVMValueRef is_minus_one = LLVMBuildICmp(b, LLVMIntEQ, arg, minus_one, "");
   LLVMValueRef no_msb = LLVMBuildOr(b, is_zero, is_minus_one, "");

   return LLVMBuildSelect(b, no_msb, minus_one, from_lsb, "");
}
1471 
/* Unsigned find-MSB built on llvm.ctlz for 8/16/32/64-bit operands.
 *
 * The operand width is taken from the type of \p arg; \p dst_type is not
 * used by this function. When \p rev is false the leading-zero count is
 * turned into an index from the LSB ("width - 1 - ctlz"); when \p rev is true
 * the raw count is kept. The value is then converted to i32, and -1 is
 * returned when \p arg is zero.
 */
LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type,
                           bool rev)
{
   LLVMBuilderRef b = ctx->builder;
   const unsigned bits = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
   const char *ctlz_name;
   LLVMTypeRef int_type;
   LLVMValueRef zero;

   switch (bits) {
   case 64:
      ctlz_name = "llvm.ctlz.i64";
      int_type = ctx->i64;
      zero = ctx->i64_0;
      break;
   case 32:
      ctlz_name = "llvm.ctlz.i32";
      int_type = ctx->i32;
      zero = ctx->i32_0;
      break;
   case 16:
      ctlz_name = "llvm.ctlz.i16";
      int_type = ctx->i16;
      zero = ctx->i16_0;
      break;
   case 8:
      ctlz_name = "llvm.ctlz.i8";
      int_type = ctx->i8;
      zero = ctx->i8_0;
      break;
   default:
      unreachable("invalid bitsize");
      break;
   }

   /* The second ctlz operand is i1 true; the zero input is handled by the
    * final select below. */
   LLVMValueRef ctlz_args[2] = {arg, ctx->i1true};
   LLVMValueRef msb = ac_build_intrinsic(ctx, ctlz_name, int_type, ctlz_args, 2, 0);

   if (!rev) {
      /* ctlz counts from the MSB, but TGSI/NIR wants the index from the LSB.
       * Invert it by doing "(width - 1) - ctlz". */
      LLVMValueRef top_bit = LLVMConstInt(int_type, bits - 1, false);
      msb = LLVMBuildSub(b, top_bit, msb, "");
   }

   /* Bring the result to 32 bits. */
   if (bits == 64)
      msb = LLVMBuildTrunc(b, msb, ctx->i32, "");
   else if (bits < 32)
      msb = LLVMBuildSExt(b, msb, ctx->i32, "");

   /* A zero input has no set bit: return -1. */
   LLVMValueRef is_zero = LLVMBuildICmp(b, LLVMIntEQ, arg, zero, "");
   return LLVMBuildSelect(b, is_zero, LLVMConstInt(ctx->i32, -1, true), msb, "");
}
1535 
/* Emit llvm.minnum.<type> on \p a and \p b, where the overload suffix and the
 * return type both come from the type of \p a.
 */
LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMTypeRef operand_type = LLVMTypeOf(a);
   LLVMValueRef operands[2] = {a, b};
   char type_suffix[64];
   char intr_name[64];

   ac_build_type_name_for_intr(operand_type, type_suffix, sizeof(type_suffix));
   snprintf(intr_name, sizeof(intr_name), "llvm.minnum.%s", type_suffix);

   return ac_build_intrinsic(ctx, intr_name, operand_type, operands, 2, 0);
}
1545 
/* Emit llvm.maxnum.<type> on \p a and \p b, where the overload suffix and the
 * return type both come from the type of \p a.
 */
LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMTypeRef operand_type = LLVMTypeOf(a);
   LLVMValueRef operands[2] = {a, b};
   char type_suffix[64];
   char intr_name[64];

   ac_build_type_name_for_intr(operand_type, type_suffix, sizeof(type_suffix));
   snprintf(intr_name, sizeof(intr_name), "llvm.maxnum.%s", type_suffix);

   return ac_build_intrinsic(ctx, intr_name, operand_type, operands, 2, 0);
}
1555 
/* Signed integer minimum: select \p a when a <= b (signed), else \p b. */
LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef a_le_b = LLVMBuildICmp(builder, LLVMIntSLE, a, b, "");

   return LLVMBuildSelect(builder, a_le_b, a, b, "");
}
1561 
/* Signed integer maximum: select \p a when a > b (signed), else \p b. */
LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef a_gt_b = LLVMBuildICmp(builder, LLVMIntSGT, a, b, "");

   return LLVMBuildSelect(builder, a_gt_b, a, b, "");
}
1567 
/* Unsigned integer minimum: select \p a when a <= b (unsigned), else \p b. */
LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef a_le_b = LLVMBuildICmp(builder, LLVMIntULE, a, b, "");

   return LLVMBuildSelect(builder, a_le_b, a, b, "");
}
1573 
/* Unsigned integer maximum: select \p a when a >= b (unsigned), else \p b. */
LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef a_ge_b = LLVMBuildICmp(builder, LLVMIntUGE, a, b, "");

   return LLVMBuildSelect(builder, a_ge_b, a, b, "");
}
1579 
/* Clamp \p value to [0, 1]: fmax against 0.0 first, then fmin against 1.0.
 * Both constants are created with the type of \p value.
 */
LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMTypeRef value_type = LLVMTypeOf(value);
   LLVMValueRef lower = LLVMConstReal(value_type, 0.0);
   LLVMValueRef upper = LLVMConstReal(value_type, 1.0);
   LLVMValueRef at_least_zero = ac_build_fmax(ctx, value, lower);

   return ac_build_fmin(ctx, at_least_zero, upper);
}
1586 
ac_build_export(struct ac_llvm_context * ctx,struct ac_export_args * a)1587 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
1588 {
1589    LLVMValueRef args[9];
1590 
1591    args[0] = LLVMConstInt(ctx->i32, a->target, 0);
1592    args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
1593 
1594    if (a->compr) {
1595       assert(ctx->gfx_level < GFX11);
1596 
1597       args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->v2i16, "");
1598       args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->v2i16, "");
1599       args[4] = LLVMConstInt(ctx->i1, a->done, 0);
1600       args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1601 
1602       ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", ctx->voidt, args, 6, 0);
1603    } else {
1604       args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->f32, "");
1605       args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->f32, "");
1606       args[4] = LLVMBuildBitCast(ctx->builder, a->out[2], ctx->f32, "");
1607       args[5] = LLVMBuildBitCast(ctx->builder, a->out[3], ctx->f32, "");
1608       args[6] = LLVMConstInt(ctx->i1, a->done, 0);
1609       args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1610 
1611       ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", ctx->voidt, args, 8, 0);
1612    }
1613 }
1614 
ac_build_export_null(struct ac_llvm_context * ctx,bool uses_discard)1615 void ac_build_export_null(struct ac_llvm_context *ctx, bool uses_discard)
1616 {
1617    struct ac_export_args args;
1618 
1619    /* Gfx10+ doesn't need to export anything if we don't need to export the EXEC mask
1620     * for discard.
1621     */
1622    if (ctx->gfx_level >= GFX10 && !uses_discard)
1623       return;
1624 
1625    args.enabled_channels = 0x0; /* enabled channels */
1626    args.valid_mask = 1;         /* whether the EXEC mask is valid */
1627    args.done = 1;               /* DONE bit */
1628    /* Gfx11 doesn't support null exports, and mrt0 should be exported instead. */
1629    args.target = ctx->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
1630    args.compr = 0;                       /* COMPR flag (0 = 32-bit export) */
1631    args.out[0] = LLVMGetUndef(ctx->f32); /* R */
1632    args.out[1] = LLVMGetUndef(ctx->f32); /* G */
1633    args.out[2] = LLVMGetUndef(ctx->f32); /* B */
1634    args.out[3] = LLVMGetUndef(ctx->f32); /* A */
1635 
1636    ac_build_export(ctx, &args);
1637 }
1638 
ac_num_coords(enum ac_image_dim dim)1639 static unsigned ac_num_coords(enum ac_image_dim dim)
1640 {
1641    switch (dim) {
1642    case ac_image_1d:
1643       return 1;
1644    case ac_image_2d:
1645    case ac_image_1darray:
1646       return 2;
1647    case ac_image_3d:
1648    case ac_image_cube:
1649    case ac_image_2darray:
1650    case ac_image_2dmsaa:
1651       return 3;
1652    case ac_image_2darraymsaa:
1653       return 4;
1654    default:
1655       unreachable("ac_num_coords: bad dim");
1656    }
1657 }
1658 
ac_num_derivs(enum ac_image_dim dim)1659 static unsigned ac_num_derivs(enum ac_image_dim dim)
1660 {
1661    switch (dim) {
1662    case ac_image_1d:
1663    case ac_image_1darray:
1664       return 2;
1665    case ac_image_2d:
1666    case ac_image_2darray:
1667    case ac_image_cube:
1668       return 4;
1669    case ac_image_3d:
1670       return 6;
1671    case ac_image_2dmsaa:
1672    case ac_image_2darraymsaa:
1673    default:
1674       unreachable("derivatives not supported");
1675    }
1676 }
1677 
get_atomic_name(enum ac_atomic_op op)1678 static const char *get_atomic_name(enum ac_atomic_op op)
1679 {
1680    switch (op) {
1681    case ac_atomic_swap:
1682       return "swap";
1683    case ac_atomic_add:
1684       return "add";
1685    case ac_atomic_sub:
1686       return "sub";
1687    case ac_atomic_smin:
1688       return "smin";
1689    case ac_atomic_umin:
1690       return "umin";
1691    case ac_atomic_smax:
1692       return "smax";
1693    case ac_atomic_umax:
1694       return "umax";
1695    case ac_atomic_and:
1696       return "and";
1697    case ac_atomic_or:
1698       return "or";
1699    case ac_atomic_xor:
1700       return "xor";
1701    case ac_atomic_inc_wrap:
1702       return "inc";
1703    case ac_atomic_dec_wrap:
1704       return "dec";
1705    case ac_atomic_fmin:
1706       return "fmin";
1707    case ac_atomic_fmax:
1708       return "fmax";
1709    }
1710    unreachable("bad atomic op");
1711 }
1712 
ac_build_image_opcode(struct ac_llvm_context * ctx,struct ac_image_args * a)1713 LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a)
1714 {
1715    const char *overload[3] = {"", "", ""};
1716    unsigned num_overloads = 0;
1717    LLVMValueRef args[18];
1718    unsigned num_args = 0;
1719    enum ac_image_dim dim = a->dim;
1720 
1721    assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || !a->level_zero);
1722    assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
1723            a->opcode != ac_image_store_mip) ||
1724           a->lod);
1725    assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
1726           (!a->compare && !a->offset));
1727    assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
1728            a->opcode == ac_image_get_lod) ||
1729           !a->bias);
1730    assert((a->bias ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <=
1731           1);
1732    assert((a->min_lod ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) <= 1);
1733    assert(!a->d16 || (ctx->gfx_level >= GFX8 && a->opcode != ac_image_atomic &&
1734                       a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod &&
1735                       a->opcode != ac_image_get_resinfo));
1736    assert(!a->a16 || ctx->gfx_level >= GFX9);
1737    assert(!a->derivs[0] || a->g16 == a->a16 || ctx->gfx_level >= GFX10);
1738 
1739    assert(!a->offset ||
1740           ac_get_elem_bits(ctx, LLVMTypeOf(a->offset)) == 32);
1741    assert(!a->bias ||
1742           ac_get_elem_bits(ctx, LLVMTypeOf(a->bias)) == 32);
1743    assert(!a->compare ||
1744           ac_get_elem_bits(ctx, LLVMTypeOf(a->compare)) == 32);
1745    assert(!a->derivs[0] ||
1746           ((!a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 16) &&
1747            (a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 32)));
1748    assert(!a->coords[0] ||
1749           ((!a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 16) &&
1750            (a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 32)));
1751    assert(!a->lod ||
1752           ((a->opcode != ac_image_get_resinfo || ac_get_elem_bits(ctx, LLVMTypeOf(a->lod))) &&
1753            (a->opcode == ac_image_get_resinfo ||
1754             ac_get_elem_bits(ctx, LLVMTypeOf(a->lod)) ==
1755             ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])))));
1756    assert(!a->min_lod ||
1757           ac_get_elem_bits(ctx, LLVMTypeOf(a->min_lod)) ==
1758           ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])));
1759 
1760    if (a->opcode == ac_image_get_lod) {
1761       switch (dim) {
1762       case ac_image_1darray:
1763          dim = ac_image_1d;
1764          break;
1765       case ac_image_2darray:
1766       case ac_image_cube:
1767          dim = ac_image_2d;
1768          break;
1769       default:
1770          break;
1771       }
1772    }
1773 
1774    bool sample = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
1775                  a->opcode == ac_image_get_lod;
1776    bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap;
1777    bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
1778                a->opcode == ac_image_load || a->opcode == ac_image_load_mip;
1779    LLVMTypeRef coord_type = sample ? (a->a16 ? ctx->f16 : ctx->f32) : (a->a16 ? ctx->i16 : ctx->i32);
1780    uint8_t dmask = a->dmask;
1781    LLVMTypeRef data_type;
1782    char data_type_str[32];
1783 
1784    if (atomic) {
1785       data_type = LLVMTypeOf(a->data[0]);
1786    } else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
1787       /* Image stores might have been shrunk using the format. */
1788       data_type = LLVMTypeOf(a->data[0]);
1789       dmask = (1 << ac_get_llvm_num_components(a->data[0])) - 1;
1790    } else {
1791       data_type = a->d16 ? ctx->v4f16 : ctx->v4f32;
1792    }
1793 
1794    if (a->tfe) {
1795       data_type = LLVMStructTypeInContext(
1796          ctx->context, (LLVMTypeRef[]){data_type, ctx->i32}, 2, false);
1797    }
1798 
1799    if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
1800       args[num_args++] = a->data[0];
1801       if (a->opcode == ac_image_atomic_cmpswap)
1802          args[num_args++] = a->data[1];
1803    }
1804 
1805    if (!atomic)
1806       args[num_args++] = LLVMConstInt(ctx->i32, dmask, false);
1807 
1808    if (a->offset)
1809       args[num_args++] = ac_to_integer(ctx, a->offset);
1810    if (a->bias) {
1811       args[num_args++] = ac_to_float(ctx, a->bias);
1812       overload[num_overloads++] = ".f32";
1813    }
1814    if (a->compare)
1815       args[num_args++] = ac_to_float(ctx, a->compare);
1816    if (a->derivs[0]) {
1817       unsigned count = ac_num_derivs(dim);
1818       for (unsigned i = 0; i < count; ++i)
1819          args[num_args++] = ac_to_float(ctx, a->derivs[i]);
1820       overload[num_overloads++] = a->g16 ? ".f16" : ".f32";
1821    }
1822    unsigned num_coords = a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
1823    for (unsigned i = 0; i < num_coords; ++i)
1824       args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
1825    if (a->lod)
1826       args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
1827    if (a->min_lod)
1828       args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, "");
1829 
1830    overload[num_overloads++] = sample ? (a->a16 ? ".f16" : ".f32") : (a->a16 ? ".i16" : ".i32");
1831 
1832    args[num_args++] = a->resource;
1833    if (sample) {
1834       args[num_args++] = a->sampler;
1835       args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
1836    }
1837 
1838    args[num_args++] = a->tfe ? ctx->i32_1 : ctx->i32_0; /* texfailctrl */
1839    args[num_args++] = LLVMConstInt(
1840       ctx->i32, get_cache_flags(ctx,
1841                                 a->access |
1842                                 (atomic ? ACCESS_TYPE_ATOMIC :
1843                                  load ? ACCESS_TYPE_LOAD : ACCESS_TYPE_STORE)),
1844       false);
1845 
1846    const char *name;
1847    const char *atomic_subop = "";
1848    switch (a->opcode) {
1849    case ac_image_sample:
1850       name = "sample";
1851       break;
1852    case ac_image_gather4:
1853       name = "gather4";
1854       break;
1855    case ac_image_load:
1856       name = "load";
1857       break;
1858    case ac_image_load_mip:
1859       name = "load.mip";
1860       break;
1861    case ac_image_store:
1862       name = "store";
1863       break;
1864    case ac_image_store_mip:
1865       name = "store.mip";
1866       break;
1867    case ac_image_atomic:
1868       name = "atomic.";
1869       atomic_subop = get_atomic_name(a->atomic);
1870       break;
1871    case ac_image_atomic_cmpswap:
1872       name = "atomic.";
1873       atomic_subop = "cmpswap";
1874       break;
1875    case ac_image_get_lod:
1876       name = "getlod";
1877       break;
1878    case ac_image_get_resinfo:
1879       name = "getresinfo";
1880       break;
1881    default:
1882       unreachable("invalid image opcode");
1883    }
1884 
1885    const char *dimname;
1886    switch (dim) {
1887    case ac_image_1d:
1888       dimname = "1d";
1889       break;
1890    case ac_image_2d:
1891       dimname = "2d";
1892       break;
1893    case ac_image_3d:
1894       dimname = "3d";
1895       break;
1896    case ac_image_cube:
1897       dimname = "cube";
1898       break;
1899    case ac_image_1darray:
1900       dimname = "1darray";
1901       break;
1902    case ac_image_2darray:
1903       dimname = "2darray";
1904       break;
1905    case ac_image_2dmsaa:
1906       dimname = "2dmsaa";
1907       break;
1908    case ac_image_2darraymsaa:
1909       dimname = "2darraymsaa";
1910       break;
1911    default:
1912       unreachable("invalid dim");
1913    }
1914 
1915    ac_build_type_name_for_intr(data_type, data_type_str, sizeof(data_type_str));
1916 
1917    bool lod_suffix = a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
1918    char intr_name[96];
1919    snprintf(intr_name, sizeof(intr_name),
1920             "llvm.amdgcn.image.%s%s" /* base name */
1921             "%s%s%s%s"               /* sample/gather modifiers */
1922             ".%s.%s%s%s%s",          /* dimension and type overloads */
1923             name, atomic_subop, a->compare ? ".c" : "",
1924             a->bias ? ".b" : lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "",
1925             a->min_lod ? ".cl" : "", a->offset ? ".o" : "", dimname,
1926             data_type_str, overload[0], overload[1], overload[2]);
1927 
1928    LLVMTypeRef retty;
1929    if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
1930       retty = ctx->voidt;
1931    else
1932       retty = data_type;
1933 
1934    LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes);
1935    if (a->tfe) {
1936       LLVMValueRef texel = LLVMBuildExtractValue(ctx->builder, result, 0, "");
1937       LLVMValueRef code = LLVMBuildExtractValue(ctx->builder, result, 1, "");
1938       result = ac_build_concat(ctx, texel, ac_to_float(ctx, code));
1939    }
1940 
1941    if (!sample && !atomic && retty != ctx->voidt)
1942       result = ac_to_integer(ctx, result);
1943 
1944    return result;
1945 }
1946 
/* Compute the sample count from the image descriptor \p rsrc.
 *
 * The hardware has no instruction for this, so the value is read from the
 * descriptor directly: element 3 is shifted right by 16, masked with 0xf,
 * and the result is used as the shift amount for "1 << n".
 */
LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMValueRef rsrc)
{
   LLVMBuilderRef b = ctx->builder;
   LLVMValueRef dword3 = LLVMBuildExtractElement(b, rsrc, LLVMConstInt(ctx->i32, 3, 0), "");
   LLVMValueRef shifted = LLVMBuildLShr(b, dword3, LLVMConstInt(ctx->i32, 16, 0), "");
   LLVMValueRef log2_samples = LLVMBuildAnd(b, shifted, LLVMConstInt(ctx->i32, 0xf, 0), "");

   return LLVMBuildShl(b, ctx->i32_1, log2_samples, "");
}
1960 
/* Emit llvm.amdgcn.cvt.pkrtz on the two operands in \p args; returns v2f16. */
LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef packed =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16, args, 2, 0);

   return packed;
}
1965 
/* Emit llvm.amdgcn.cvt.pknorm.i16 on the two operands in \p args and return
 * the v2i16 result bitcast to i32.
 */
LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef packed =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", ctx->v2i16, args, 2, 0);

   return LLVMBuildBitCast(ctx->builder, packed, ctx->i32, "");
}
1971 
/* Emit llvm.amdgcn.cvt.pknorm.u16 on the two operands in \p args and return
 * the v2i16 result bitcast to i32.
 */
LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef packed =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", ctx->v2i16, args, 2, 0);

   return LLVMBuildBitCast(ctx->builder, packed, ctx->i32, "");
}
1977 
/* Pack two f16 operands with the signed-normalized 16-bit conversion
 * instruction, emitted as inline assembly of type i32(f16, f16).
 *
 * The mnemonic differs on GFX11+ ("v_cvt_pk_norm_i16_f16") from earlier
 * generations ("v_cvt_pknorm_i16_f16").
 */
LLVMValueRef ac_build_cvt_pknorm_i16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   const char *asm_text = ctx->gfx_level >= GFX11 ? "v_cvt_pk_norm_i16_f16 $0, $1, $2"
                                                  : "v_cvt_pknorm_i16_f16 $0, $1, $2";
   LLVMTypeRef operand_types[2] = {ctx->f16, ctx->f16};
   LLVMTypeRef fn_type = LLVMFunctionType(ctx->i32, operand_types, 2, false);
   LLVMValueRef inline_asm = LLVMConstInlineAsm(fn_type, asm_text, "=v,v,v", false, false);

   return LLVMBuildCall2(ctx->builder, fn_type, inline_asm, args, 2, "");
}
1990 
/* Pack two f16 operands with the unsigned-normalized 16-bit conversion
 * instruction, emitted as inline assembly of type i32(f16, f16).
 *
 * The mnemonic differs on GFX11+ ("v_cvt_pk_norm_u16_f16") from earlier
 * generations ("v_cvt_pknorm_u16_f16").
 */
LLVMValueRef ac_build_cvt_pknorm_u16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   const char *asm_text = ctx->gfx_level >= GFX11 ? "v_cvt_pk_norm_u16_f16 $0, $1, $2"
                                                  : "v_cvt_pknorm_u16_f16 $0, $1, $2";
   LLVMTypeRef operand_types[2] = {ctx->f16, ctx->f16};
   LLVMTypeRef fn_type = LLVMFunctionType(ctx->i32, operand_types, 2, false);
   LLVMValueRef inline_asm = LLVMConstInlineAsm(fn_type, asm_text, "=v,v,v", false, false);

   return LLVMBuildCall2(ctx->builder, fn_type, inline_asm, args, 2, "");
}
2003 
2004 /* The 8-bit and 10-bit clamping is for HW workarounds. */
ac_build_cvt_pk_i16(struct ac_llvm_context * ctx,LLVMValueRef args[2],unsigned bits,bool hi)2005 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2006                                  bool hi)
2007 {
2008    assert(bits == 8 || bits == 10 || bits == 16);
2009 
2010    LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
2011    LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
2012    LLVMValueRef max_alpha = bits != 10 ? max_rgb : ctx->i32_1;
2013    LLVMValueRef min_alpha = bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2014 
2015    /* Clamp. */
2016    if (bits != 16) {
2017       for (int i = 0; i < 2; i++) {
2018          bool alpha = hi && i == 1;
2019          args[i] = ac_build_imin(ctx, args[i], alpha ? max_alpha : max_rgb);
2020          args[i] = ac_build_imax(ctx, args[i], alpha ? min_alpha : min_rgb);
2021       }
2022    }
2023 
2024    LLVMValueRef res =
2025       ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", ctx->v2i16, args, 2, 0);
2026    return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2027 }
2028 
2029 /* The 8-bit and 10-bit clamping is for HW workarounds. */
ac_build_cvt_pk_u16(struct ac_llvm_context * ctx,LLVMValueRef args[2],unsigned bits,bool hi)2030 LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2031                                  bool hi)
2032 {
2033    assert(bits == 8 || bits == 10 || bits == 16);
2034 
2035    LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
2036    LLVMValueRef max_alpha = bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2037 
2038    /* Clamp. */
2039    if (bits != 16) {
2040       for (int i = 0; i < 2; i++) {
2041          bool alpha = hi && i == 1;
2042          args[i] = ac_build_umin(ctx, args[i], alpha ? max_alpha : max_rgb);
2043       }
2044    }
2045 
2046    LLVMValueRef res =
2047       ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", ctx->v2i16, args, 2, 0);
2048    return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2049 }
2050 
/* Emit llvm.amdgcn.wqm.vote on \p i1 and return its i1 result. */
LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   LLVMValueRef vote = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, &i1, 1, 0);

   return vote;
}
2055 
/* Emit llvm.amdgcn.kill with \p i1 as its only operand. */
void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   LLVMValueRef operands[1] = {i1};

   ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, operands, 1, 0);
}
2060 
/* Emit the 32-bit bitfield-extract intrinsic on (input, offset, width):
 * llvm.amdgcn.sbfe.i32 when \p is_signed is set, llvm.amdgcn.ubfe.i32
 * otherwise.
 */
LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset,
                          LLVMValueRef width, bool is_signed)
{
   const char *intr_name = is_signed ? "llvm.amdgcn.sbfe.i32" : "llvm.amdgcn.ubfe.i32";
   LLVMValueRef operands[3] = {input, offset, width};

   return ac_build_intrinsic(ctx, intr_name, ctx->i32, operands, 3, 0);
}
2073 
/* Integer multiply-add: returns s0 * s1 + s2 as a mul followed by an add. */
LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   LLVMValueRef product = LLVMBuildMul(ctx->builder, s0, s1, "");

   return LLVMBuildAdd(ctx->builder, product, s2, "");
}
2079 
/* Floating-point multiply-add: returns s0 * s1 + s2.
 *
 * On GFX10+ this emits llvm.fma. The overload suffix and the return type are
 * taken from the type of \p s0, so the fused path accepts the same operand
 * types as the mul+add path below; for f32 operands this is still
 * llvm.fma.f32. On older generations a separate fmul and fadd are emitted.
 */
LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
   if (ctx->gfx_level >= GFX10) {
      LLVMTypeRef operand_type = LLVMTypeOf(s0);
      LLVMValueRef operands[3] = {s0, s1, s2};
      char type_suffix[64];
      char intr_name[64];

      ac_build_type_name_for_intr(operand_type, type_suffix, sizeof(type_suffix));
      snprintf(intr_name, sizeof(intr_name), "llvm.fma.%s", type_suffix);

      return ac_build_intrinsic(ctx, intr_name, operand_type, operands, 3, 0);
   }

   LLVMValueRef product = LLVMBuildFMul(ctx->builder, s0, s1, "");
   return LLVMBuildFAdd(ctx->builder, product, s2, "");
}
2089 
ac_build_waitcnt(struct ac_llvm_context * ctx,unsigned wait_flags)2090 void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
2091 {
2092    if (!wait_flags)
2093       return;
2094 
2095    unsigned expcnt = 7;
2096    unsigned lgkmcnt = 63;
2097    unsigned vmcnt = ctx->gfx_level >= GFX9 ? 63 : 15;
2098    unsigned vscnt = 63;
2099 
2100    if (wait_flags & AC_WAIT_EXP)
2101       expcnt = 0;
2102    if (wait_flags & AC_WAIT_LGKM)
2103       lgkmcnt = 0;
2104    if (wait_flags & AC_WAIT_VLOAD)
2105       vmcnt = 0;
2106 
2107    if (wait_flags & AC_WAIT_VSTORE) {
2108       if (ctx->gfx_level >= GFX10)
2109          vscnt = 0;
2110       else
2111          vmcnt = 0;
2112    }
2113 
2114    /* There is no intrinsic for vscnt(0), so use a fence. */
2115    if ((wait_flags & AC_WAIT_LGKM && wait_flags & AC_WAIT_VLOAD && wait_flags & AC_WAIT_VSTORE) ||
2116        vscnt == 0) {
2117       assert(!(wait_flags & AC_WAIT_EXP));
2118       LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
2119       return;
2120    }
2121 
2122    unsigned simm16;
2123 
2124    if (ctx->gfx_level >= GFX11)
2125       simm16 = expcnt | (lgkmcnt << 4) | (vmcnt << 10);
2126    else
2127       simm16 = (lgkmcnt << 8) | (expcnt << 4) | (vmcnt & 0xf) | ((vmcnt >> 4) << 14);
2128 
2129    LLVMValueRef args[1] = {
2130       LLVMConstInt(ctx->i32, simm16, false),
2131    };
2132    ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
2133 }
2134 
/* Build a floating-point saturate of src.
 *
 * type is the float type of src; it is used to create the 0.0 and 1.0
 * constants and to choose the lowering:
 *  - 64-bit, 16-bit on GFX8 and older, and v2f16 use fmin(fmax(src, 0), 1);
 *  - the remaining 16-bit case and 32-bit use llvm.amdgcn.fmed3(0, 1, src).
 * On chips older than GFX9 a 32-bit result is additionally canonicalized.
 */
LLVMValueRef ac_build_fsat(struct ac_llvm_context *ctx, LLVMValueRef src,
                           LLVMTypeRef type)
{
   unsigned bitsize = ac_get_elem_bits(ctx, type);
   LLVMValueRef zero = LLVMConstReal(type, 0.0);
   LLVMValueRef one = LLVMConstReal(type, 1.0);
   LLVMValueRef result;

   if (bitsize == 64 || (bitsize == 16 && ctx->gfx_level <= GFX8) || type == ctx->v2f16) {
      /* Use fmin/fmax for 64-bit fsat or 16-bit on GFX6-GFX8 because LLVM
       * doesn't expose an intrinsic.
       */
      result = ac_build_fmin(ctx, ac_build_fmax(ctx, src, zero), one);
   } else {
      /* Deliberately not called "type": that would shadow the parameter. */
      LLVMTypeRef med3_type;
      const char *intr;

      if (bitsize == 16) {
         intr = "llvm.amdgcn.fmed3.f16";
         med3_type = ctx->f16;
      } else {
         assert(bitsize == 32);
         intr = "llvm.amdgcn.fmed3.f32";
         med3_type = ctx->f32;
      }

      LLVMValueRef params[] = {
         zero,
         one,
         src,
      };

      result = ac_build_intrinsic(ctx, intr, med3_type, params, 3, 0);
   }

   if (ctx->gfx_level < GFX9 && bitsize == 32) {
      /* Only pre-GFX9 chips do not flush denorms. */
      result = ac_build_canonicalize(ctx, result, bitsize);
   }

   return result;
}
2177 
/* Build llvm.amdgcn.fract for src0.
 *
 * bitsize picks the f16 (16) or f32 (32) variant; any other value selects the
 * f64 variant.
 */
LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   const char *intr_name;
   LLVMTypeRef result_type;

   switch (bitsize) {
   case 16:
      intr_name = "llvm.amdgcn.fract.f16";
      result_type = ctx->f16;
      break;
   case 32:
      intr_name = "llvm.amdgcn.fract.f32";
      result_type = ctx->f32;
      break;
   default:
      intr_name = "llvm.amdgcn.fract.f64";
      result_type = ctx->f64;
      break;
   }

   LLVMValueRef args[1] = {src0};
   return ac_build_intrinsic(ctx, intr_name, result_type, args, 1, 0);
}
2199 
ac_const_uint_vec(struct ac_llvm_context * ctx,LLVMTypeRef type,uint64_t value)2200 LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value)
2201 {
2202 
2203    if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
2204       LLVMValueRef scalar = LLVMConstInt(LLVMGetElementType(type), value, 0);
2205       unsigned vec_size = LLVMGetVectorSize(type);
2206       LLVMValueRef *scalars = alloca(vec_size * sizeof(LLVMValueRef));
2207 
2208       for (unsigned i = 0; i < vec_size; i++)
2209          scalars[i] = scalar;
2210       return LLVMConstVector(scalars, vec_size);
2211    }
2212    return LLVMConstInt(type, value, 0);
2213 }
2214 
/* Integer sign: clamps src0 to [-1, 1] as imin(imax(src0, -1), 1). */
LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   LLVMTypeRef int_type = LLVMTypeOf(src0);

   /* v_med3 is selected only when max is first. (LLVM bug?) */
   LLVMValueRef minus_one = ac_const_uint_vec(ctx, int_type, -1);
   LLVMValueRef lower_clamped = ac_build_imax(ctx, src0, minus_one);

   LLVMValueRef plus_one = ac_const_uint_vec(ctx, int_type, 1);
   return ac_build_imin(ctx, lower_clamped, plus_one);
}
2224 
/* Return val + 0, built between ac_enable_signed_zeros() and
 * ac_disable_signed_zeros(); the addition turns negative zero into positive zero.
 */
static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val)
{
   LLVMValueRef zero = LLVMConstNull(LLVMTypeOf(val));
   LLVMValueRef sum;

   ac_enable_signed_zeros(ctx);
   /* (val + 0) converts negative zero to positive zero. */
   sum = LLVMBuildFAdd(ctx->builder, val, zero, "");
   ac_disable_signed_zeros(ctx);

   return sum;
}
2233 
/* Floating-point sign of src, returned in the type of src.
 *
 * 16-bit and 32-bit inputs take the integer route:
 *   (src + 0) converts negative zero to positive zero, and after that
 *   int(fsign(x)) == isign(floatBitsToInt(x)), so the result is
 *   sitofp(isign(bitcast(src + 0))).
 *
 * Compared with the standard compare/select version, which compiles to
 *   v_cmp_ngt_f32, v_cndmask_b32, v_cmp_le_f32, v_cndmask_b32
 * the isign version compiles to
 *   v_add_f32, v_med3_i32, v_cvt_f32_i32
 *
 * For FP64, the standard version is used, which doesn't suffer from the huge
 * DP rate reduction (FP64 comparisons are as fast as int64 comparisons). The
 * result is assembled from two dwords: a zero low dword and a high dword of
 * 0x3FF00000 (src > 0), 0xBFF00000 (src < 0) or 0.
 */
LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   const unsigned bitsize = ac_get_elem_bits(ctx, type);

   if (bitsize == 16 || bitsize == 32) {
      LLVMValueRef no_neg_zero = ac_eliminate_negative_zero(ctx, src);
      LLVMValueRef as_int = ac_to_integer(ctx, no_neg_zero);
      LLVMValueRef int_sign = ac_build_isign(ctx, as_int);

      return LLVMBuildSIToFP(ctx->builder, int_sign, type, "");
   }

   assert(bitsize == 64);

   LLVMValueRef is_pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");
   LLVMValueRef is_neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");

   LLVMValueRef hi_if_not_pos = LLVMBuildSelect(
      ctx->builder, is_neg, LLVMConstInt(ctx->i32, 0xBFF00000, 0), ctx->i32_0, "");
   LLVMValueRef hi = LLVMBuildSelect(
      ctx->builder, is_pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0), hi_if_not_pos, "");

   LLVMValueRef dwords[2] = {ctx->i32_0, hi};
   LLVMValueRef packed = ac_build_gather_values(ctx, dwords, 2);

   return LLVMBuildBitCast(ctx->builder, packed, ctx->f64, "");
}
2273 
/* Population count of src0, returned as i32.
 *
 * The matching llvm.ctpop.iN intrinsic is used for 8/16/32/64/128-bit
 * elements; wider results are truncated and narrower ones zero-extended.
 */
LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   const unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intr_name;
   LLVMTypeRef int_type;

   switch (bitsize) {
   case 128:
      intr_name = "llvm.ctpop.i128";
      int_type = ctx->i128;
      break;
   case 64:
      intr_name = "llvm.ctpop.i64";
      int_type = ctx->i64;
      break;
   case 32:
      intr_name = "llvm.ctpop.i32";
      int_type = ctx->i32;
      break;
   case 16:
      intr_name = "llvm.ctpop.i16";
      int_type = ctx->i16;
      break;
   case 8:
      intr_name = "llvm.ctpop.i8";
      int_type = ctx->i8;
      break;
   default:
      unreachable("invalid bitsize");
   }

   LLVMValueRef args[1] = {src0};
   LLVMValueRef count = ac_build_intrinsic(ctx, intr_name, int_type, args, 1, 0);

   if (bitsize > 32)
      return LLVMBuildTrunc(ctx->builder, count, ctx->i32, "");
   if (bitsize < 32)
      return LLVMBuildZExt(ctx->builder, count, ctx->i32, "");
   return count;
}
2308 
/* Bit-reverse src0 with llvm.bitreverse.iN and return the result as i32.
 *
 * 8/16/32/64-bit elements are supported: the 64-bit result is truncated to
 * i32 and the 8/16-bit results are zero-extended to i32.
 */
LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   const unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intr_name;
   LLVMTypeRef int_type;

   switch (bitsize) {
   case 64:
      intr_name = "llvm.bitreverse.i64";
      int_type = ctx->i64;
      break;
   case 32:
      intr_name = "llvm.bitreverse.i32";
      int_type = ctx->i32;
      break;
   case 16:
      intr_name = "llvm.bitreverse.i16";
      int_type = ctx->i16;
      break;
   case 8:
      intr_name = "llvm.bitreverse.i8";
      int_type = ctx->i8;
      break;
   default:
      unreachable("invalid bitsize");
   }

   LLVMValueRef args[1] = {src0};
   LLVMValueRef reversed = ac_build_intrinsic(ctx, intr_name, int_type, args, 1, 0);

   if (bitsize > 32)
      return LLVMBuildTrunc(ctx->builder, reversed, ctx->i32, "");
   if (bitsize < 32)
      return LLVMBuildZExt(ctx->builder, reversed, ctx->i32, "");
   return reversed;
}
2339 
/* Build the llvm.amdgcn.sudot4 intrinsic, returning an i32.
 *
 * Bit 0 of neg_lo becomes the i1 flag passed before s0 and bit 1 the i1 flag
 * passed before s1; s2 and the clamp flag follow.
 */
LLVMValueRef ac_build_sudot_4x8(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                                LLVMValueRef s2, bool clamp, unsigned neg_lo)
{
   const bool flag_s0 = (neg_lo & 0x1) != 0;
   const bool flag_s1 = (neg_lo & 0x2) != 0;

   LLVMValueRef operands[6] = {
      LLVMConstInt(ctx->i1, flag_s0, false),
      s0,
      LLVMConstInt(ctx->i1, flag_s1, false),
      s1,
      s2,
      LLVMConstInt(ctx->i1, clamp, false),
   };

   return ac_build_intrinsic(ctx, "llvm.amdgcn.sudot4", ctx->i32, operands, 6, 0);
}
2355 
ac_init_exec_full_mask(struct ac_llvm_context * ctx)2356 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
2357 {
2358    LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
2359    ac_build_intrinsic(ctx, "llvm.amdgcn.init.exec", ctx->voidt, &full_mask, 1, 0);
2360 }
2361 
ac_declare_lds_as_pointer(struct ac_llvm_context * ctx)2362 void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
2363 {
2364    unsigned lds_size = ctx->gfx_level >= GFX7 ? 65536 : 32768;
2365    LLVMTypeRef type = LLVMArrayType(ctx->i32, lds_size / 4);
2366    ctx->lds = (struct ac_llvm_pointer) {
2367       .value = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
2368                   LLVMPointerType(type, AC_ADDR_SPACE_LDS), "lds"),
2369       .pointee_type = type
2370    };
2371 }
2372 
/* Load one i32 from ctx->lds at the element index dw_addr. */
LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, LLVMValueRef dw_addr)
{
   LLVMValueRef dword_ptr = ac_build_gep0(ctx, ctx->lds, dw_addr);

   return LLVMBuildLoad2(ctx->builder, ctx->i32, dword_ptr, "");
}
2378 
/* Store value, converted with ac_to_integer(), into ctx->lds at index dw_addr. */
void ac_lds_store(struct ac_llvm_context *ctx, LLVMValueRef dw_addr, LLVMValueRef value)
{
   LLVMValueRef int_value = ac_to_integer(ctx, value);

   ac_build_indexed_store(ctx, ctx->lds, dw_addr, int_value);
}
2384 
/* Find the index of the least significant set bit of src0, as an i32.
 *
 * llvm.cttz.iN is used for 8/16/32/64-bit sources; the 64-bit result is
 * truncated and the 8/16-bit results are sign-extended to i32. A source of
 * zero yields -1. dst_type is not used by this function.
 */
LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0)
{
   const unsigned src_bits = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *cttz_name;
   LLVMTypeRef int_type;
   LLVMValueRef zero;

   if (src_bits == 64) {
      cttz_name = "llvm.cttz.i64";
      int_type = ctx->i64;
      zero = ctx->i64_0;
   } else if (src_bits == 32) {
      cttz_name = "llvm.cttz.i32";
      int_type = ctx->i32;
      zero = ctx->i32_0;
   } else if (src_bits == 16) {
      cttz_name = "llvm.cttz.i16";
      int_type = ctx->i16;
      zero = ctx->i16_0;
   } else if (src_bits == 8) {
      cttz_name = "llvm.cttz.i8";
      int_type = ctx->i8;
      zero = ctx->i8_0;
   } else {
      unreachable("invalid bitsize");
   }

   /* The second operand of 1 means that ffs(x=0) = undef, so LLVM won't
    * add special code to check for x=0. The reason is that the LLVM
    * behavior for x=0 is different from what we need here. However, LLVM
    * also assumes that ffs(x) is in [0, 31], but GLSL expects that
    * ffs(0) = -1, so a conditional assignment to handle 0 is still
    * required.
    *
    * The hardware already implements the correct behavior.
    */
   LLVMValueRef cttz_args[2] = {src0, ctx->i1true};
   LLVMValueRef lsb = ac_build_intrinsic(ctx, cttz_name, int_type, cttz_args, 2, 0);

   if (src_bits == 64)
      lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
   else if (src_bits < 32)
      lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");

   /* TODO: We need an intrinsic to skip this conditional. */
   /* Check for zero: */
   LLVMValueRef src_is_zero = LLVMBuildICmp(ctx->builder, LLVMIntEQ, src0, zero, "");
   LLVMValueRef minus_one = LLVMConstInt(ctx->i32, -1, 0);

   return LLVMBuildSelect(ctx->builder, src_is_zero, minus_one, lsb, "");
}
2445 
ac_arg_type_to_pointee_type(struct ac_llvm_context * ctx,enum ac_arg_type type)2446 LLVMTypeRef ac_arg_type_to_pointee_type(struct ac_llvm_context *ctx, enum ac_arg_type type) {
2447    switch (type) {
2448    case AC_ARG_CONST_PTR:
2449       return ctx->i8;
2450       break;
2451    case AC_ARG_CONST_FLOAT_PTR:
2452       return ctx->f32;
2453       break;
2454    case AC_ARG_CONST_PTR_PTR:
2455       return ac_array_in_const32_addr_space(ctx->i8);
2456       break;
2457    case AC_ARG_CONST_DESC_PTR:
2458       return ctx->v4i32;
2459       break;
2460    case AC_ARG_CONST_IMAGE_PTR:
2461       return ctx->v8i32;
2462    default:
2463       /* Other ac_arg_type values aren't pointers. */
2464       assert(false);
2465       return NULL;
2466    }
2467 }
2468 
/* Return the pointer-to-elem_type type in the AC_ADDR_SPACE_CONST address space. */
LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
{
   LLVMTypeRef ptr_type = LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);

   return ptr_type;
}
2473 
/* Return the pointer-to-elem_type type in the AC_ADDR_SPACE_CONST_32BIT address space. */
LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
{
   LLVMTypeRef ptr_type = LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);

   return ptr_type;
}
2478 
get_current_flow(struct ac_llvm_context * ctx)2479 static struct ac_llvm_flow *get_current_flow(struct ac_llvm_context *ctx)
2480 {
2481    if (ctx->flow->depth > 0)
2482       return &ctx->flow->stack[ctx->flow->depth - 1];
2483    return NULL;
2484 }
2485 
get_innermost_loop(struct ac_llvm_context * ctx)2486 static struct ac_llvm_flow *get_innermost_loop(struct ac_llvm_context *ctx)
2487 {
2488    for (unsigned i = ctx->flow->depth; i > 0; --i) {
2489       if (ctx->flow->stack[i - 1].loop_entry_block)
2490          return &ctx->flow->stack[i - 1];
2491    }
2492    return NULL;
2493 }
2494 
push_flow(struct ac_llvm_context * ctx)2495 static struct ac_llvm_flow *push_flow(struct ac_llvm_context *ctx)
2496 {
2497    struct ac_llvm_flow *flow;
2498 
2499    if (ctx->flow->depth >= ctx->flow->depth_max) {
2500       unsigned new_max = MAX2(ctx->flow->depth << 1, AC_LLVM_INITIAL_CF_DEPTH);
2501 
2502       ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack));
2503       ctx->flow->depth_max = new_max;
2504    }
2505 
2506    flow = &ctx->flow->stack[ctx->flow->depth];
2507    ctx->flow->depth++;
2508 
2509    flow->next_block = NULL;
2510    flow->loop_entry_block = NULL;
2511    return flow;
2512 }
2513 
/* Name the basic block "<base><label_id>". The name is formatted into a
 * 32-byte buffer, so longer names are truncated by snprintf.
 */
static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, int label_id)
{
   char name[32];
   LLVMValueRef bb_as_value = LLVMBasicBlockAsValue(bb);

   snprintf(name, sizeof(name), "%s%d", base, label_id);
   LLVMSetValueName(bb_as_value, name);
}
2520 
2521 /* Append a basic block at the level of the parent flow.
2522  */
append_basic_block(struct ac_llvm_context * ctx,const char * name)2523 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, const char *name)
2524 {
2525    assert(ctx->flow->depth >= 1);
2526 
2527    if (ctx->flow->depth >= 2) {
2528       struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2];
2529 
2530       return LLVMInsertBasicBlockInContext(ctx->context, flow->next_block, name);
2531    }
2532 
2533    LLVMValueRef main_fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
2534    return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
2535 }
2536 
2537 /* Emit a branch to the given default target for the current block if
2538  * applicable -- that is, if the current block does not already contain a
2539  * branch from a break or continue.
2540  */
emit_default_branch(LLVMBuilderRef builder,LLVMBasicBlockRef target)2541 static void emit_default_branch(LLVMBuilderRef builder, LLVMBasicBlockRef target)
2542 {
2543    if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
2544       LLVMBuildBr(builder, target);
2545 }
2546 
ac_build_bgnloop(struct ac_llvm_context * ctx,int label_id)2547 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
2548 {
2549    struct ac_llvm_flow *flow = push_flow(ctx);
2550    flow->loop_entry_block = append_basic_block(ctx, "LOOP");
2551    flow->next_block = append_basic_block(ctx, "ENDLOOP");
2552    set_basicblock_name(flow->loop_entry_block, "loop", label_id);
2553    LLVMBuildBr(ctx->builder, flow->loop_entry_block);
2554    LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
2555 }
2556 
ac_build_break(struct ac_llvm_context * ctx)2557 void ac_build_break(struct ac_llvm_context *ctx)
2558 {
2559    struct ac_llvm_flow *flow = get_innermost_loop(ctx);
2560    LLVMBuildBr(ctx->builder, flow->next_block);
2561 }
2562 
ac_build_continue(struct ac_llvm_context * ctx)2563 void ac_build_continue(struct ac_llvm_context *ctx)
2564 {
2565    struct ac_llvm_flow *flow = get_innermost_loop(ctx);
2566    LLVMBuildBr(ctx->builder, flow->loop_entry_block);
2567 }
2568 
ac_build_else(struct ac_llvm_context * ctx,int label_id)2569 void ac_build_else(struct ac_llvm_context *ctx, int label_id)
2570 {
2571    struct ac_llvm_flow *current_branch = get_current_flow(ctx);
2572    LLVMBasicBlockRef endif_block;
2573 
2574    assert(!current_branch->loop_entry_block);
2575 
2576    endif_block = append_basic_block(ctx, "ENDIF");
2577    emit_default_branch(ctx->builder, endif_block);
2578 
2579    LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
2580    set_basicblock_name(current_branch->next_block, "else", label_id);
2581 
2582    current_branch->next_block = endif_block;
2583 }
2584 
ac_build_endif(struct ac_llvm_context * ctx,int label_id)2585 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
2586 {
2587    struct ac_llvm_flow *current_branch = get_current_flow(ctx);
2588 
2589    assert(!current_branch->loop_entry_block);
2590 
2591    emit_default_branch(ctx->builder, current_branch->next_block);
2592    LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
2593    set_basicblock_name(current_branch->next_block, "endif", label_id);
2594 
2595    ctx->flow->depth--;
2596 }
2597 
ac_build_endloop(struct ac_llvm_context * ctx,int label_id)2598 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
2599 {
2600    struct ac_llvm_flow *current_loop = get_current_flow(ctx);
2601 
2602    assert(current_loop->loop_entry_block);
2603 
2604    emit_default_branch(ctx->builder, current_loop->loop_entry_block);
2605 
2606    LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
2607    set_basicblock_name(current_loop->next_block, "endloop", label_id);
2608    ctx->flow->depth--;
2609 }
2610 
/* Open an if: push a flow entry, create the "IF" block (renamed to
 * "if<label_id>") and the "ELSE" block, emit a conditional branch on cond to
 * them, and continue building in the if block. The "ELSE" block is recorded
 * as the entry's next_block.
 */
void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
{
   struct ac_llvm_flow *branch = push_flow(ctx);
   LLVMBasicBlockRef then_bb = append_basic_block(ctx, "IF");
   LLVMBasicBlockRef else_bb = append_basic_block(ctx, "ELSE");

   branch->next_block = else_bb;
   set_basicblock_name(then_bb, "if", label_id);

   LLVMBuildCondBr(ctx->builder, cond, then_bb, else_bb);
   LLVMPositionBuilderAtEnd(ctx->builder, then_bb);
}
2622 
ac_build_alloca_undef(struct ac_llvm_context * ac,LLVMTypeRef type,const char * name)2623 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
2624 {
2625    LLVMBuilderRef builder = ac->builder;
2626    LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
2627    LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
2628    LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
2629    LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
2630    LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
2631    LLVMValueRef res;
2632 
2633    if (first_instr) {
2634       LLVMPositionBuilderBefore(first_builder, first_instr);
2635    } else {
2636       LLVMPositionBuilderAtEnd(first_builder, first_block);
2637    }
2638 
2639    res = LLVMBuildAlloca(first_builder, type, name);
2640    LLVMDisposeBuilder(first_builder);
2641    return res;
2642 }
2643 
ac_build_alloca(struct ac_llvm_context * ac,LLVMTypeRef type,const char * name)2644 LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
2645 {
2646    LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
2647    LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
2648    return ptr;
2649 }
2650 
/* Create an entry-block alloca with the type of val and store val to it at
 * the current insert position. Returns the alloca pointer.
 */
LLVMValueRef ac_build_alloca_init(struct ac_llvm_context *ac, LLVMValueRef val, const char *name)
{
   LLVMTypeRef val_type = LLVMTypeOf(val);
   LLVMValueRef slot = ac_build_alloca_undef(ac, val_type, name);

   LLVMBuildStore(ac->builder, val, slot);
   return slot;
}
2657 
/* Bitcast ptr to a pointer to the given type, keeping ptr's address space. */
LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMTypeRef type)
{
   LLVMTypeRef old_ptr_type = LLVMTypeOf(ptr);
   int addr_space = LLVMGetPointerAddressSpace(old_ptr_type);
   LLVMTypeRef new_ptr_type = LLVMPointerType(type, addr_space);

   return LLVMBuildBitCast(ctx->builder, ptr, new_ptr_type, "");
}
2663 
/* Return the first count components of value.
 *
 * value is returned unchanged when it already has count components. A count
 * of 1 extracts element 0 as a scalar; otherwise a shuffle selecting lanes
 * 0..count-1 is built.
 */
LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count)
{
   if (ac_get_llvm_num_components(value) == count)
      return value;

   if (count == 1)
      return LLVMBuildExtractElement(ctx->builder, value, ctx->i32_0, "");

   /* At least two slots, because lanes 0 and 1 are always filled in. */
   const unsigned num_slots = MAX2(count, 2);
   LLVMValueRef *const lanes = alloca(num_slots * sizeof(*lanes));

   lanes[0] = ctx->i32_0;
   lanes[1] = ctx->i32_1;
   for (unsigned lane = 2; lane < count; ++lane)
      lanes[lane] = LLVMConstInt(ctx->i32, lane, false);

   LLVMValueRef shuffle_mask = LLVMConstVector(lanes, count);
   return LLVMBuildShuffleVector(ctx->builder, value, value, shuffle_mask, "");
}
2682 
2683 /* If param is i64 and bitwidth <= 32, the return value will be i32. */
ac_unpack_param(struct ac_llvm_context * ctx,LLVMValueRef param,unsigned rshift,unsigned bitwidth)2684 LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift,
2685                              unsigned bitwidth)
2686 {
2687    LLVMValueRef value = param;
2688    if (rshift)
2689       value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), rshift, false), "");
2690 
2691    if (rshift + bitwidth < 32) {
2692       uint64_t mask = (1ull << bitwidth) - 1;
2693       value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), mask, false), "");
2694    }
2695 
2696    if (bitwidth <= 32 && LLVMTypeOf(param) == ctx->i64)
2697       value = LLVMBuildTrunc(ctx->builder, value, ctx->i32, "");
2698    return value;
2699 }
2700 
/* Read a value of at most 32 bits from another lane.
 *
 * src is zero-extended to i32, read with llvm.amdgcn.readlane (or with
 * llvm.amdgcn.readfirstlane when lane is NULL), and the result is truncated
 * back to the original type of src. An optimization barrier is applied to
 * src first when with_opt_barrier is set.
 */
static LLVMValueRef _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src,
                                       LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   const char *intr_name = "llvm.amdgcn.readfirstlane";
   unsigned num_args = 1;

   if (with_opt_barrier)
      ac_build_optimization_barrier(ctx, &src, false);

   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   if (lane) {
      lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, "");
      intr_name = "llvm.amdgcn.readlane";
      num_args = 2;
   }

   LLVMValueRef args[2] = {src, lane};
   LLVMValueRef lane_value = ac_build_intrinsic(ctx, intr_name, ctx->i32, args, num_args, 0);

   return LLVMBuildTrunc(ctx->builder, lane_value, orig_type, "");
}
2720 
/* Read src from another lane for any value ac_to_integer() can convert.
 *
 * Values wider than 32 bits (the width must be a multiple of 32) are split
 * into i32 components that are read one by one and reassembled. The result
 * is converted back to the original type of src, using inttoptr for pointers
 * and a bitcast otherwise.
 */
static LLVMValueRef ac_build_readlane_common(struct ac_llvm_context *ctx, LLVMValueRef src,
                                             LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef int_src = ac_to_integer(ctx, src);
   const unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(int_src));
   LLVMValueRef result;

   if (bits <= 32) {
      result = _ac_build_readlane(ctx, int_src, lane, with_opt_barrier);
   } else {
      assert(bits % 32 == 0);

      const unsigned num_dwords = bits / 32;
      LLVMTypeRef dwords_type = LLVMVectorType(ctx->i32, num_dwords);
      LLVMValueRef src_dwords = LLVMBuildBitCast(ctx->builder, int_src, dwords_type, "");

      result = LLVMGetUndef(dwords_type);
      for (unsigned dw = 0; dw < num_dwords; ++dw) {
         LLVMValueRef index = LLVMConstInt(ctx->i32, dw, 0);
         LLVMValueRef src_dw = LLVMBuildExtractElement(ctx->builder, src_dwords, index, "");
         LLVMValueRef read_dw = _ac_build_readlane(ctx, src_dw, lane, with_opt_barrier);

         result = LLVMBuildInsertElement(ctx->builder, result, read_dw, index, "");
      }
   }

   if (LLVMGetTypeKind(orig_type) == LLVMPointerTypeKind)
      return LLVMBuildIntToPtr(ctx->builder, result, orig_type, "");
   return LLVMBuildBitCast(ctx->builder, result, orig_type, "");
}
2752 
2753 /**
2754  * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
2755  *
2756  * The optimization barrier is not needed if the value is the same in all lanes
2757  * or if this is called in the outermost block.
2758  *
2759  * @param ctx
2760  * @param src
2761  * @param lane - id of the lane or NULL for the first active lane
2762  * @return value of the lane
2763  */
ac_build_readlane_no_opt_barrier(struct ac_llvm_context * ctx,LLVMValueRef src,LLVMValueRef lane)2764 LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, LLVMValueRef src,
2765                                               LLVMValueRef lane)
2766 {
2767    return ac_build_readlane_common(ctx, src, lane, false);
2768 }
2769 
/* Read src from the given lane (or from the first lane when lane is NULL),
 * applying an optimization barrier to the source first.
 */
LLVMValueRef ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
{
   const bool with_opt_barrier = true;

   return ac_build_readlane_common(ctx, src, lane, with_opt_barrier);
}
2774 
/* Build llvm.amdgcn.writelane, returning an i32. The intrinsic operands are
 * passed in the order (value, lane, src).
 */
LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value,
                                LLVMValueRef lane)
{
   LLVMValueRef operands[3] = {value, lane, src};

   return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32, operands, 3, 0);
}
2781 
/* Build llvm.amdgcn.mbcnt.lo (followed by llvm.amdgcn.mbcnt.hi unless the
 * wave size is 32) over mask, and add add_src to the result.
 *
 * With LLVM 16 and newer, add_src is passed as the addend operand of the
 * intrinsic. With older LLVM, zero is passed and a separate add is emitted.
 */
LLVMValueRef ac_build_mbcnt_add(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef add_src)
{
   LLVMValueRef addend = LLVM_VERSION_MAJOR >= 16 ? add_src : ctx->i32_0;
   LLVMValueRef count;

   if (ctx->wave_size != 32) {
      LLVMValueRef halves = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, "");
      LLVMValueRef lo = LLVMBuildExtractElement(ctx->builder, halves, ctx->i32_0, "");
      LLVMValueRef hi = LLVMBuildExtractElement(ctx->builder, halves, ctx->i32_1, "");

      LLVMValueRef lo_args[2] = {lo, addend};
      count = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, lo_args, 2, 0);

      LLVMValueRef hi_args[2] = {hi, count};
      count = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, hi_args, 2, 0);
   } else {
      if (LLVMTypeOf(mask) == ctx->i64)
         mask = LLVMBuildTrunc(ctx->builder, mask, ctx->i32, "");

      LLVMValueRef lo_args[2] = {mask, addend};
      count = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, lo_args, 2, 0);
   }

   if (addend == ctx->i32_0)
      ac_set_range_metadata(ctx, count, 0, ctx->wave_size);

   if (LLVM_VERSION_MAJOR < 16) {
      /* Bug workaround. LLVM always believes the upper bound of mbcnt to be the wave size,
       * regardless of ac_set_range_metadata. Use an extra add instruction to work around it.
       */
      ac_set_range_metadata(ctx, count, 0, ctx->wave_size);
      count = LLVMBuildAdd(ctx->builder, count, add_src, "");
   }

   return count;
}
2816 
/* ac_build_mbcnt_add() with a zero addend. */
LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
{
   LLVMValueRef zero_addend = ctx->i32_0;

   return ac_build_mbcnt_add(ctx, mask, zero_addend);
}
2821 
/* DPP control encodings.
 *
 * _dpp_quad_perm and _dpp_row_sr are base values: dpp_quad_perm() and
 * dpp_row_sr() below OR their operand bits into them. The other
 * underscore-prefixed members presumably follow the same scheme.
 */
enum dpp_ctrl
{
   _dpp_quad_perm = 0x000,

   _dpp_row_sl = 0x100,
   _dpp_row_sr = 0x110,
   _dpp_row_rr = 0x120,

   dpp_wf_sl1 = 0x130,
   dpp_wf_rl1 = 0x134,
   dpp_wf_sr1 = 0x138,
   dpp_wf_rr1 = 0x13C,

   dpp_row_mirror = 0x140,
   dpp_row_half_mirror = 0x141,
   dpp_row_bcast15 = 0x142,
   dpp_row_bcast31 = 0x143
};

/* Encode a quad permutation: each laneN (0..3) occupies two bits, with lane0
 * in bits 0-1 up to lane3 in bits 6-7.
 */
static inline enum dpp_ctrl dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2,
                                          unsigned lane3)
{
   assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);

   const unsigned lanes[4] = {lane0, lane1, lane2, lane3};
   unsigned encoded = _dpp_quad_perm;

   for (unsigned q = 0; q < 4; ++q)
      encoded |= lanes[q] << (2 * q);

   return (enum dpp_ctrl)encoded;
}

/* Encode _dpp_row_sr combined with the given amount, which must be 1..15. */
static inline enum dpp_ctrl dpp_row_sr(unsigned amount)
{
   assert(amount >= 1 && amount <= 15);

   return (enum dpp_ctrl)(_dpp_row_sr | amount);
}
2850 
/* Build llvm.amdgcn.update.dpp.i32 for a value of at most 32 bits.
 *
 * old and src are zero-extended to i32 and passed together with dpp_ctrl,
 * row_mask, bank_mask and bound_ctrl as constants. The result is truncated
 * back to the original type of src.
 */
static LLVMValueRef _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                  enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                  bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef old32 = LLVMBuildZExt(ctx->builder, old, ctx->i32, "");
   LLVMValueRef src32 = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef operands[6] = {
      old32,
      src32,
      LLVMConstInt(ctx->i32, dpp_ctrl, 0),
      LLVMConstInt(ctx->i32, row_mask, 0),
      LLVMConstInt(ctx->i32, bank_mask, 0),
      LLVMConstInt(ctx->i1, bound_ctrl, 0),
   };
   LLVMValueRef moved =
      ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32, operands, 6, 0);

   return LLVMBuildTrunc(ctx->builder, moved, orig_type, "");
}
2870 
/* DPP wrapper that accepts any value ac_to_integer() can convert.
 *
 * Values of at most 32 bits go through a single _ac_build_dpp() call. Wider
 * values (a multiple of 32 bits) are viewed as a vector of i32 and processed
 * one dword at a time. The result is bitcast back to the original type of
 * \p src.
 */
static LLVMValueRef ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                 bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef src_int = ac_to_integer(ctx, src);
   LLVMValueRef old_int = ac_to_integer(ctx, old);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src_int));

   if (bits <= 32) {
      LLVMValueRef single =
         _ac_build_dpp(ctx, old_int, src_int, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      return LLVMBuildBitCast(ctx->builder, single, orig_type, "");
   }

   assert(bits % 32 == 0);

   const unsigned num_dwords = bits / 32;
   LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_dwords);
   LLVMValueRef src_vec = LLVMBuildBitCast(ctx->builder, src_int, vec_type, "");
   LLVMValueRef old_vec = LLVMBuildBitCast(ctx->builder, old_int, vec_type, "");
   LLVMValueRef packed = LLVMGetUndef(vec_type);

   for (unsigned dword = 0; dword < num_dwords; dword++) {
      LLVMValueRef src_part =
         LLVMBuildExtractElement(ctx->builder, src_vec, LLVMConstInt(ctx->i32, dword, 0), "");
      LLVMValueRef old_part =
         LLVMBuildExtractElement(ctx->builder, old_vec, LLVMConstInt(ctx->i32, dword, 0), "");
      LLVMValueRef out_part =
         _ac_build_dpp(ctx, old_part, src_part, dpp_ctrl, row_mask, bank_mask, bound_ctrl);

      packed = LLVMBuildInsertElement(ctx->builder, packed, out_part,
                                      LLVMConstInt(ctx->i32, dword, 0), "");
   }

   return LLVMBuildBitCast(ctx->builder, packed, orig_type, "");
}
2899 
/* Emit one llvm.amdgcn.permlane16 or llvm.amdgcn.permlanex16 call.
 *
 * \param src           Value of at most 32 bits; zero-extended to i32 for the
 *                      call and passed for both of the first two operands.
 * \param sel           64-bit selector; its low and high 32-bit halves become
 *                      the third and fourth operands.
 * \param exchange_rows Selects the permlanex16 variant when true.
 * \param bound_ctrl    Passed through as the last (i1) operand.
 * \return The intrinsic result truncated back to the original type of \p src.
 */
static LLVMValueRef _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         uint64_t sel, bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   LLVMValueRef result;

   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   /* Narrow each half explicitly so that the value handed to LLVMConstInt
    * always fits the i32 type instead of relying on implicit truncation of
    * the upper bits of sel.
    */
   const uint32_t sel_lo = (uint32_t)sel;
   const uint32_t sel_hi = (uint32_t)(sel >> 32);

   LLVMValueRef args[6] = {
      src,
      src,
      LLVMConstInt(ctx->i32, sel_lo, false),
      LLVMConstInt(ctx->i32, sel_hi, false),
      ctx->i1true, /* fi */
      bound_ctrl ? ctx->i1true : ctx->i1false,
   };

   result =
      ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16" : "llvm.amdgcn.permlane16",
                         ctx->i32, args, 6, 0);

   return LLVMBuildTrunc(ctx->builder, result, type, "");
}
2923 
/* permlane16/permlanex16 wrapper that accepts any value ac_to_integer() can
 * convert.
 *
 * Values of at most 32 bits use one _ac_build_permlane16() call; wider values
 * (a multiple of 32 bits) are processed as a vector of i32, one dword at a
 * time. The result is bitcast back to the original type of \p src.
 */
static LLVMValueRef ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
                                        bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef src_int = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src_int));

   if (bits <= 32) {
      LLVMValueRef single = _ac_build_permlane16(ctx, src_int, sel, exchange_rows, bound_ctrl);
      return LLVMBuildBitCast(ctx->builder, single, orig_type, "");
   }

   assert(bits % 32 == 0);

   const unsigned num_dwords = bits / 32;
   LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_dwords);
   LLVMValueRef src_vec = LLVMBuildBitCast(ctx->builder, src_int, vec_type, "");
   LLVMValueRef packed = LLVMGetUndef(vec_type);

   for (unsigned dword = 0; dword < num_dwords; dword++) {
      LLVMValueRef src_part =
         LLVMBuildExtractElement(ctx->builder, src_vec, LLVMConstInt(ctx->i32, dword, 0), "");
      LLVMValueRef out_part =
         _ac_build_permlane16(ctx, src_part, sel, exchange_rows, bound_ctrl);

      packed = LLVMBuildInsertElement(ctx->builder, packed, out_part,
                                      LLVMConstInt(ctx->i32, dword, 0), "");
   }

   return LLVMBuildBitCast(ctx->builder, packed, orig_type, "");
}
2947 
/* Pack three 5-bit masks into one swizzle pattern:
 * and_mask in bits [4:0], or_mask in bits [9:5], xor_mask in bits [14:10].
 */
static inline unsigned ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
   assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);

   unsigned pattern = and_mask;
   pattern |= or_mask << 5;
   pattern |= xor_mask << 10;

   return pattern;
}
2953 
/* Emit one llvm.amdgcn.ds.swizzle call.
 *
 * \p src is zero-extended to i32 for the call and the result is truncated
 * back to the type \p src had on entry. \p mask is passed as an i32 constant.
 */
static LLVMValueRef _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         unsigned mask)
{
   LLVMTypeRef narrow_type = LLVMTypeOf(src);
   LLVMValueRef args[2];

   args[0] = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
   args[1] = LLVMConstInt(ctx->i32, mask, 0);

   LLVMValueRef swizzled =
      ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32, args, 2, 0);

   return LLVMBuildTrunc(ctx->builder, swizzled, narrow_type, "");
}
2968 
/* ds_swizzle wrapper that accepts any value ac_to_integer() can convert.
 *
 * Values of at most 32 bits use one _ac_build_ds_swizzle() call; wider values
 * (a multiple of 32 bits) are processed as a vector of i32, one dword at a
 * time. The result is bitcast back to the original type of \p src.
 */
LLVMValueRef ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef src_int = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src_int));

   if (bits <= 32) {
      LLVMValueRef single = _ac_build_ds_swizzle(ctx, src_int, mask);
      return LLVMBuildBitCast(ctx->builder, single, orig_type, "");
   }

   assert(bits % 32 == 0);

   const unsigned num_dwords = bits / 32;
   LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_dwords);
   LLVMValueRef src_vec = LLVMBuildBitCast(ctx->builder, src_int, vec_type, "");
   LLVMValueRef packed = LLVMGetUndef(vec_type);

   for (unsigned dword = 0; dword < num_dwords; dword++) {
      LLVMValueRef src_part =
         LLVMBuildExtractElement(ctx->builder, src_vec, LLVMConstInt(ctx->i32, dword, 0), "");
      LLVMValueRef out_part = _ac_build_ds_swizzle(ctx, src_part, mask);

      packed = LLVMBuildInsertElement(ctx->builder, packed, out_part,
                                      LLVMConstInt(ctx->i32, dword, 0), "");
   }

   return LLVMBuildBitCast(ctx->builder, packed, orig_type, "");
}
2991 
/* Wrap \p src in the intrinsic "llvm.amdgcn.<mode>.<type>".
 *
 * The value is converted to its integer form first; elements narrower than
 * 32 bits are zero-extended to i32 for the call and truncated afterwards.
 * The result is bitcast back to the original type of \p src.
 */
static LLVMValueRef ac_build_mode(struct ac_llvm_context *ctx, LLVMValueRef src, const char *mode)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   const bool is_narrow = ac_get_elem_bits(ctx, orig_type) < 32;
   char intr_name[32], type_suffix[8];

   LLVMValueRef value = ac_to_integer(ctx, src);
   if (is_narrow)
      value = LLVMBuildZExt(ctx->builder, value, ctx->i32, "");

   LLVMTypeRef call_type = LLVMTypeOf(value);
   ac_build_type_name_for_intr(call_type, type_suffix, sizeof(type_suffix));
   snprintf(intr_name, sizeof(intr_name), "llvm.amdgcn.%s.%s", mode, type_suffix);

   LLVMValueRef args[1] = {value};
   LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, call_type, args, 1, 0);

   if (is_narrow)
      result = LLVMBuildTrunc(ctx->builder, result, ac_to_integer_type(ctx, orig_type), "");

   return LLVMBuildBitCast(ctx->builder, result, orig_type, "");
}
3013 
/* Wrap \p src with ac_build_mode() using the "wwm" mode string. */
static LLVMValueRef ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   const char *mode = "wwm";

   return ac_build_mode(ctx, src, mode);
}
3018 
/* Wrap \p src with ac_build_mode() using the "wqm" mode string. */
LLVMValueRef ac_build_wqm(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   const char *mode = "wqm";

   return ac_build_mode(ctx, src, mode);
}
3023 
/* Emit llvm.amdgcn.set.inactive.<type> with \p src and \p inactive as its two
 * operands.
 *
 * Both operands are converted to their integer form; elements narrower than
 * 32 bits are zero-extended to i32 for the call.
 *
 * \return An integer-typed value: the intrinsic result itself, or for narrow
 *         inputs the result truncated to the integer form of the original
 *         type of \p src. Callers in this file bitcast it to the type they
 *         need.
 */
static LLVMValueRef ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
                                          LLVMValueRef inactive)
{
   char name[33], type[8];
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned bitsize = ac_get_elem_bits(ctx, src_type);
   src = ac_to_integer(ctx, src);
   inactive = ac_to_integer(ctx, inactive);

   if (bitsize < 32) {
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
      inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, "");
   }

   ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
   LLVMValueRef ret =
      ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src, inactive}, 2, 0);

   /* Truncate to the integer form of the source type, as ac_build_mode()
    * does: a trunc instruction needs an integer destination, and src_type
    * itself is not one when the caller passed a 16-bit float.
    */
   if (bitsize < 32)
      ret = LLVMBuildTrunc(ctx->builder, ret, ac_to_integer_type(ctx, src_type), "");

   return ret;
}
3047 
get_reduction_identity(struct ac_llvm_context * ctx,nir_op op,unsigned type_size)3048 static LLVMValueRef get_reduction_identity(struct ac_llvm_context *ctx, nir_op op,
3049                                            unsigned type_size)
3050 {
3051 
3052    if (type_size == 0) {
3053       switch (op) {
3054       case nir_op_ior:
3055       case nir_op_ixor:
3056          return ctx->i1false;
3057       case nir_op_iand:
3058          return ctx->i1true;
3059       default:
3060          unreachable("bad reduction intrinsic");
3061       }
3062    } else if (type_size == 1) {
3063       switch (op) {
3064       case nir_op_iadd:
3065          return ctx->i8_0;
3066       case nir_op_imul:
3067          return ctx->i8_1;
3068       case nir_op_imin:
3069          return LLVMConstInt(ctx->i8, INT8_MAX, 0);
3070       case nir_op_umin:
3071          return LLVMConstInt(ctx->i8, UINT8_MAX, 0);
3072       case nir_op_imax:
3073          return LLVMConstInt(ctx->i8, INT8_MIN, 0);
3074       case nir_op_umax:
3075          return ctx->i8_0;
3076       case nir_op_iand:
3077          return LLVMConstInt(ctx->i8, -1, 0);
3078       case nir_op_ior:
3079          return ctx->i8_0;
3080       case nir_op_ixor:
3081          return ctx->i8_0;
3082       default:
3083          unreachable("bad reduction intrinsic");
3084       }
3085    } else if (type_size == 2) {
3086       switch (op) {
3087       case nir_op_iadd:
3088          return ctx->i16_0;
3089       case nir_op_fadd:
3090          return ctx->f16_0;
3091       case nir_op_imul:
3092          return ctx->i16_1;
3093       case nir_op_fmul:
3094          return ctx->f16_1;
3095       case nir_op_imin:
3096          return LLVMConstInt(ctx->i16, INT16_MAX, 0);
3097       case nir_op_umin:
3098          return LLVMConstInt(ctx->i16, UINT16_MAX, 0);
3099       case nir_op_fmin:
3100          return LLVMConstReal(ctx->f16, INFINITY);
3101       case nir_op_imax:
3102          return LLVMConstInt(ctx->i16, INT16_MIN, 0);
3103       case nir_op_umax:
3104          return ctx->i16_0;
3105       case nir_op_fmax:
3106          return LLVMConstReal(ctx->f16, -INFINITY);
3107       case nir_op_iand:
3108          return LLVMConstInt(ctx->i16, -1, 0);
3109       case nir_op_ior:
3110          return ctx->i16_0;
3111       case nir_op_ixor:
3112          return ctx->i16_0;
3113       default:
3114          unreachable("bad reduction intrinsic");
3115       }
3116    } else if (type_size == 4) {
3117       switch (op) {
3118       case nir_op_iadd:
3119          return ctx->i32_0;
3120       case nir_op_fadd:
3121          return ctx->f32_0;
3122       case nir_op_imul:
3123          return ctx->i32_1;
3124       case nir_op_fmul:
3125          return ctx->f32_1;
3126       case nir_op_imin:
3127          return LLVMConstInt(ctx->i32, INT32_MAX, 0);
3128       case nir_op_umin:
3129          return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
3130       case nir_op_fmin:
3131          return LLVMConstReal(ctx->f32, INFINITY);
3132       case nir_op_imax:
3133          return LLVMConstInt(ctx->i32, INT32_MIN, 0);
3134       case nir_op_umax:
3135          return ctx->i32_0;
3136       case nir_op_fmax:
3137          return LLVMConstReal(ctx->f32, -INFINITY);
3138       case nir_op_iand:
3139          return LLVMConstInt(ctx->i32, -1, 0);
3140       case nir_op_ior:
3141          return ctx->i32_0;
3142       case nir_op_ixor:
3143          return ctx->i32_0;
3144       default:
3145          unreachable("bad reduction intrinsic");
3146       }
3147    } else { /* type_size == 64bit */
3148       switch (op) {
3149       case nir_op_iadd:
3150          return ctx->i64_0;
3151       case nir_op_fadd:
3152          return ctx->f64_0;
3153       case nir_op_imul:
3154          return ctx->i64_1;
3155       case nir_op_fmul:
3156          return ctx->f64_1;
3157       case nir_op_imin:
3158          return LLVMConstInt(ctx->i64, INT64_MAX, 0);
3159       case nir_op_umin:
3160          return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
3161       case nir_op_fmin:
3162          return LLVMConstReal(ctx->f64, INFINITY);
3163       case nir_op_imax:
3164          return LLVMConstInt(ctx->i64, INT64_MIN, 0);
3165       case nir_op_umax:
3166          return ctx->i64_0;
3167       case nir_op_fmax:
3168          return LLVMConstReal(ctx->f64, -INFINITY);
3169       case nir_op_iand:
3170          return LLVMConstInt(ctx->i64, -1, 0);
3171       case nir_op_ior:
3172          return ctx->i64_0;
3173       case nir_op_ixor:
3174          return ctx->i64_0;
3175       default:
3176          unreachable("bad reduction intrinsic");
3177       }
3178    }
3179 }
3180 
/* Emit the binary operation \p op on \p lhs and \p rhs.
 *
 * Integer min/max are built as a compare followed by a select. Float min/max
 * call llvm.minnum / llvm.maxnum, with the f64, f32 or f16 variant chosen
 * from the byte size of \p lhs (8, 4, or anything else respectively).
 */
static LLVMValueRef ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
                                    nir_op op)
{
   LLVMBuilderRef builder = ctx->builder;
   const unsigned bytes = ac_get_type_size(LLVMTypeOf(lhs));
   LLVMTypeRef float_type = bytes == 8 ? ctx->f64 : bytes == 4 ? ctx->f32 : ctx->f16;

   switch (op) {
   /* Plain arithmetic and bitwise operations. */
   case nir_op_iadd:
      return LLVMBuildAdd(builder, lhs, rhs, "");
   case nir_op_fadd:
      return LLVMBuildFAdd(builder, lhs, rhs, "");
   case nir_op_imul:
      return LLVMBuildMul(builder, lhs, rhs, "");
   case nir_op_fmul:
      return LLVMBuildFMul(builder, lhs, rhs, "");
   case nir_op_iand:
      return LLVMBuildAnd(builder, lhs, rhs, "");
   case nir_op_ior:
      return LLVMBuildOr(builder, lhs, rhs, "");
   case nir_op_ixor:
      return LLVMBuildXor(builder, lhs, rhs, "");

   /* Integer min/max: pick lhs when the comparison holds, rhs otherwise. */
   case nir_op_imin: {
      LLVMValueRef pick_lhs = LLVMBuildICmp(builder, LLVMIntSLT, lhs, rhs, "");
      return LLVMBuildSelect(builder, pick_lhs, lhs, rhs, "");
   }
   case nir_op_umin: {
      LLVMValueRef pick_lhs = LLVMBuildICmp(builder, LLVMIntULT, lhs, rhs, "");
      return LLVMBuildSelect(builder, pick_lhs, lhs, rhs, "");
   }
   case nir_op_imax: {
      LLVMValueRef pick_lhs = LLVMBuildICmp(builder, LLVMIntSGT, lhs, rhs, "");
      return LLVMBuildSelect(builder, pick_lhs, lhs, rhs, "");
   }
   case nir_op_umax: {
      LLVMValueRef pick_lhs = LLVMBuildICmp(builder, LLVMIntUGT, lhs, rhs, "");
      return LLVMBuildSelect(builder, pick_lhs, lhs, rhs, "");
   }

   /* Float min/max go through the size-specific intrinsic. */
   case nir_op_fmin: {
      const char *intr = bytes == 8   ? "llvm.minnum.f64"
                         : bytes == 4 ? "llvm.minnum.f32"
                                      : "llvm.minnum.f16";
      LLVMValueRef operands[2] = {lhs, rhs};
      return ac_build_intrinsic(ctx, intr, float_type, operands, 2, 0);
   }
   case nir_op_fmax: {
      const char *intr = bytes == 8   ? "llvm.maxnum.f64"
                         : bytes == 4 ? "llvm.maxnum.f32"
                                      : "llvm.maxnum.f16";
      LLVMValueRef operands[2] = {lhs, rhs};
      return ac_build_intrinsic(ctx, intr, float_type, operands, 2, 0);
   }

   default:
      unreachable("bad reduction intrinsic");
   }
}
3225 
3226 /**
3227  * \param src The value to shift.
3228  * \param identity The value to use the first lane.
3229  * \param maxprefix specifies that the result only needs to be correct for a
3230  *     prefix of this many threads
3231  * \return src, shifted 1 lane up, and identity shifted into lane 0.
3232  */
ac_wavefront_shift_right_1(struct ac_llvm_context * ctx,LLVMValueRef src,LLVMValueRef identity,unsigned maxprefix)3233 static LLVMValueRef ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src,
3234                                                LLVMValueRef identity, unsigned maxprefix)
3235 {
3236    if (ctx->gfx_level >= GFX10) {
3237       /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
3238       LLVMValueRef active, tmp1, tmp2;
3239       LLVMValueRef tid = ac_get_thread_id(ctx);
3240 
3241       tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
3242 
3243       tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);
3244 
3245       if (maxprefix > 32) {
3246          active =
3247             LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, false), "");
3248 
3249          tmp2 = LLVMBuildSelect(ctx->builder, active,
3250                                 ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, false)),
3251                                 tmp2, "");
3252 
3253          active = LLVMBuildOr(
3254             ctx->builder, active,
3255             LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3256                           LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, false), ""),
3257                           LLVMConstInt(ctx->i32, 0x10, false), ""),
3258             "");
3259          return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3260       } else if (maxprefix > 16) {
3261          active =
3262             LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 16, false), "");
3263 
3264          return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3265       }
3266    } else if (ctx->gfx_level >= GFX8) {
3267       return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
3268    }
3269 
3270    /* wavefront shift_right by 1 on SI/CI */
3271    LLVMValueRef active, tmp1, tmp2;
3272    LLVMValueRef tid = ac_get_thread_id(ctx);
3273    tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
3274    tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00));
3275    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3276                           LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),
3277                           LLVMConstInt(ctx->i32, 0x4, 0), "");
3278    tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3279    tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00));
3280    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3281                           LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),
3282                           LLVMConstInt(ctx->i32, 0x8, 0), "");
3283    tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3284    tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00));
3285    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3286                           LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),
3287                           LLVMConstInt(ctx->i32, 0x10, 0), "");
3288    tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3289    tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0));
3290    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");
3291    tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3292    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, ctx->i32_0, "");
3293    return LLVMBuildSelect(ctx->builder, active, identity, tmp1, "");
3294 }
3295 
/**
 * Build a prefix scan of \p src with operation \p op.
 *
 * \param op the operation combined with ac_build_alu_op() at every step
 * \param src the per-lane input value
 * \param identity value substituted where a step has nothing to contribute
 * \param maxprefix specifies that the result only needs to be correct for a
 *     prefix of this many threads
 * \param inclusive when false, the input is first shifted up by one lane
 *     with ac_wavefront_shift_right_1() and then scanned the same way
 */
static LLVMValueRef ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src,
                                  LLVMValueRef identity, unsigned maxprefix, bool inclusive)
{
   LLVMValueRef result, tmp;

   if (!inclusive)
      src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix);

   result = src;

   if (ctx->gfx_level <= GFX7) {
      /* GFX7 and older: built from ds_swizzle and readlane. Each step
       * fetches a swizzled copy of the running result and combines it only
       * in lanes whose thread id has the step's bit set; the other lanes
       * combine with the identity. Only a full 64-lane prefix is handled.
       */
      assert(maxprefix == 64);
      LLVMValueRef tid = ac_get_thread_id(ctx);
      LLVMValueRef active;
      /* Step for thread-id bit 0. */
      tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Step for thread-id bit 1. */
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Step for thread-id bit 2. */
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Step for thread-id bit 3. */
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Step for thread-id bit 4. */
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Step for thread-id bit 5: lane 31's result is read with readlane. */
      tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      return result;
   }

   /* GFX8 and newer: DPP row shifts, returning as soon as the requested
    * prefix length is covered. The first three steps shift the original
    * src by 1, 2 and 3 lanes; the later ones shift the running result.
    */
   if (maxprefix <= 1)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 2)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 3)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 4)
      return result;
   /* Shift by 4 with bank mask 0xe. */
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 8)
      return result;
   /* Shift by 8 with bank mask 0xc. */
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 16)
      return result;

   if (ctx->gfx_level >= GFX10) {
      /* GFX10+: the cross-row steps use a row-exchanging permlane16 and a
       * readlane instead of the row_bcast DPP modes used further below.
       */
      LLVMValueRef tid = ac_get_thread_id(ctx);
      LLVMValueRef active;

      tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);

      /* Only lanes with thread-id bit 4 set combine the permuted value. */
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, false), ""),
                             ctx->i32_0, "");

      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");

      result = ac_build_alu_op(ctx, result, tmp, op);

      if (maxprefix <= 32)
         return result;

      /* Lanes 32 and above combine lane 31's result. */
      tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));

      active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, LLVMConstInt(ctx->i32, 32, false), "");

      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");

      result = ac_build_alu_op(ctx, result, tmp, op);
      return result;
   }

   /* GFX8/GFX9: finish with row_bcast15 (row mask 0xa) and, for prefixes
    * longer than 32 lanes, row_bcast31 (row mask 0xc).
    */
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 32)
      return result;
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   return result;
}
3410 
/* Build an inclusive scan of \p src with operation \p op over the wave.
 *
 * An i1 source combined with nir_op_iadd takes a shortcut: the value is
 * zero-extended to i32, ballot + mbcnt are emitted, and the lane's own
 * value is added on top. Every other case masks the source with the
 * reduction identity via ac_build_set_inactive(), runs ac_build_scan() for
 * ctx->wave_size lanes and wraps the result with ac_build_wwm().
 */
LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   const bool is_bool_add = LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd;

   if (is_bool_add) {
      LLVMValueRef own = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
      LLVMValueRef ballot = ac_build_ballot(ctx, own);
      LLVMValueRef count = ac_build_mbcnt(ctx, ballot);

      return LLVMBuildAdd(ctx->builder, count, own, "");
   }

   ac_build_optimization_barrier(ctx, &src, false);

   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   LLVMValueRef masked = ac_build_set_inactive(ctx, src, identity);
   LLVMValueRef scanned = LLVMBuildBitCast(ctx->builder, masked, LLVMTypeOf(identity), "");

   scanned = ac_build_scan(ctx, op, scanned, identity, ctx->wave_size, true);

   return ac_build_wwm(ctx, scanned);
}
3433 
/* Build an exclusive scan of \p src with operation \p op over the wave.
 *
 * An i1 source combined with nir_op_iadd takes a shortcut: the value is
 * zero-extended to i32 and the result of ballot + mbcnt is returned as is.
 * Every other case masks the source with the reduction identity via
 * ac_build_set_inactive(), runs ac_build_scan() in non-inclusive mode for
 * ctx->wave_size lanes and wraps the result with ac_build_wwm().
 */
LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   const bool is_bool_add = LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd;

   if (is_bool_add) {
      LLVMValueRef own = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
      LLVMValueRef ballot = ac_build_ballot(ctx, own);

      return ac_build_mbcnt(ctx, ballot);
   }

   ac_build_optimization_barrier(ctx, &src, false);

   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   LLVMValueRef masked = ac_build_set_inactive(ctx, src, identity);
   LLVMValueRef scanned = LLVMBuildBitCast(ctx->builder, masked, LLVMTypeOf(identity), "");

   scanned = ac_build_scan(ctx, op, scanned, identity, ctx->wave_size, false);

   return ac_build_wwm(ctx, scanned);
}
3455 
/* Build a reduction of \p src with operation \p op over clusters of
 * \p cluster_size lanes.
 *
 * A cluster size of 1 returns \p src unchanged. Otherwise the source is
 * masked with the reduction identity via ac_build_set_inactive(), and each
 * step fetches a value from a partner lane and combines it with the running
 * result. The function returns (wrapped with ac_build_wwm()) as soon as the
 * requested cluster size is reached.
 */
LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op,
                             unsigned cluster_size)
{
   if (cluster_size == 1)
      return src;
   ac_build_optimization_barrier(ctx, &src, false);
   LLVMValueRef result, swap;
   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                             LLVMTypeOf(identity), "");
   /* Cluster of 2: quad swizzle (1, 0, 3, 2). */
   swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 2)
      return ac_build_wwm(ctx, result);

   /* Cluster of 4: quad swizzle (2, 3, 0, 1). */
   swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 4)
      return ac_build_wwm(ctx, result);

   /* Cluster of 8: DPP half-row mirror on GFX8+, ds_swizzle with xor mask
    * 0x04 on older chips.
    */
   if (ctx->gfx_level >= GFX8)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 8)
      return ac_build_wwm(ctx, result);

   /* Cluster of 16: DPP row mirror on GFX8+, ds_swizzle with xor mask 0x08
    * on older chips.
    */
   if (ctx->gfx_level >= GFX8)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10 >> 1));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 16)
      return ac_build_wwm(ctx, result);

   /* Cluster of 32: row-exchanging permlane16 on GFX10+; row_bcast15 on
    * GFX8/GFX9 unless the cluster size is exactly 32; otherwise ds_swizzle
    * with xor mask 0x10.
    */
   if (ctx->gfx_level >= GFX10)
      swap = ac_build_permlane16(ctx, result, 0, true, false);
   else if (ctx->gfx_level >= GFX8 && cluster_size != 32)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 32)
      return ac_build_wwm(ctx, result);

   if (ctx->gfx_level >= GFX8) {
      /* The last step is only emitted for 64-lane waves: combine with lane
       * 31 (readlane on GFX10+, row_bcast31 before), then read lane 63.
       */
      if (ctx->wave_size == 64) {
         if (ctx->gfx_level >= GFX10)
            swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
         else
            swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
         result = ac_build_alu_op(ctx, result, swap, op);
         result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
      }

      return ac_build_wwm(ctx, result);
   } else {
      /* Before GFX8: combine the values read from lane 0 and lane 32. */
      swap = ac_build_readlane(ctx, result, ctx->i32_0);
      result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
      result = ac_build_alu_op(ctx, result, swap, op);
      return ac_build_wwm(ctx, result);
   }
}
3520 
/* Exchange values between *arg0 and *arg1 for one export channel.
 *
 * Both inputs are bitcast to i32. *arg0 goes through llvm.amdgcn.mov.dpp8.i32
 * with selector 0xde54c1, the two values are swapped in lanes whose thread id
 * is even, and the new *arg0 goes through the same dpp8 move again. The
 * results are written back through the pointers.
 */
static void _ac_build_dual_src_blend_swizzle(struct ac_llvm_context *ctx,
                                             LLVMValueRef *arg0, LLVMValueRef *arg1)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef dpp8_args[2];

   LLVMValueRef first = LLVMBuildBitCast(builder, *arg0, ctx->i32, "");
   LLVMValueRef second = LLVMBuildBitCast(builder, *arg1, ctx->i32, "");

   /* swap odd,even lanes of arg_0 */
   dpp8_args[0] = first;
   dpp8_args[1] = LLVMConstInt(ctx->i32, 0xde54c1, 0);
   LLVMValueRef first_swapped =
      ac_build_intrinsic(ctx, "llvm.amdgcn.mov.dpp8.i32", ctx->i32, dpp8_args, 2, 0);

   /* swap even lanes between arg_0 and arg_1 */
   LLVMValueRef tid = ac_get_thread_id(ctx);
   LLVMValueRef low_bit = LLVMBuildAnd(builder, tid, ctx->i32_1, "");
   LLVMValueRef is_even = LLVMBuildICmp(builder, LLVMIntEQ, low_bit, ctx->i32_0, "");
   LLVMValueRef mixed0 = LLVMBuildSelect(builder, is_even, second, first_swapped, "");
   LLVMValueRef mixed1 = LLVMBuildSelect(builder, is_even, first_swapped, second, "");

   /* swap odd,even lanes again for arg_0 */
   dpp8_args[0] = mixed0;
   dpp8_args[1] = LLVMConstInt(ctx->i32, 0xde54c1, 0);
   mixed0 = ac_build_intrinsic(ctx, "llvm.amdgcn.mov.dpp8.i32", ctx->i32, dpp8_args, 2, 0);

   *arg0 = mixed0;
   *arg1 = mixed1;
}
3557 
ac_build_dual_src_blend_swizzle(struct ac_llvm_context * ctx,struct ac_export_args * mrt0,struct ac_export_args * mrt1)3558 void ac_build_dual_src_blend_swizzle(struct ac_llvm_context *ctx,
3559                                      struct ac_export_args *mrt0,
3560                                      struct ac_export_args *mrt1)
3561 {
3562    assert(ctx->gfx_level >= GFX11);
3563    assert(mrt0->enabled_channels == mrt1->enabled_channels);
3564 
3565    for (int i = 0; i < 4; i++) {
3566       if (mrt0->enabled_channels & (1 << i) && mrt1->enabled_channels & (1 << i))
3567          _ac_build_dual_src_blend_swizzle(ctx, &mrt0->out[i], &mrt1->out[i]);
3568    }
3569 }
3570 
/* Permute src within each quad so that the four lanes read from lane0..lane3.
 *
 * GFX8 and newer emit a DPP op with the quad-perm control word; older chips
 * emit a ds_swizzle whose pattern is the same control word with bit 15 set
 * (NOTE(review): presumably bit 15 selects quad-permute mode - confirm against
 * the ISA documentation).
 */
LLVMValueRef ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned lane0,
                                   unsigned lane1, unsigned lane2, unsigned lane3)
{
   const unsigned perm = dpp_quad_perm(lane0, lane1, lane2, lane3);

   if (ctx->gfx_level < GFX8)
      return ac_build_ds_swizzle(ctx, src, (1 << 15) | perm);

   return ac_build_dpp(ctx, src, src, perm, 0xf, 0xf, false);
}
3581 
/* Build a cross-lane read of src using llvm.amdgcn.ds.bpermute.
 *
 * index is multiplied by 4 before being handed to the intrinsic, src is
 * zero-extended to i32 for the call, and the i32 result is truncated back to
 * the original type of src.
 */
LLVMValueRef ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
{
   const LLVMTypeRef src_type = LLVMTypeOf(src);

   LLVMValueRef scaled_index =
      LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
   LLVMValueRef wide_src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef bpermute_args[2] = {scaled_index, wide_src};
   LLVMValueRef shuffled =
      ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32, bpermute_args, 2, 0);

   return LLVMBuildTrunc(ctx->builder, shuffled, src_type, "");
}
3594 
/* Build a call to the llvm.amdgcn.frexp.exp intrinsic on src0.
 *
 * bitsize selects the overload:
 *   16 -> "llvm.amdgcn.frexp.exp.i16.f16", result type i16
 *   32 -> "llvm.amdgcn.frexp.exp.i32.f32", result type i32
 *   64 -> "llvm.amdgcn.frexp.exp.i32.f64", result type i32
 *
 * Any other bitsize is a caller error: it trips the assert in debug builds
 * and, as before, falls back to the f64 overload in release builds.
 */
LLVMValueRef ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   const char *intr; /* only ever points at string literals */

   switch (bitsize) {
      case 16:
         intr = "llvm.amdgcn.frexp.exp.i16.f16";
         type = ctx->i16;
         break;
      case 32:
         intr = "llvm.amdgcn.frexp.exp.i32.f32";
         type = ctx->i32;
         break;
      default:
         assert(bitsize == 64);
         intr = "llvm.amdgcn.frexp.exp.i32.f64";
         type = ctx->i32;
         break;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
}
/* Build a call to the llvm.amdgcn.frexp.mant intrinsic on src0.
 *
 * bitsize selects the overload and the result type:
 *   16 -> "llvm.amdgcn.frexp.mant.f16", f16
 *   32 -> "llvm.amdgcn.frexp.mant.f32", f32
 *   64 -> "llvm.amdgcn.frexp.mant.f64", f64
 *
 * Any other bitsize is a caller error: it trips the assert in debug builds
 * and, as before, falls back to the f64 overload in release builds.
 */
LLVMValueRef ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   const char *intr; /* only ever points at string literals */

   switch (bitsize) {
      case 16:
         intr = "llvm.amdgcn.frexp.mant.f16";
         type = ctx->f16;
         break;
      case 32:
         intr = "llvm.amdgcn.frexp.mant.f32";
         type = ctx->f32;
         break;
      default:
         assert(bitsize == 64);
         intr = "llvm.amdgcn.frexp.mant.f64";
         type = ctx->f64;
         break;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
}
3637 
/* Build a call to the llvm.canonicalize intrinsic on src0.
 *
 * bitsize selects the overload and the result type:
 *   16 -> "llvm.canonicalize.f16", f16
 *   32 -> "llvm.canonicalize.f32", f32
 *   64 -> "llvm.canonicalize.f64", f64
 *
 * Any other bitsize is a caller error: it trips the assert in debug builds
 * and, as before, falls back to the f64 overload in release builds.
 */
LLVMValueRef ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   const char *intr; /* only ever points at string literals */

   switch (bitsize) {
      case 16:
         intr = "llvm.canonicalize.f16";
         type = ctx->f16;
         break;
      case 32:
         intr = "llvm.canonicalize.f32";
         type = ctx->f32;
         break;
      default:
         assert(bitsize == 64);
         intr = "llvm.canonicalize.f64";
         type = ctx->f64;
         break;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
}
3659 
3660 /*
3661  * this takes an I,J coordinate pair,
3662  * and works out the X and Y derivatives.
3663  * it returns DDX(I), DDX(J), DDY(I), DDY(J).
3664  */
ac_build_ddxy_interp(struct ac_llvm_context * ctx,LLVMValueRef interp_ij)3665 LLVMValueRef ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
3666 {
3667    LLVMValueRef result[4], a;
3668    unsigned i;
3669 
3670    for (i = 0; i < 2; i++) {
3671       a = LLVMBuildExtractElement(ctx->builder, interp_ij, LLVMConstInt(ctx->i32, i, false), "");
3672       result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a);
3673       result[2 + i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a);
3674    }
3675    return ac_build_gather_values(ctx, result, 4);
3676 }
3677 
ac_build_load_helper_invocation(struct ac_llvm_context * ctx)3678 LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
3679 {
3680    LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.live.mask", ctx->i1, NULL, 0, 0);
3681 
3682    return LLVMBuildNot(ctx->builder, result, "");
3683 }
3684 
/* Emit a call to func (of type fn_type) with num_args arguments and give the
 * call instruction the same calling convention as the callee.
 *
 * Returns the call instruction.
 */
LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMTypeRef fn_type, LLVMValueRef func, LLVMValueRef *args,
                           unsigned num_args)
{
   LLVMValueRef call = LLVMBuildCall2(ctx->builder, fn_type, func, args, num_args, "");
   const unsigned callee_cc = LLVMGetFunctionCallConv(func);

   LLVMSetInstructionCallConv(call, callee_cc);
   return call;
}
3692 
/* Fill *args with the export arguments for the MRTZ target.
 *
 * depth, stencil, samplemask and mrt0_alpha may each be NULL when the shader
 * does not write that value; at least one of depth, stencil or samplemask must
 * be non-NULL. The export format is chosen by ac_get_spi_shader_z_format()
 * from which values are present.
 *
 * When is_last is set, the export is flagged with valid_mask and done.
 * *args is fully overwritten; channels that are not written stay undef.
 */
void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
                     LLVMValueRef samplemask, LLVMValueRef mrt0_alpha, bool is_last,
                     struct ac_export_args *args)
{
   const bool gfx11_plus = ctx->gfx_level >= GFX11;
   const unsigned z_format = ac_get_spi_shader_z_format(depth != NULL, stencil != NULL,
                                                        samplemask != NULL, mrt0_alpha != NULL);
   unsigned writemask = 0;

   assert(depth || stencil || samplemask);

   memset(args, 0, sizeof(*args));

   if (is_last) {
      args->valid_mask = 1; /* whether the EXEC mask is valid */
      args->done = 1;       /* DONE bit */
   }

   /* Specify the target we are exporting */
   args->target = V_008DFC_SQ_EXP_MRTZ;

   args->compr = 0; /* COMPR flag */

   /* Channel layout: R = depth, G = stencil test val[0:7] / stencil op val[8:15],
    * B = sample mask, A = alpha to mask. Everything starts out undefined.
    */
   for (unsigned chan = 0; chan < 4; chan++)
      args->out[chan] = LLVMGetUndef(ctx->f32);

   if (z_format == V_028710_SPI_SHADER_UINT16_ABGR) {
      /* 16-bit packed format: carries only stencil and sample mask. */
      assert(!depth);
      args->compr = !gfx11_plus; /* COMPR flag */

      if (stencil) {
         /* Stencil should be in X[23:16]. */
         LLVMValueRef stencil_bits = ac_to_integer(ctx, stencil);
         stencil_bits =
            LLVMBuildShl(ctx->builder, stencil_bits, LLVMConstInt(ctx->i32, 16, 0), "");
         args->out[0] = ac_to_float(ctx, stencil_bits);
         writemask |= gfx11_plus ? 0x1 : 0x3;
      }
      if (samplemask) {
         /* SampleMask should be in Y[15:0]. */
         args->out[1] = samplemask;
         writemask |= gfx11_plus ? 0x2 : 0xc;
      }
   } else {
      /* One full channel per value, in R, G, B, A order. */
      LLVMValueRef channel_values[4] = {depth, stencil, samplemask, mrt0_alpha};

      for (unsigned chan = 0; chan < 4; chan++) {
         if (!channel_values[chan])
            continue;

         args->out[chan] = channel_values[chan];
         writemask |= 1u << chan;
      }
   }

   /* GFX6 (except OLAND and HAINAN) has a bug that it only looks
    * at the X writemask component. */
   const bool x_only_writemask_bug = ctx->gfx_level == GFX6 &&
                                     ctx->info->family != CHIP_OLAND &&
                                     ctx->info->family != CHIP_HAINAN;
   if (x_only_writemask_bug)
      writemask |= 0x1;

   /* Specify which components to enable */
   args->enabled_channels = writemask;
}
3764 
arg_llvm_type(enum ac_arg_type type,unsigned size,struct ac_llvm_context * ctx)3765 static LLVMTypeRef arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx)
3766 {
3767    LLVMTypeRef base;
3768    switch (type) {
3769       case AC_ARG_FLOAT:
3770          return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size);
3771       case AC_ARG_INT:
3772          return size == 1 ? ctx->i32 : LLVMVectorType(ctx->i32, size);
3773       case AC_ARG_CONST_PTR:
3774          base = ctx->i8;
3775          break;
3776       case AC_ARG_CONST_FLOAT_PTR:
3777          base = ctx->f32;
3778          break;
3779       case AC_ARG_CONST_PTR_PTR:
3780          base = ac_array_in_const32_addr_space(ctx->i8);
3781          break;
3782       case AC_ARG_CONST_DESC_PTR:
3783          base = ctx->v4i32;
3784          break;
3785       case AC_ARG_CONST_IMAGE_PTR:
3786          base = ctx->v8i32;
3787          break;
3788       default:
3789          assert(false);
3790          return NULL;
3791    }
3792 
3793    assert(base);
3794    if (size == 1) {
3795       return ac_array_in_const32_addr_space(base);
3796    } else {
3797       assert(size == 2);
3798       return ac_array_in_const_addr_space(base);
3799    }
3800 }
3801 
/* Create the shader's main function in `module` and position ctx->builder at
 * the end of its entry block ("main_body").
 *
 * Every entry of `args` becomes one LLVM parameter, except ring_offsets (when
 * used): its index is recorded in ctx->ring_offsets_index and its value is
 * obtained from llvm.amdgcn.implicit.buffer.ptr instead. SGPR parameters are
 * marked "inreg"; SGPR pointer parameters additionally get noalias,
 * dereferenceable and 4-byte alignment attributes.
 *
 * The function is also stored in ctx->main_function, which is the return
 * value.
 */
struct ac_llvm_pointer ac_build_main(const struct ac_shader_args *args, struct ac_llvm_context *ctx,
                           enum ac_llvm_calling_convention convention, const char *name,
                           LLVMTypeRef ret_type, LLVMModuleRef module)
{
   LLVMTypeRef param_types[AC_MAX_ARGS];
   enum ac_arg_regfile param_files[AC_MAX_ARGS];
   unsigned num_params = 0;

   /* ring_offsets doesn't have a corresponding function parameter because LLVM can allocate it
    * itself for scratch memory purposes and gives us access through llvm.amdgcn.implicit.buffer.ptr
    */
   for (unsigned i = 0; i < args->arg_count; i++) {
      const bool is_ring_offsets = args->ring_offsets.used && i == args->ring_offsets.arg_index;

      if (is_ring_offsets) {
         ctx->ring_offsets_index = i;
         continue;
      }

      param_files[num_params] = args->args[i].file;
      param_types[num_params] = arg_llvm_type(args->args[i].type, args->args[i].size, ctx);
      num_params++;
   }

   LLVMTypeRef fn_type = LLVMFunctionType(ret_type, param_types, num_params, 0);
   LLVMValueRef fn = LLVMAddFunction(module, name, fn_type);

   LLVMBasicBlockRef entry_block = LLVMAppendBasicBlockInContext(ctx->context, fn, "main_body");
   LLVMPositionBuilderAtEnd(ctx->builder, entry_block);

   LLVMSetFunctionCallConv(fn, convention);

   for (unsigned i = 0; i < num_params; ++i) {
      /* Only SGPR parameters receive attributes. */
      if (param_files[i] != AC_ARG_SGPR)
         continue;

      /* Parameter attribute slots are 1-based. */
      ac_add_function_attr(ctx->context, fn, i + 1, "inreg");

      LLVMValueRef param = LLVMGetParam(fn, i);
      if (LLVMGetTypeKind(LLVMTypeOf(param)) != LLVMPointerTypeKind)
         continue;

      ac_add_function_attr(ctx->context, fn, i + 1, "noalias");
      ac_add_attr_dereferenceable(param, UINT64_MAX);
      ac_add_attr_alignment(param, 4);
   }

   if (args->ring_offsets.used) {
      LLVMValueRef implicit_buf =
         ac_build_intrinsic(ctx, "llvm.amdgcn.implicit.buffer.ptr",
                            LLVMPointerType(ctx->i8, AC_ADDR_SPACE_CONST), NULL, 0, 0);

      ctx->ring_offsets = LLVMBuildBitCast(ctx->builder, implicit_buf,
                                           ac_array_in_const_addr_space(ctx->v4i32), "");
   }

   ctx->main_function = (struct ac_llvm_pointer) {
      .value = fn,
      .pointee_type = fn_type
   };

   /* Enable denormals for FP16 and FP64: */
   LLVMAddTargetDependentFunctionAttr(fn, "denormal-fp-math", "ieee,ieee");
   /* Disable denormals for FP32: */
   LLVMAddTargetDependentFunctionAttr(fn, "denormal-fp-math-f32",
                                      "preserve-sign,preserve-sign");

   /* Pixel shaders also state whether they export depth (MRTZ) and color. */
   if (convention == AC_LLVM_AMDGPU_PS) {
      const char *depth_export = ctx->exports_mrtz ? "1" : "0";
      const char *color_export = ctx->exports_color_null ? "1" : "0";

      LLVMAddTargetDependentFunctionAttr(fn, "amdgpu-depth-export", depth_export);
      LLVMAddTargetDependentFunctionAttr(fn, "amdgpu-color-export", color_export);
   }

   return ctx->main_function;
}
3873 
ac_build_s_endpgm(struct ac_llvm_context * ctx)3874 void ac_build_s_endpgm(struct ac_llvm_context *ctx)
3875 {
3876    LLVMTypeRef calltype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3877    LLVMValueRef code = LLVMConstInlineAsm(calltype, "s_endpgm", "", true, false);
3878    LLVMBuildCall2(ctx->builder, calltype, code, NULL, 0, "");
3879 }
3880 
/* Return an i1 telling whether `a` is a NaN (signaling or quiet) or an
 * infinity (negative or positive), using llvm.amdgcn.class.f32.
 */
LLVMValueRef ac_build_is_inf_or_nan(struct ac_llvm_context *ctx, LLVMValueRef a)
{
   LLVMValueRef class_mask =
      LLVMConstInt(ctx->i32, S_NAN | Q_NAN | N_INFINITY | P_INFINITY, 0);
   LLVMValueRef class_args[2] = {a, class_mask};

   return ac_build_intrinsic(ctx, "llvm.amdgcn.class.f32", ctx->i1, class_args, 2, 0);
}
3889