1 /*
2  * Copyright 2014 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
7 #include "ac_llvm_build.h"
8 #include "ac_gpu_info.h"
9 #include "ac_nir.h"
10 #include "ac_llvm_util.h"
11 #include "ac_shader_util.h"
12 #include "c11/threads.h"
13 #include "shader_enums.h"
14 #include "sid.h"
15 #include "util/bitscan.h"
16 #include "util/macros.h"
17 #include "util/u_atomic.h"
18 #include "util/u_math.h"
19 #include <llvm-c/Core.h>
20 #include <llvm/Config/llvm-config.h>
21 
22 #include <assert.h>
23 #include <stdio.h>
24 
25 #define AC_LLVM_INITIAL_CF_DEPTH 4
26 
27 /* Data for if/else/endif and bgnloop/endloop control flow structures.
28  */
29 struct ac_llvm_flow {
30    /* Loop exit or next part of if/else/endif. */
31    LLVMBasicBlockRef next_block;
32    LLVMBasicBlockRef loop_entry_block;
33 };
34 
35 /* Initialize module-independent parts of the context.
36  *
37  * The caller is responsible for initializing ctx::module and ctx::builder.
38  */
39 void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler,
40                           const struct radeon_info *info, enum ac_float_mode float_mode,
41                           unsigned wave_size, unsigned ballot_mask_bits, bool exports_color_null,
42                           bool exports_mrtz)
43 {
44    ctx->context = LLVMContextCreate();
45 
46    ctx->info = info;
47    ctx->gfx_level = info->gfx_level;
48    ctx->wave_size = wave_size;
49    ctx->ballot_mask_bits = ballot_mask_bits;
50    ctx->float_mode = float_mode;
51    ctx->exports_color_null = exports_color_null;
52    ctx->exports_mrtz = exports_mrtz;
53    ctx->module = ac_create_module(compiler->tm, ctx->context);
54    ctx->builder = ac_create_builder(ctx->context, float_mode);
55 
56    ctx->voidt = LLVMVoidTypeInContext(ctx->context);
57    ctx->i1 = LLVMInt1TypeInContext(ctx->context);
58    ctx->i8 = LLVMInt8TypeInContext(ctx->context);
59    ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
60    ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
61    ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
62    ctx->i128 = LLVMIntTypeInContext(ctx->context, 128);
63    ctx->intptr = ctx->i32;
64    ctx->f16 = LLVMHalfTypeInContext(ctx->context);
65    ctx->f32 = LLVMFloatTypeInContext(ctx->context);
66    ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
67    ctx->v4i8 = LLVMVectorType(ctx->i8, 4);
68    ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
69    ctx->v4i16 = LLVMVectorType(ctx->i16, 4);
70    ctx->v2f16 = LLVMVectorType(ctx->f16, 2);
71    ctx->v4f16 = LLVMVectorType(ctx->f16, 4);
72    ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
73    ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
74    ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
75    ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
76    ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
77    ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
78    ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
79    ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
80    ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);
81 
82    ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
83    ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
84    ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
85    ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
86    ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
87    ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
88    ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
89    ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
90    ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false);
91    ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false);
92    ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
93    ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
94    ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
95    ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
96    ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
97    ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);
98 
99    ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
100    ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
101 
102    ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5);
103    ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14);
104    ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);
105    ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6);
106 
107    ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
108 
109    LLVMValueRef three = LLVMConstReal(ctx->f32, 3);
110    ctx->three_md = LLVMMDNodeInContext(ctx->context, &three, 1);
111 
112    ctx->flow = calloc(1, sizeof(*ctx->flow));
113 
114    ctx->ring_offsets_index = INT32_MAX;
115 }
116 
117 void ac_llvm_context_dispose(struct ac_llvm_context *ctx)
118 {
119    free(ctx->flow->stack);
120    free(ctx->flow);
121    ctx->flow = NULL;
122 
123    LLVMDisposeBuilder(ctx->builder);
124 }
125 
126 int ac_get_llvm_num_components(LLVMValueRef value)
127 {
128    LLVMTypeRef type = LLVMTypeOf(value);
129    unsigned num_components =
130       LLVMGetTypeKind(type) == LLVMVectorTypeKind ? LLVMGetVectorSize(type) : 1;
131    return num_components;
132 }
133 
134 LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index)
135 {
136    if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {
137       assert(index == 0);
138       return value;
139    }
140 
141    return LLVMBuildExtractElement(ac->builder, value, LLVMConstInt(ac->i32, index, false), "");
142 }
143 
144 int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
145 {
146    if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
147       type = LLVMGetElementType(type);
148 
149    if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
150       return LLVMGetIntTypeWidth(type);
151 
152    if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
153       if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_LDS)
154          return 32;
155    }
156 
157    if (type == ctx->f16)
158       return 16;
159    if (type == ctx->f32)
160       return 32;
161    if (type == ctx->f64)
162       return 64;
163 
164    unreachable("Unhandled type kind in get_elem_bits");
165 }
166 
167 unsigned ac_get_type_size(LLVMTypeRef type)
168 {
169    LLVMTypeKind kind = LLVMGetTypeKind(type);
170 
171    switch (kind) {
172    case LLVMIntegerTypeKind:
173       return LLVMGetIntTypeWidth(type) / 8;
174    case LLVMHalfTypeKind:
175       return 2;
176    case LLVMFloatTypeKind:
177       return 4;
178    case LLVMDoubleTypeKind:
179       return 8;
180    case LLVMPointerTypeKind:
181       if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT)
182          return 4;
183       return 8;
184    case LLVMVectorTypeKind:
185       return LLVMGetVectorSize(type) * ac_get_type_size(LLVMGetElementType(type));
186    case LLVMArrayTypeKind:
187       return LLVMGetArrayLength(type) * ac_get_type_size(LLVMGetElementType(type));
188    default:
189       assert(0);
190       return 0;
191    }
192 }
193 
194 static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
195 {
196    if (t == ctx->i1)
197       return ctx->i1;
198    else if (t == ctx->i8)
199       return ctx->i8;
200    else if (t == ctx->f16 || t == ctx->i16)
201       return ctx->i16;
202    else if (t == ctx->f32 || t == ctx->i32)
203       return ctx->i32;
204    else if (t == ctx->f64 || t == ctx->i64)
205       return ctx->i64;
206    else
207       unreachable("Unhandled integer size");
208 }
209 
210 LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
211 {
212    if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
213       LLVMTypeRef elem_type = LLVMGetElementType(t);
214       return LLVMVectorType(to_integer_type_scalar(ctx, elem_type), LLVMGetVectorSize(t));
215    }
216    if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) {
217       switch (LLVMGetPointerAddressSpace(t)) {
218       case AC_ADDR_SPACE_GLOBAL:
219       case AC_ADDR_SPACE_CONST:
220          return ctx->i64;
221       case AC_ADDR_SPACE_CONST_32BIT:
222       case AC_ADDR_SPACE_LDS:
223          return ctx->i32;
224       default:
225          unreachable("unhandled address space");
226       }
227    }
228    return to_integer_type_scalar(ctx, t);
229 }
230 
231 LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
232 {
233    LLVMTypeRef type = LLVMTypeOf(v);
234    if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
235       return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), "");
236    }
237    return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
238 }
239 
240 LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
241 {
242    LLVMTypeRef type = LLVMTypeOf(v);
243    if (LLVMGetTypeKind(type) == LLVMPointerTypeKind)
244       return v;
245    return ac_to_integer(ctx, v);
246 }
247 
248 static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
249 {
250    if (t == ctx->i8)
251       return ctx->i8;
252    else if (t == ctx->i16 || t == ctx->f16)
253       return ctx->f16;
254    else if (t == ctx->i32 || t == ctx->f32)
255       return ctx->f32;
256    else if (t == ctx->i64 || t == ctx->f64)
257       return ctx->f64;
258    else
259       unreachable("Unhandled float size");
260 }
261 
262 LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
263 {
264    if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
265       LLVMTypeRef elem_type = LLVMGetElementType(t);
266       return LLVMVectorType(to_float_type_scalar(ctx, elem_type), LLVMGetVectorSize(t));
267    }
268    return to_float_type_scalar(ctx, t);
269 }
270 
271 LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
272 {
273    LLVMTypeRef type = LLVMTypeOf(v);
274    return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), "");
275 }
276 
277 LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
278                                 LLVMTypeRef return_type, LLVMValueRef *params, unsigned param_count,
279                                 unsigned attrib_mask)
280 {
281    LLVMValueRef call;
282 
283    LLVMTypeRef param_types[32];
284    assert(param_count <= 32);
285    for (unsigned i = 0; i < param_count; ++i) {
286       assert(params[i]);
287       param_types[i] = LLVMTypeOf(params[i]);
288    }
289 
290    LLVMTypeRef function_type = LLVMFunctionType(return_type, param_types, param_count, 0);
291    LLVMValueRef function = LLVMGetNamedFunction(ctx->module, name);
292 
293    if (!function) {
294       function = LLVMAddFunction(ctx->module, name, function_type);
295 
296       LLVMSetFunctionCallConv(function, LLVMCCallConv);
297       LLVMSetLinkage(function, LLVMExternalLinkage);
298    }
299 
300    call = LLVMBuildCall2(ctx->builder, function_type, function, params, param_count, "");
301 
302    if (attrib_mask & AC_ATTR_INVARIANT_LOAD)
303       LLVMSetMetadata(call, ctx->invariant_load_md_kind, ctx->empty_md);
304 
305    if (attrib_mask & AC_ATTR_CONVERGENT)
306       LLVMAddCallSiteAttribute(call, -1, ac_get_llvm_attribute(ctx->context, "convergent"));
307 
308    LLVMAddCallSiteAttribute(call, -1, ac_get_llvm_attribute(ctx->context, "nounwind"));
309    return call;
310 }
311 
312 /**
313  * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
314  * intrinsic names).
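 *
 * For example, this produces "i32" for i32, "v4f32" for a <4 x float> vector,
 * and "sl_i32i32s" for a struct of two i32s.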
315  */
316 void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
317 {
318    LLVMTypeRef elem_type = type;
319 
320    if (LLVMGetTypeKind(type) == LLVMStructTypeKind) {
321       unsigned count = LLVMCountStructElementTypes(type);
322       int ret = snprintf(buf, bufsize, "sl_");
323       buf += ret;
324       bufsize -= ret;
325 
326       LLVMTypeRef *elems = alloca(count * sizeof(LLVMTypeRef));
327       LLVMGetStructElementTypes(type, elems);
328 
329       for (unsigned i = 0; i < count; i++) {
330          ac_build_type_name_for_intr(elems[i], buf, bufsize);
331          ret = strlen(buf);
332          buf += ret;
333          bufsize -= ret;
334       }
335 
336       snprintf(buf, bufsize, "s");
337       return;
338    }
339 
340    assert(bufsize >= 8);
341    if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
342       int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type));
343       if (ret < 0) {
344          char *type_name = LLVMPrintTypeToString(type);
345          fprintf(stderr, "Error building type name for: %s\n", type_name);
346          LLVMDisposeMessage(type_name);
347          return;
348       }
349       elem_type = LLVMGetElementType(type);
350       buf += ret;
351       bufsize -= ret;
352    }
353    switch (LLVMGetTypeKind(elem_type)) {
354    default:
355       break;
356    case LLVMIntegerTypeKind:
357       snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
358       break;
359    case LLVMHalfTypeKind:
360       snprintf(buf, bufsize, "f16");
361       break;
362    case LLVMFloatTypeKind:
363       snprintf(buf, bufsize, "f32");
364       break;
365    case LLVMDoubleTypeKind:
366       snprintf(buf, bufsize, "f64");
367       break;
368    }
369 }
370 
371 /**
372  * Helper function that builds an LLVM IR PHI node and immediately adds
373  * incoming edges.
374  */
375 LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming,
376                           LLVMValueRef *values, LLVMBasicBlockRef *blocks)
377 {
378    LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
379    LLVMAddIncoming(phi, values, blocks, count_incoming);
380    return phi;
381 }
382 
383 void ac_build_s_barrier(struct ac_llvm_context *ctx, gl_shader_stage stage)
384 {
385    /* GFX6 only: s_barrier isn’t needed in TCS because an entire patch always fits into
386     * a single wave due to a bug workaround disallowing multi-wave HS workgroups.
387     */
388    if (ctx->gfx_level == GFX6 && stage == MESA_SHADER_TESS_CTRL)
389       return;
390 
391    ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, 0);
392 }
393 
394 /* Prevent optimizations (at least of memory accesses) across the current
395  * point in the program by emitting empty inline assembly that is marked as
396  * having side effects.
397  *
398  * Optionally, a value can be passed through the inline assembly to prevent
399  * LLVM from hoisting calls to ReadNone functions.
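 *
 * The assembly emitted below is only a uniquely numbered comment ("; <N>"),
 * so the barrier has no runtime cost of its own.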
400  */
401 void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pgpr, bool sgpr)
402 {
403    static int counter = 0;
404 
405    LLVMBuilderRef builder = ctx->builder;
406    char code[16];
407    const char *constraint = sgpr ? "=s,0" : "=v,0";
408 
409    snprintf(code, sizeof(code), "; %d", (int)p_atomic_inc_return(&counter));
410 
411    if (!pgpr) {
412       LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
413       LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
414       LLVMBuildCall2(builder, ftype, inlineasm, NULL, 0, "");
415    } else {
416       LLVMTypeRef old_type = LLVMTypeOf(*pgpr);
417 
418       if (old_type == ctx->i1)
419          *pgpr = LLVMBuildZExt(builder, *pgpr, ctx->i32, "");
420 
421       if (old_type == LLVMVectorType(ctx->i16, 3))
422          *pgpr = ac_build_expand_to_vec4(ctx, *pgpr, 4);
423 
424       LLVMTypeRef type = LLVMTypeOf(*pgpr);
425       LLVMTypeRef ftype = LLVMFunctionType(type, &type, 1, false);
426       LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);
427 
428       *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");
429 
430       if (old_type == ctx->i1)
431          *pgpr = LLVMBuildTrunc(builder, *pgpr, old_type, "");
432 
433       if (old_type == LLVMVectorType(ctx->i16, 3))
434          *pgpr = ac_extract_components(ctx, *pgpr, 0, 3);
435    }
436 }
437 
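/* Read a 64-bit shader clock (the realtime counter for SCOPE_DEVICE) and
 * return it bitcast to a v2i32. */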
438 LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, mesa_scope scope)
439 {
440    if (ctx->gfx_level >= GFX11 && scope == SCOPE_DEVICE) {
441       const char *name = "llvm.amdgcn.s.sendmsg.rtn.i64";
442       LLVMValueRef arg = LLVMConstInt(ctx->i32, 0x83 /* realtime */, 0);
443       LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, &arg, 1, 0);
444       return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
445    }
446 
447    const char *subgroup = "llvm.readcyclecounter";
448    const char *name = scope == SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : subgroup;
449 
450    LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0);
451    return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
452 }
453 
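/* Return a wave-wide bitmask with one bit per lane, set for each active lane
 * in which "value" is non-zero. */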
454 LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value)
455 {
456    const char *name;
457 
458    if (LLVMTypeOf(value) == ctx->i1)
459       value = LLVMBuildZExt(ctx->builder, value, ctx->i32, "");
460 
461    if (ctx->wave_size == 64)
462       name = "llvm.amdgcn.icmp.i64.i32";
463    else
464       name = "llvm.amdgcn.icmp.i32.i32";
465 
466    LLVMValueRef args[3] = {value, ctx->i32_0, LLVMConstInt(ctx->i32, LLVMIntNE, 0)};
467 
468    /* We currently have no other way to prevent LLVM from lifting the icmp
469     * calls to a dominating basic block.
470     */
471    ac_build_optimization_barrier(ctx, &args[0], false);
472 
473    args[0] = ac_to_integer(ctx, args[0]);
474 
475    return ac_build_intrinsic(ctx, name, ctx->iN_wavemask, args, 3, 0);
476 }
477 
478 LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
479 {
480    LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
481    LLVMValueRef vote_set = ac_build_ballot(ctx, value);
482    return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
483 }
484 
485 LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
486 {
487    LLVMValueRef vote_set = ac_build_ballot(ctx, value);
488    return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set, LLVMConstInt(ctx->iN_wavemask, 0, 0),
489                         "");
490 }
491 
492 LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
493                                             unsigned value_count, unsigned component)
494 {
495    LLVMValueRef vec = NULL;
496 
497    if (value_count == 1) {
498       return values[component];
499    } else if (!value_count)
500       unreachable("value_count is 0");
501 
502    for (unsigned i = component; i < value_count + component; i++) {
503       LLVMValueRef value = values[i];
504 
505       if (i == component)
506          vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
507       LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);
508       vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");
509    }
510    return vec;
511 }
512 
513 LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values,
514                                              unsigned value_count, unsigned value_stride,
515                                              bool always_vector)
516 {
517    LLVMBuilderRef builder = ctx->builder;
518    LLVMValueRef vec = NULL;
519    unsigned i;
520 
521    if (value_count == 1 && !always_vector) {
522       return values[0];
523    } else if (!value_count)
524       unreachable("value_count is 0");
525 
526    for (i = 0; i < value_count; i++) {
527       LLVMValueRef value = values[i * value_stride];
528 
529       if (!i)
530          vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
531       LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
532       vec = LLVMBuildInsertElement(builder, vec, value, index, "");
533    }
534    return vec;
535 }
536 
537 LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
538                                     unsigned value_count)
539 {
540    return ac_build_gather_values_extended(ctx, values, value_count, 1, false);
541 }
542 
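/* Concatenate the components of a and b into a single vector. If a is NULL,
 * b is returned unchanged. */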
543 LLVMValueRef ac_build_concat(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
544 {
545    if (!a)
546       return b;
547 
548    unsigned a_size = ac_get_llvm_num_components(a);
549    unsigned b_size = ac_get_llvm_num_components(b);
550 
551    LLVMValueRef *elems = alloca((a_size + b_size) * sizeof(LLVMValueRef));
552    for (unsigned i = 0; i < a_size; i++)
553       elems[i] = ac_llvm_extract_elem(ctx, a, i);
554    for (unsigned i = 0; i < b_size; i++)
555       elems[a_size + i] = ac_llvm_extract_elem(ctx, b, i);
556 
557    return ac_build_gather_values(ctx, elems, a_size + b_size);
558 }
559 
560 /* Expand a scalar or vector to <dst_channels x type> by filling the remaining
561  * channels with undef. Extract at most src_channels components from the input.
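 *
 * For example, expanding a v2f32 with src_channels = 2 and dst_channels = 4
 * yields a <4 x float> whose last two lanes are undef.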
562  */
563 LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,
564                              unsigned src_channels, unsigned dst_channels)
565 {
566    LLVMTypeRef elemtype;
567    LLVMValueRef *const chan = alloca(dst_channels * sizeof(LLVMValueRef));
568 
569    if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
570       unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
571 
572       if (src_channels == dst_channels && vec_size == dst_channels)
573          return value;
574 
575       src_channels = MIN2(src_channels, vec_size);
576 
577       for (unsigned i = 0; i < src_channels; i++)
578          chan[i] = ac_llvm_extract_elem(ctx, value, i);
579 
580       elemtype = LLVMGetElementType(LLVMTypeOf(value));
581    } else {
582       if (src_channels) {
583          assert(src_channels == 1);
584          chan[0] = value;
585       }
586       elemtype = LLVMTypeOf(value);
587    }
588 
589    for (unsigned i = src_channels; i < dst_channels; i++)
590       chan[i] = LLVMGetUndef(elemtype);
591 
592    return ac_build_gather_values(ctx, chan, dst_channels);
593 }
594 
595 /* Extract components [start, start + channels) from a vector.
596  */
597 LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start,
598                                    unsigned channels)
599 {
600    LLVMValueRef *const chan = alloca(channels * sizeof(LLVMValueRef));
601 
602    for (unsigned i = 0; i < channels; i++)
603       chan[i] = ac_llvm_extract_elem(ctx, value, i + start);
604 
605    return ac_build_gather_values(ctx, chan, channels);
606 }
607 
608 /* Expand a scalar or vector to <4 x type> by filling the remaining channels
609  * with undef. Extract at most num_channels components from the input.
610  */
611 LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value,
612                                      unsigned num_channels)
613 {
614    return ac_build_expand(ctx, value, num_channels, 4);
615 }
616 
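/* Approximate division: lower num / den to num * rcp(den) using the amdgcn
 * rcp intrinsics, so the result is not an IEEE-exact quotient. */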
617 LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den)
618 {
619    unsigned type_size = ac_get_type_size(LLVMTypeOf(den));
620    const char *name;
621 
622    if (type_size == 2)
623       name = "llvm.amdgcn.rcp.f16";
624    else if (type_size == 4)
625       name = "llvm.amdgcn.rcp.f32";
626    else
627       name = "llvm.amdgcn.rcp.f64";
628 
629    LLVMValueRef rcp =
630       ac_build_intrinsic(ctx, name, LLVMTypeOf(den), &den, 1, 0);
631 
632    return LLVMBuildFMul(ctx->builder, num, rcp, "");
633 }
634 
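/* Interpolate an attribute component at barycentric coordinates (i, j).
 * On GFX11+ the per-vertex parameter is loaded from LDS and interpolated with
 * the inreg p10/p2 intrinsics; older chips use interp.p1/p2. */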
635 LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
636                                 LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
637                                 LLVMValueRef j)
638 {
639    LLVMValueRef args[5];
640 
641    if (ctx->gfx_level >= GFX11) {
642       LLVMValueRef p;
643       LLVMValueRef p10;
644 
645       args[0] = llvm_chan;
646       args[1] = attr_number;
647       args[2] = params;
648 
649       p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
650                              ctx->f32, args, 3, 0);
651 
652       args[0] = p;
653       args[1] = i;
654       args[2] = p;
655 
656       p10 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p10",
657                                ctx->f32, args, 3, 0);
658 
659       args[0] = p;
660       args[1] = j;
661       args[2] = p10;
662 
663       return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p2",
664                                 ctx->f32, args, 3, 0);
665 
666    } else {
667       LLVMValueRef p1;
668 
669       args[0] = i;
670       args[1] = llvm_chan;
671       args[2] = attr_number;
672       args[3] = params;
673 
674       p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
675                               ctx->f32, args, 4, 0);
676 
677       args[0] = p1;
678       args[1] = j;
679       args[2] = llvm_chan;
680       args[3] = attr_number;
681       args[4] = params;
682 
683       return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
684                                 ctx->f32, args, 5, 0);
685    }
686 }
687 
688 LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
689                                     LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
690                                     LLVMValueRef j, bool high_16bits)
691 {
692    LLVMValueRef args[6];
693 
694    if (ctx->gfx_level >= GFX11) {
695       LLVMValueRef p;
696       LLVMValueRef p10;
697 
698       args[0] = llvm_chan;
699       args[1] = attr_number;
700       args[2] = params;
701 
702       p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
703                              ctx->f32, args, 3, 0);
704 
705       args[0] = p;
706       args[1] = i;
707       args[2] = p;
708       args[3] = high_16bits ? ctx->i1true : ctx->i1false;
709 
710       p10 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p10.f16",
711                                ctx->f32, args, 4, 0);
712 
713       args[0] = p;
714       args[1] = j;
715       args[2] = p10;
716       args[3] = high_16bits ? ctx->i1true : ctx->i1false;
717 
718       return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p2.f16",
719                                 ctx->f16, args, 4, 0);
720 
721    } else {
722       LLVMValueRef p1;
723 
724       args[0] = i;
725       args[1] = llvm_chan;
726       args[2] = attr_number;
727       args[3] = high_16bits ? ctx->i1true : ctx->i1false;
728       args[4] = params;
729 
730       p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", ctx->f32, args, 5,
731                               0);
732 
733       args[0] = p1;
734       args[1] = j;
735       args[2] = llvm_chan;
736       args[3] = attr_number;
737       args[4] = high_16bits ? ctx->i1true : ctx->i1false;
738       args[5] = params;
739 
740       return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", ctx->f16, args, 6,
741                                 0);
742    }
743 }
744 
745 LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, unsigned parameter,
746                                     LLVMValueRef llvm_chan, LLVMValueRef attr_number,
747                                     LLVMValueRef params)
748 {
749    LLVMValueRef args[4];
750 
751    if (ctx->gfx_level >= GFX11) {
752       LLVMValueRef p;
753 
754       args[0] = llvm_chan;
755       args[1] = attr_number;
756       args[2] = params;
757 
758       p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
759                              ctx->f32, args, 3, 0);
760       p = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32, &p, 1, 0);
761       p = ac_build_quad_swizzle(ctx, p, parameter, parameter, parameter, parameter, true);
762       return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32, &p, 1, 0);
763    } else {
764       args[0] = LLVMConstInt(ctx->i32, (parameter + 2) % 3, 0);
765       args[1] = llvm_chan;
766       args[2] = attr_number;
767       args[3] = params;
768 
769       return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", ctx->f32, args, 4, 0);
770    }
771 }
772 
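/* Return a pointer to element "index" of the array that ptr points to,
 * i.e. a GEP with a leading zero index. */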
773 LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index)
774 {
775    LLVMValueRef indices[2] = {
776       ctx->i32_0,
777       index,
778    };
779 
780    return LLVMBuildGEP2(ctx->builder, ptr.t, ptr.v, indices, 2, "");
781 }
782 
783 void ac_build_indexed_store(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index,
784                             LLVMValueRef value)
785 {
786    LLVMBuildStore(ctx->builder, value, ac_build_gep0(ctx, ptr, index));
787 }
788 
789 /**
790  * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
791  * It's equivalent to doing a load from &base_ptr[index].
792  *
793  * \param base_ptr  Where the array starts.
794  * \param index     The element index into the array.
795  * \param uniform   Whether the base_ptr and index can be assumed to be
796  *                  dynamically uniform (i.e. load to an SGPR)
797  * \param invariant Whether the load is invariant (no other opcodes affect it)
798  * \param no_unsigned_wraparound
799  *    For all possible re-associations and re-distributions of an expression
800  *    "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
801  *    without inbounds in base_ptr), this parameter is true if "addr + offset"
802  *    does not result in an unsigned integer wraparound. This is used for
803  *    optimal code generation of 32-bit pointer arithmetic.
804  *
805  *    For example, a 32-bit immediate offset that causes a 32-bit unsigned
806  *    integer wraparound can't be an imm offset in s_load_dword, because
807  *    the instruction performs "addr + offset" in 64 bits.
808  *
809  *    Expected usage for bindless textures by chaining GEPs:
810  *      // possible unsigned wraparound, don't use InBounds:
811  *      ptr1 = LLVMBuildGEP(base_ptr, index);
812  *      image = load(ptr1); // becomes "s_load ptr1, 0"
813  *
814  *      ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
815  *      sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
816  */
817 static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMTypeRef type,
818                                          LLVMValueRef base_ptr, LLVMValueRef index,
819                                          bool uniform, bool invariant, bool no_unsigned_wraparound)
820 {
821    LLVMValueRef pointer, result;
822 
823    if (no_unsigned_wraparound &&
824        LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
825       pointer = LLVMBuildInBoundsGEP2(ctx->builder, type, base_ptr, &index, 1, "");
826    else
827       pointer = LLVMBuildGEP2(ctx->builder, type, base_ptr, &index, 1, "");
828 
829    if (uniform)
830       LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
831    result = LLVMBuildLoad2(ctx->builder, type, pointer, "");
832    if (invariant)
833       LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
834    LLVMSetAlignment(result, 4);
835    return result;
836 }
837 
838 LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr,
839                                      LLVMValueRef index)
840 {
841    return ac_build_load_custom(ctx, ptr.t, ptr.v, index, false, true, false);
842 }
843 
844 /* This assumes that there is no unsigned integer wraparound during the address
845  * computation, excluding all GEPs within base_ptr. */
846 LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr,
847                                    LLVMValueRef index)
848 {
849    return ac_build_load_custom(ctx, ptr.t, ptr.v, index, true, true, true);
850 }
851 
852 static unsigned get_cache_flags(struct ac_llvm_context *ctx, enum gl_access_qualifier access)
853 {
854    return ac_get_hw_cache_flags(ctx->gfx_level, access).value;
855 }
856 
857 static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
858                                          LLVMValueRef data, LLVMValueRef vindex,
859                                          LLVMValueRef voffset, LLVMValueRef soffset,
860                                          enum gl_access_qualifier access, bool use_format)
861 {
862    LLVMValueRef args[6];
863    int idx = 0;
864    args[idx++] = data;
865    args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
866    if (vindex)
867       args[idx++] = vindex ? vindex : ctx->i32_0;
868    args[idx++] = voffset ? voffset : ctx->i32_0;
869    args[idx++] = soffset ? soffset : ctx->i32_0;
870    args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_STORE), 0);
871    const char *indexing_kind = vindex ? "struct" : "raw";
872    char name[256], type_name[8];
873 
874    ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name));
875 
876    if (use_format) {
877       snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", indexing_kind,
878                type_name);
879    } else {
880       snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", indexing_kind, type_name);
881    }
882 
883    ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, 0);
884 }
885 
886 void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
887                                   LLVMValueRef vindex, LLVMValueRef voffset, enum gl_access_qualifier access)
888 {
889    ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, NULL, access, true);
890 }
891 
892 /* buffer_store_dword(,x2,x3,x4) <- the suffix is selected by the type of vdata. */
893 void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
894                                  LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
895                                  enum gl_access_qualifier access)
896 {
897    unsigned num_channels = ac_get_llvm_num_components(vdata);
898 
899    /* Split 3 channel stores if unsupported. */
900    if (num_channels == 3 && !ac_has_vec3_support(ctx->gfx_level, false)) {
901       LLVMValueRef v[3], v01, voffset2;
902 
903       for (int i = 0; i < 3; i++) {
904          v[i] = LLVMBuildExtractElement(ctx->builder, vdata, LLVMConstInt(ctx->i32, i, 0), "");
905       }
906       v01 = ac_build_gather_values(ctx, v, 2);
907 
908       voffset2 = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0,
909                               LLVMConstInt(ctx->i32, 8, 0), "");
910 
911       ac_build_buffer_store_dword(ctx, rsrc, v01, vindex, voffset, soffset, access);
912       ac_build_buffer_store_dword(ctx, rsrc, v[2], vindex, voffset2, soffset, access);
913       return;
914    }
915 
916    ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), vindex, voffset, soffset,
917                                 access, false);
918 }
919 
920 static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
921                                                 LLVMValueRef vindex, LLVMValueRef voffset,
922                                                 LLVMValueRef soffset, unsigned num_channels,
923                                                 LLVMTypeRef channel_type, enum gl_access_qualifier access,
924                                                 bool can_speculate, bool use_format)
925 {
926    LLVMValueRef args[5];
927    int idx = 0;
928    args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
929    if (vindex)
930       args[idx++] = vindex;
931    args[idx++] = voffset ? voffset : ctx->i32_0;
932    args[idx++] = soffset ? soffset : ctx->i32_0;
933    args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD), 0);
934    unsigned func =
935       !ac_has_vec3_support(ctx->gfx_level, use_format) && num_channels == 3 ? 4 : num_channels;
936    const char *indexing_kind = vindex ? "struct" : "raw";
937    char name[256], type_name[8];
938 
939    /* D16 is only supported on gfx8+ */
940    assert(!use_format || (channel_type != ctx->f16 && channel_type != ctx->i16) ||
941           ctx->gfx_level >= GFX8);
942 
943    LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
944    ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
945 
946    if (use_format) {
947       snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", indexing_kind,
948                type_name);
949    } else {
950       snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", indexing_kind, type_name);
951    }
952 
953    LLVMValueRef result = ac_build_intrinsic(ctx, name, type, args, idx,
954                                             can_speculate ? AC_ATTR_INVARIANT_LOAD : 0);
955    if (func > num_channels)
956       result = ac_trim_vector(ctx, result, num_channels);
957    return result;
958 }
959 
960 LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
961                                   LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
962                                   LLVMTypeRef channel_type, enum gl_access_qualifier access,
963                                   bool can_speculate, bool allow_smem)
964 {
965    if (allow_smem && (!(access & ACCESS_COHERENT) || ctx->gfx_level >= GFX8)) {
966       assert(vindex == NULL);
967 
968       LLVMValueRef result[32];
969 
970       LLVMValueRef offset = voffset ? voffset : ctx->i32_0;
971       if (soffset)
972          offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");
973 
974       char name[256], type_name[8];
975       ac_build_type_name_for_intr(channel_type, type_name, sizeof(type_name));
976       snprintf(name, sizeof(name), "llvm.amdgcn.s.buffer.load.%s", type_name);
977 
978       LLVMValueRef channel_size = LLVMConstInt(ctx->i32, ac_get_type_size(channel_type), 0);
979 
980       for (int i = 0; i < num_channels; i++) {
981          if (i) {
982             offset = LLVMBuildAdd(ctx->builder, offset, channel_size, "");
983          }
984          LLVMValueRef args[3] = {
985             rsrc,
986             offset,
987             LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD |
988                                                         ACCESS_TYPE_SMEM), 0),
989          };
990          result[i] = ac_build_intrinsic(ctx, name, channel_type, args, 3, AC_ATTR_INVARIANT_LOAD);
991       }
992       if (num_channels == 1)
993          return result[0];
994 
995       return ac_build_gather_values(ctx, result, num_channels);
996    }
997 
998    /* LLVM is unable to select instructions for num_channels > 4, so we
999     * work around that by manually splitting larger buffer loads.
1000     */
1001    LLVMValueRef result = NULL;
1002    for (unsigned i = 0, fetch_num_channels; i < num_channels; i += fetch_num_channels) {
1003       fetch_num_channels = MIN2(4, num_channels - i);
1004       LLVMValueRef fetch_voffset =
1005             LLVMBuildAdd(ctx->builder, voffset,
1006                          LLVMConstInt(ctx->i32, i * ac_get_type_size(channel_type), 0), "");
1007       LLVMValueRef item =
1008          ac_build_buffer_load_common(ctx, rsrc, vindex, fetch_voffset, soffset, fetch_num_channels,
1009                                      channel_type, access, can_speculate, false);
1010       result = ac_build_concat(ctx, result, item);
1011    }
1012 
1013    return result;
1014 }
1015 
1016 LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1017                                          LLVMValueRef vindex, LLVMValueRef voffset,
1018                                          unsigned num_channels, enum gl_access_qualifier access,
1019                                          bool can_speculate, bool d16, bool tfe)
1020 {
1021    if (tfe) {
1022       assert(!d16);
1023 
1024       union ac_hw_cache_flags cache_flags =
1025          ac_get_hw_cache_flags(ctx->gfx_level, access | ACCESS_TYPE_LOAD);
1026       char code[1024];
1027 
1028       /* The definition in the assembly and the one in the constraint string
1029        * differ because of an assembler bug.
1030        */
1031       if (ctx->gfx_level >= GFX12) {
1032          const char *scope = "";
1033          const char *temporal_hint = "";
1034 
1035          if (cache_flags.gfx12.scope == gfx12_scope_se)
1036             scope = "scope:SCOPE_SE";
1037          else if (cache_flags.gfx12.scope == gfx12_scope_device)
1038             scope = "scope:SCOPE_DEV";
1039          else if (cache_flags.gfx12.scope == gfx12_scope_memory)
1040             scope = "scope:SCOPE_SYS";
1041 
1042          if (cache_flags.gfx12.temporal_hint == gfx12_load_non_temporal)
1043             temporal_hint = "th:TH_LOAD_NT";
1044          else if (cache_flags.gfx12.temporal_hint == gfx12_load_high_temporal)
1045             temporal_hint = "th:TH_LOAD_HT";
1046          else if (cache_flags.gfx12.temporal_hint == gfx12_load_last_use_discard)
1047             temporal_hint = "th:TH_LOAD_LU";
1048          else if (cache_flags.gfx12.temporal_hint == gfx12_load_near_non_temporal_far_regular_temporal)
1049             temporal_hint = "th:TH_LOAD_NT_RT";
1050          else if (cache_flags.gfx12.temporal_hint == gfx12_load_near_regular_temporal_far_non_temporal)
1051             temporal_hint = "th:TH_LOAD_RT_NT";
1052          else if (cache_flags.gfx12.temporal_hint == gfx12_load_near_non_temporal_far_high_temporal)
1053             temporal_hint = "th:TH_LOAD_NT_HT";
1054 
1055          snprintf(code, sizeof(code),
1056                   "v_mov_b32 v0, 0\n"
1057                   "v_mov_b32 v1, 0\n"
1058                   "v_mov_b32 v2, 0\n"
1059                   "v_mov_b32 v3, 0\n"
1060                   "v_mov_b32 v4, 0\n"
1061                   "buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe\n"
1062                   "s_waitcnt vmcnt(0)",
1063                   temporal_hint, scope);
1064       } else {
1065          snprintf(code, sizeof(code),
1066                   "v_mov_b32 v0, 0\n"
1067                   "v_mov_b32 v1, 0\n"
1068                   "v_mov_b32 v2, 0\n"
1069                   "v_mov_b32 v3, 0\n"
1070                   "v_mov_b32 v4, 0\n"
1071                   "buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe %s\n"
1072                   "s_waitcnt vmcnt(0)",
1073                   cache_flags.value & ac_glc ? "glc" : "",
1074                   cache_flags.value & ac_slc ? "slc" : "",
1075                   cache_flags.value & ac_dlc ? "dlc" : "");
1076       }
1077 
1078       LLVMTypeRef param_types[] = {ctx->v2i32, ctx->v4i32};
1079       LLVMTypeRef calltype = LLVMFunctionType(LLVMVectorType(ctx->f32, 5), param_types, 2, false);
1080       LLVMValueRef inlineasm = LLVMConstInlineAsm(calltype, code, "=&{v[0:4]},v,s", false, false);
1081 
1082       LLVMValueRef addr_comp[2] = {vindex ? vindex : ctx->i32_0,
1083                                    voffset ? voffset : ctx->i32_0};
1084 
1085       LLVMValueRef args[] = {ac_build_gather_values(ctx, addr_comp, 2),
1086                              LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "")};
1087       LLVMValueRef res = LLVMBuildCall2(ctx->builder, calltype, inlineasm, args, 2, "");
1088 
1089       return ac_build_concat(ctx, ac_trim_vector(ctx, res, num_channels),
1090                              ac_llvm_extract_elem(ctx, res, 4));
1091    }
1092 
1093    return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
1094                                       num_channels, d16 ? ctx->f16 : ctx->f32, access,
1095                                       can_speculate, true);
1096 }
1097 
1098 static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1099                                           LLVMValueRef vindex, LLVMValueRef voffset,
1100                                           LLVMValueRef soffset, unsigned num_channels,
1101                                           unsigned tbuffer_format, LLVMTypeRef channel_type,
1102                                           enum gl_access_qualifier access, bool can_speculate)
1103 {
1104    LLVMValueRef args[6];
1105    int idx = 0;
1106    args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1107    if (vindex)
1108       args[idx++] = vindex;
1109    args[idx++] = voffset ? voffset : ctx->i32_0;
1110    args[idx++] = soffset ? soffset : ctx->i32_0;
1111    args[idx++] = LLVMConstInt(ctx->i32, tbuffer_format, 0);
1112    args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD), 0);
1113    const char *indexing_kind = vindex ? "struct" : "raw";
1114    char name[256], type_name[8];
1115 
1116    LLVMTypeRef type = num_channels > 1 ? LLVMVectorType(channel_type, num_channels) : channel_type;
1117    ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1118 
1119    snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", indexing_kind, type_name);
1120 
1121    return ac_build_intrinsic(ctx, name, type, args, idx,
1122                              can_speculate ? AC_ATTR_INVARIANT_LOAD : 0);
1123 }
1124 
1125 LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1126                                         LLVMValueRef vidx, LLVMValueRef base_voffset,
1127                                         LLVMValueRef soffset,
1128                                         const enum pipe_format format,
1129                                         unsigned channel_bit_size,
1130                                         unsigned const_offset,
1131                                         unsigned align_offset,
1132                                         unsigned align_mul,
1133                                         unsigned num_channels,
1134                                         enum gl_access_qualifier access,
1135                                         bool can_speculate)
1136 {
1137    const struct ac_vtx_format_info *vtx_info = ac_get_vtx_format_info(ctx->gfx_level, ctx->info->family, format);
1138    const unsigned max_channels = vtx_info->num_channels;
1139    LLVMValueRef voffset_plus_const =
1140       LLVMBuildAdd(ctx->builder, base_voffset, LLVMConstInt(ctx->i32, const_offset, 0), "");
1141 
1142    /* Split the specified load into several MTBUF instructions,
1143     * according to a safe fetch size determined by alignment information.
1144     */
1145    LLVMValueRef result = NULL;
1146    for (unsigned i = 0, fetch_num_channels; i < num_channels; i += fetch_num_channels) {
1147       /* Packed formats (determined here by chan_byte_size == 0) should never be split. */
1148       assert(i == 0 || vtx_info->chan_byte_size);
1149 
1150       const unsigned fetch_const_offset = const_offset + i * vtx_info->chan_byte_size;
1151       const unsigned fetch_align_offset = (align_offset + i * vtx_info->chan_byte_size) % align_mul;
1152       const unsigned fetch_alignment = fetch_align_offset ? 1 << (ffs(fetch_align_offset) - 1) : align_mul;
1153 
1154       fetch_num_channels =
1155          ac_get_safe_fetch_size(ctx->gfx_level, vtx_info, fetch_const_offset,
1156                                 max_channels - i, fetch_alignment, num_channels - i);
1157       const unsigned fetch_format = vtx_info->hw_format[fetch_num_channels - 1];
1158       LLVMValueRef fetch_voffset =
1159             LLVMBuildAdd(ctx->builder, voffset_plus_const,
1160                          LLVMConstInt(ctx->i32, i * vtx_info->chan_byte_size, 0), "");
1161       LLVMValueRef item =
1162          ac_build_tbuffer_load(ctx, rsrc, vidx, fetch_voffset, soffset,
1163                                fetch_num_channels, fetch_format, ctx->i32,
1164                                access, can_speculate);
1165       result = ac_build_concat(ctx, result, item);
1166    }
1167 
1168    /*
1169     * LLVM is not able to select 16-bit typed loads. Load 32-bit values instead and
1170     * manually truncate them to the required size.
1171     * TODO: Do this in NIR instead.
1172     */
1173    const struct util_format_description *desc = util_format_description(format);
1174    bool is_float = !desc->channel[0].pure_integer;
1175 
1176    if (channel_bit_size == 16) {
1177       LLVMValueRef channels[4];
1178       for (unsigned i = 0; i < num_channels; i++) {
1179          LLVMValueRef channel = result;
1180          if (num_channels > 1)
1181             channel = LLVMBuildExtractElement(ctx->builder, result, LLVMConstInt(ctx->i32, i, false), "");
1182 
1183          if (is_float) {
1184             channel = LLVMBuildBitCast(ctx->builder, channel, ctx->f32, "");
1185             channel = LLVMBuildFPTrunc(ctx->builder, channel, ctx->f16, "");
1186             channel = LLVMBuildBitCast(ctx->builder, channel, ctx->i16, "");
1187          } else {
1188             channel = LLVMBuildTrunc(ctx->builder, channel, ctx->i16, "");
1189          }
1190          channels[i] = channel;
1191       }
1192       result = ac_build_gather_values(ctx, channels, num_channels);
1193    }
1194 
1195    return result;
1196 }
1197 
1198 
1199 LLVMValueRef ac_build_buffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1200                                         LLVMValueRef voffset, LLVMValueRef soffset,
1201                                         enum gl_access_qualifier access)
1202 {
1203    return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i16,
1204                                       access, false, false);
1205 }
1206 
1207 LLVMValueRef ac_build_buffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1208                                        LLVMValueRef voffset, LLVMValueRef soffset,
1209                                        enum gl_access_qualifier access)
1210 {
1211    return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i8, access,
1212                                       false, false);
1213 }
1214 
1215 void ac_build_buffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1216                                  LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
1217                                  enum gl_access_qualifier access)
1218 {
1219    vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
1220 
1221    ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, access, false);
1222 }
1223 
1224 void ac_build_buffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
1225                                 LLVMValueRef voffset, LLVMValueRef soffset, enum gl_access_qualifier access)
1226 {
1227    vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
1228 
1229    ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, access, false);
1230 }
1231 
1232 /**
1233  * Set range metadata on an instruction.  This can only be used on load and
1234  * call instructions.  If you know an instruction can only produce the values
1235  * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1236  * \p lo is the minimum value inclusive.
1237  * \p hi is the maximum value exclusive.
1238  */
1239 void ac_set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,
1240                            unsigned hi)
1241 {
1242    LLVMValueRef range_md, md_args[2];
1243    LLVMTypeRef type = LLVMTypeOf(value);
1244    LLVMContextRef context = LLVMGetTypeContext(type);
1245 
1246    md_args[0] = LLVMConstInt(type, lo, false);
1247    md_args[1] = LLVMConstInt(type, hi, false);
1248    range_md = LLVMMDNodeInContext(context, md_args, 2);
1249    LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1250 }
1251 
1252 LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx)
1253 {
1254    return ac_build_mbcnt(ctx, LLVMConstInt(ctx->iN_wavemask, ~0ull, 0));
1255 }
1256 
1257 /*
1258  * AMD GCN implements derivatives using the local data store (LDS).
1259  * All writes to the LDS happen in all executing threads at
1260  * the same time. TID is the Thread ID for the current
1261  * thread and is a value between 0 and 63, representing
1262  * the thread's position in the wavefront.
1263  *
1264  * For the pixel shader, threads are grouped into quads of four pixels.
1265  * The TIDs of the pixels of a quad are:
1266  *
1267  *  +------+------+
1268  *  |4n + 0|4n + 1|
1269  *  +------+------+
1270  *  |4n + 2|4n + 3|
1271  *  +------+------+
1272  *
1273  * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
1274  * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
1275  * the current pixel's column, and masking with 0xfffffffe yields the TID
1276  * of the left pixel of the current pixel's row.
1277  *
1278  * Adding 1 yields the TID of the pixel to the right of the left pixel, and
1279  * adding 2 yields the TID of the pixel below the top pixel.
1280  */
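/*
 * A minimal usage sketch (an assumption based on the quad layout above, not
 * taken from this file): coarse derivatives of a 32-bit value could be built
 * with
 *
 *    LLVMValueRef ddx = ac_build_ddxy(ctx, 0xfffffffe, 1, val);
 *    LLVMValueRef ddy = ac_build_ddxy(ctx, 0xfffffffd, 2, val);
 *
 * i.e. "right minus left" within the quad row and "bottom minus top" within
 * the quad column.
 */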
1281 LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int idx, LLVMValueRef val)
1282 {
1283    unsigned tl_lanes[4], trbl_lanes[4];
1284    char name[32], type[8];
1285    LLVMValueRef tl, trbl;
1286    LLVMTypeRef result_type;
1287    LLVMValueRef result;
1288 
1289    result_type = ac_to_float_type(ctx, LLVMTypeOf(val));
1290 
1291    if (result_type == ctx->f16)
1292       val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
1293    else if (result_type == ctx->v2f16)
1294       val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, "");
1295 
1296    for (unsigned i = 0; i < 4; ++i) {
1297       tl_lanes[i] = i & mask;
1298       trbl_lanes[i] = (i & mask) + idx;
1299    }
1300 
1301    tl = ac_build_quad_swizzle(ctx, val, tl_lanes[0], tl_lanes[1], tl_lanes[2], tl_lanes[3], false);
1302    trbl =
1303       ac_build_quad_swizzle(ctx, val, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3], false);
1304 
1305    if (result_type == ctx->f16) {
1306       tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
1307       trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
1308    }
1309 
1310    tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
1311    trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
1312    result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
1313 
1314    ac_build_type_name_for_intr(result_type, type, sizeof(type));
1315    snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);
1316 
1317    return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
1318 }
1319 
1320 void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t imm, LLVMValueRef m0_content)
1321 {
1322    LLVMValueRef args[2];
1323    args[0] = LLVMConstInt(ctx->i32, imm, false);
1324    args[1] = m0_content;
1325    ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
1326 }
1327 
1328 LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
1329 {
1330    LLVMValueRef msb =
1331       ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, 0);
1332 
1333    /* The HW returns the last bit index from MSB, but NIR/TGSI wants
1334     * the index from LSB. Invert it by doing "31 - msb". */
1335    msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), msb, "");
1336 
1337    LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
1338    LLVMValueRef cond =
1339       LLVMBuildOr(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, ctx->i32_0, ""),
1340                   LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, all_ones, ""), "");
1341 
1342    return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");
1343 }
1344 
1345 LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type,
1346                            bool rev)
1347 {
1348    const char *intrin_name;
1349    LLVMTypeRef type;
1350    LLVMValueRef highest_bit;
1351    LLVMValueRef zero;
1352    unsigned bitsize;
1353 
1354    bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
1355    switch (bitsize) {
1356    case 64:
1357       intrin_name = "llvm.ctlz.i64";
1358       type = ctx->i64;
1359       highest_bit = LLVMConstInt(ctx->i64, 63, false);
1360       zero = ctx->i64_0;
1361       break;
1362    case 32:
1363       intrin_name = "llvm.ctlz.i32";
1364       type = ctx->i32;
1365       highest_bit = LLVMConstInt(ctx->i32, 31, false);
1366       zero = ctx->i32_0;
1367       break;
1368    case 16:
1369       intrin_name = "llvm.ctlz.i16";
1370       type = ctx->i16;
1371       highest_bit = LLVMConstInt(ctx->i16, 15, false);
1372       zero = ctx->i16_0;
1373       break;
1374    case 8:
1375       intrin_name = "llvm.ctlz.i8";
1376       type = ctx->i8;
1377       highest_bit = LLVMConstInt(ctx->i8, 7, false);
1378       zero = ctx->i8_0;
1379       break;
1380    default:
1381       unreachable("invalid bitsize");
1382       break;
1383    }
1384 
1385    LLVMValueRef params[2] = {
1386       arg,
1387       ctx->i1true,
1388    };
1389 
1390    LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, 0);
1391 
1392    if (!rev) {
1393       /* The HW returns the last bit index from MSB, but TGSI/NIR wants
1394        * the index from LSB. Invert it by doing "31 - msb". */
1395       msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
1396    }
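   /* Worked example (illustrative): for a 32-bit arg = 0x10, llvm.ctlz.i32
    * returns 27 leading zeros, and 31 - 27 = 4 is the MSB index counted from
    * the LSB, matching findMSB() semantics.
    */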
1397 
1398    if (bitsize == 64) {
1399       msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
1400    } else if (bitsize < 32) {
1401       msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");
1402    }
1403 
1404    /* check for zero */
1405    return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
1406                           LLVMConstInt(ctx->i32, -1, true), msb, "");
1407 }
1408 
1409 LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1410 {
1411    char name[64], type[64];
1412 
1413    ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));
1414    snprintf(name, sizeof(name), "llvm.minnum.%s", type);
1415    LLVMValueRef args[2] = {a, b};
1416    return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, 0);
1417 }
1418 
1419 LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1420 {
1421    char name[64], type[64];
1422 
1423    ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));
1424    snprintf(name, sizeof(name), "llvm.maxnum.%s", type);
1425    LLVMValueRef args[2] = {a, b};
1426    return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, 0);
1427 }
1428 
1429 LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1430 {
1431    LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
1432    return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1433 }
1434 
1435 LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1436 {
1437    LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
1438    return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1439 }
1440 
1441 LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1442 {
1443    LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
1444    return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1445 }
1446 
1447 LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1448 {
1449    LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");
1450    return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1451 }
1452 
1453 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
1454 {
1455    LLVMTypeRef t = LLVMTypeOf(value);
1456    return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)),
1457                         LLVMConstReal(t, 1.0));
1458 }
1459 
1460 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
1461 {
1462    LLVMValueRef args[9];
1463 
1464    args[0] = LLVMConstInt(ctx->i32, a->target, 0);
1465    args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
1466 
1467    if (a->compr) {
1468       assert(ctx->gfx_level < GFX11);
1469 
1470       args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->v2i16, "");
1471       args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->v2i16, "");
1472       args[4] = LLVMConstInt(ctx->i1, a->done, 0);
1473       args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1474 
1475       ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", ctx->voidt, args, 6, 0);
1476    } else {
1477       args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->f32, "");
1478       args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->f32, "");
1479       args[4] = LLVMBuildBitCast(ctx->builder, a->out[2], ctx->f32, "");
1480       args[5] = LLVMBuildBitCast(ctx->builder, a->out[3], ctx->f32, "");
1481       args[6] = LLVMConstInt(ctx->i1, a->done, 0);
1482       args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1483 
1484       ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", ctx->voidt, args, 8, 0);
1485    }
1486 }
1487 
1488 void ac_build_export_null(struct ac_llvm_context *ctx, bool uses_discard)
1489 {
1490    struct ac_export_args args;
1491 
1492    /* Gfx10+ doesn't need to export anything if we don't need to export the EXEC mask
1493     * for discard.
1494     */
1495    if (ctx->gfx_level >= GFX10 && !uses_discard)
1496       return;
1497 
1498    args.enabled_channels = 0x0; /* enabled channels */
1499    args.valid_mask = 1;         /* whether the EXEC mask is valid */
1500    args.done = 1;               /* DONE bit */
1501    /* Gfx11 doesn't support null exports, and mrt0 should be exported instead. */
1502    args.target = ctx->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
1503    args.compr = 0;                       /* COMPR flag (0 = 32-bit export) */
1504    args.out[0] = LLVMGetUndef(ctx->f32); /* R */
1505    args.out[1] = LLVMGetUndef(ctx->f32); /* G */
1506    args.out[2] = LLVMGetUndef(ctx->f32); /* B */
1507    args.out[3] = LLVMGetUndef(ctx->f32); /* A */
1508 
1509    ac_build_export(ctx, &args);
1510 }
1511 
1512 static unsigned ac_num_coords(enum ac_image_dim dim)
1513 {
1514    switch (dim) {
1515    case ac_image_1d:
1516       return 1;
1517    case ac_image_2d:
1518    case ac_image_1darray:
1519       return 2;
1520    case ac_image_3d:
1521    case ac_image_cube:
1522    case ac_image_2darray:
1523    case ac_image_2dmsaa:
1524       return 3;
1525    case ac_image_2darraymsaa:
1526       return 4;
1527    default:
1528       unreachable("ac_num_coords: bad dim");
1529    }
1530 }
1531 
1532 static unsigned ac_num_derivs(enum ac_image_dim dim)
1533 {
1534    switch (dim) {
1535    case ac_image_1d:
1536    case ac_image_1darray:
1537       return 2;
1538    case ac_image_2d:
1539    case ac_image_2darray:
1540    case ac_image_cube:
1541       return 4;
1542    case ac_image_3d:
1543       return 6;
1544    case ac_image_2dmsaa:
1545    case ac_image_2darraymsaa:
1546    default:
1547       unreachable("derivatives not supported");
1548    }
1549 }
1550 
1551 static const char *get_atomic_name(enum ac_atomic_op op)
1552 {
1553    switch (op) {
1554    case ac_atomic_swap:
1555       return "swap";
1556    case ac_atomic_add:
1557       return "add";
1558    case ac_atomic_sub:
1559       return "sub";
1560    case ac_atomic_smin:
1561       return "smin";
1562    case ac_atomic_umin:
1563       return "umin";
1564    case ac_atomic_smax:
1565       return "smax";
1566    case ac_atomic_umax:
1567       return "umax";
1568    case ac_atomic_and:
1569       return "and";
1570    case ac_atomic_or:
1571       return "or";
1572    case ac_atomic_xor:
1573       return "xor";
1574    case ac_atomic_inc_wrap:
1575       return "inc";
1576    case ac_atomic_dec_wrap:
1577       return "dec";
1578    case ac_atomic_fmin:
1579       return "fmin";
1580    case ac_atomic_fmax:
1581       return "fmax";
1582    }
1583    unreachable("bad atomic op");
1584 }
1585 
1586 LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a)
1587 {
1588    const char *overload[3] = {"", "", ""};
1589    unsigned num_overloads = 0;
1590    LLVMValueRef args[18];
1591    unsigned num_args = 0;
1592    enum ac_image_dim dim = a->dim;
1593 
1594    assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || !a->level_zero);
1595    assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
1596            a->opcode != ac_image_store_mip) ||
1597           a->lod);
1598    assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
1599           (!a->compare && !a->offset));
1600    assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
1601            a->opcode == ac_image_get_lod) ||
1602           !a->bias);
1603    assert((a->bias ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <=
1604           1);
1605    assert((a->min_lod ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) <= 1);
1606    assert(!a->d16 || (ctx->gfx_level >= GFX8 && a->opcode != ac_image_atomic &&
1607                       a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod &&
1608                       a->opcode != ac_image_get_resinfo));
1609    assert(!a->a16 || ctx->gfx_level >= GFX9);
1610    assert(!a->derivs[0] || a->g16 == a->a16 || ctx->gfx_level >= GFX10);
1611 
1612    assert(!a->offset ||
1613           ac_get_elem_bits(ctx, LLVMTypeOf(a->offset)) == 32);
1614    assert(!a->bias ||
1615           ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == (a->a16 ? 16 : 32));
1616    assert(!a->compare ||
1617           ac_get_elem_bits(ctx, LLVMTypeOf(a->compare)) == 32);
1618    assert(!a->derivs[0] ||
1619           ((!a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 16) &&
1620            (a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 32)));
1621    assert(!a->coords[0] ||
1622           ((!a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 16) &&
1623            (a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 32)));
1624    assert(!a->lod ||
1625           ((a->opcode != ac_image_get_resinfo || ac_get_elem_bits(ctx, LLVMTypeOf(a->lod))) &&
1626            (a->opcode == ac_image_get_resinfo ||
1627             ac_get_elem_bits(ctx, LLVMTypeOf(a->lod)) ==
1628             ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])))));
1629    assert(!a->min_lod ||
1630           ac_get_elem_bits(ctx, LLVMTypeOf(a->min_lod)) ==
1631           ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])));
1632 
1633    if (a->opcode == ac_image_get_lod) {
1634       switch (dim) {
1635       case ac_image_1darray:
1636          dim = ac_image_1d;
1637          break;
1638       case ac_image_2darray:
1639       case ac_image_cube:
1640          dim = ac_image_2d;
1641          break;
1642       default:
1643          break;
1644       }
1645    }
1646 
1647    bool sample = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
1648                  a->opcode == ac_image_get_lod;
1649    bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap;
1650    bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
1651                a->opcode == ac_image_load || a->opcode == ac_image_load_mip;
1652    LLVMTypeRef coord_type = sample ? (a->a16 ? ctx->f16 : ctx->f32) : (a->a16 ? ctx->i16 : ctx->i32);
1653    uint8_t dmask = a->dmask;
1654    LLVMTypeRef data_type;
1655    char data_type_str[32];
1656 
1657    if (atomic) {
1658       data_type = LLVMTypeOf(a->data[0]);
1659    } else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
1660       /* Image stores might have been shrunk using the format. */
1661       data_type = LLVMTypeOf(a->data[0]);
1662       dmask = (1 << ac_get_llvm_num_components(a->data[0])) - 1;
1663    } else {
1664       data_type = a->d16 ? ctx->v4f16 : ctx->v4f32;
1665    }
1666 
1667    if (a->tfe) {
1668       data_type = LLVMStructTypeInContext(
1669          ctx->context, (LLVMTypeRef[]){data_type, ctx->i32}, 2, false);
1670    }
1671 
1672    if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
1673       args[num_args++] = a->data[0];
1674       if (a->opcode == ac_image_atomic_cmpswap)
1675          args[num_args++] = a->data[1];
1676    }
1677 
1678    if (!atomic)
1679       args[num_args++] = LLVMConstInt(ctx->i32, dmask, false);
1680 
1681    if (a->offset)
1682       args[num_args++] = ac_to_integer(ctx, a->offset);
1683    if (a->bias) {
1684       args[num_args++] = ac_to_float(ctx, a->bias);
1685       overload[num_overloads++] = ".f32";
1686    }
1687    if (a->compare)
1688       args[num_args++] = ac_to_float(ctx, a->compare);
1689    if (a->derivs[0]) {
1690       unsigned count = ac_num_derivs(dim);
1691       for (unsigned i = 0; i < count; ++i)
1692          args[num_args++] = ac_to_float(ctx, a->derivs[i]);
1693       overload[num_overloads++] = a->g16 ? ".f16" : ".f32";
1694    }
1695    unsigned num_coords = a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
1696    for (unsigned i = 0; i < num_coords; ++i)
1697       args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
1698    if (a->lod)
1699       args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
1700    if (a->min_lod)
1701       args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, "");
1702 
1703    overload[num_overloads++] = sample ? (a->a16 ? ".f16" : ".f32") : (a->a16 ? ".i16" : ".i32");
1704 
1705    args[num_args++] = a->resource;
1706    if (sample) {
1707       args[num_args++] = a->sampler;
1708       args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
1709    }
1710 
1711    args[num_args++] = a->tfe ? ctx->i32_1 : ctx->i32_0; /* texfailctrl */
1712    args[num_args++] = LLVMConstInt(
1713       ctx->i32, get_cache_flags(ctx,
1714                                 a->access |
1715                                 (atomic ? ACCESS_TYPE_ATOMIC :
1716                                  load ? ACCESS_TYPE_LOAD : ACCESS_TYPE_STORE)),
1717       false);
1718 
1719    const char *name;
1720    const char *atomic_subop = "";
1721    switch (a->opcode) {
1722    case ac_image_sample:
1723       name = "sample";
1724       break;
1725    case ac_image_gather4:
1726       name = "gather4";
1727       break;
1728    case ac_image_load:
1729       name = "load";
1730       break;
1731    case ac_image_load_mip:
1732       name = "load.mip";
1733       break;
1734    case ac_image_store:
1735       name = "store";
1736       break;
1737    case ac_image_store_mip:
1738       name = "store.mip";
1739       break;
1740    case ac_image_atomic:
1741       name = "atomic.";
1742       atomic_subop = get_atomic_name(a->atomic);
1743       break;
1744    case ac_image_atomic_cmpswap:
1745       name = "atomic.";
1746       atomic_subop = "cmpswap";
1747       break;
1748    case ac_image_get_lod:
1749       name = "getlod";
1750       break;
1751    case ac_image_get_resinfo:
1752       name = "getresinfo";
1753       break;
1754    default:
1755       unreachable("invalid image opcode");
1756    }
1757 
1758    const char *dimname;
1759    switch (dim) {
1760    case ac_image_1d:
1761       dimname = "1d";
1762       break;
1763    case ac_image_2d:
1764       dimname = "2d";
1765       break;
1766    case ac_image_3d:
1767       dimname = "3d";
1768       break;
1769    case ac_image_cube:
1770       dimname = "cube";
1771       break;
1772    case ac_image_1darray:
1773       dimname = "1darray";
1774       break;
1775    case ac_image_2darray:
1776       dimname = "2darray";
1777       break;
1778    case ac_image_2dmsaa:
1779       dimname = "2dmsaa";
1780       break;
1781    case ac_image_2darraymsaa:
1782       dimname = "2darraymsaa";
1783       break;
1784    default:
1785       unreachable("invalid dim");
1786    }
1787 
1788    ac_build_type_name_for_intr(data_type, data_type_str, sizeof(data_type_str));
1789 
1790    bool lod_suffix = a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
1791    char intr_name[96];
1792    snprintf(intr_name, sizeof(intr_name),
1793             "llvm.amdgcn.image.%s%s" /* base name */
1794             "%s%s%s%s"               /* sample/gather modifiers */
1795             ".%s.%s%s%s%s",          /* dimension and type overloads */
1796             name, atomic_subop, a->compare ? ".c" : "",
1797             a->bias ? ".b" : lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "",
1798             a->min_lod ? ".cl" : "", a->offset ? ".o" : "", dimname,
1799             data_type_str, overload[0], overload[1], overload[2]);
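   /* For example (derived from the format string above, illustrative only):
    * a 2D sample with compare and bias, 32-bit coords and a v4f32 result is
    * named "llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32".
    */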
1800 
1801    LLVMTypeRef retty;
1802    if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
1803       retty = ctx->voidt;
1804    else
1805       retty = data_type;
1806 
1807    LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes);
1808    if (a->tfe) {
1809       LLVMValueRef texel = LLVMBuildExtractValue(ctx->builder, result, 0, "");
1810       LLVMValueRef code = LLVMBuildExtractValue(ctx->builder, result, 1, "");
1811       result = ac_build_concat(ctx, texel, ac_to_float(ctx, code));
1812    }
1813 
1814    if (!sample && !atomic && retty != ctx->voidt)
1815       result = ac_to_integer(ctx, result);
1816 
1817    return result;
1818 }
1819 
1820 LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
1821 {
1822    return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16, args, 2, 0);
1823 }
1824 
1825 LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
1826 {
1827    LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", ctx->v2i16, args, 2, 0);
1828    return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
1829 }
1830 
1831 LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
1832 {
1833    LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", ctx->v2i16, args, 2, 0);
1834    return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
1835 }
1836 
1837 LLVMValueRef ac_build_cvt_pknorm_i16_f16(struct ac_llvm_context *ctx,
1838                                          LLVMValueRef args[2])
1839 {
1840    LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};
1841    LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false);
1842    LLVMValueRef code = LLVMConstInlineAsm(calltype,
1843                                           ctx->gfx_level >= GFX11 ?
1844                                              "v_cvt_pk_norm_i16_f16 $0, $1, $2" :
1845                                              "v_cvt_pknorm_i16_f16 $0, $1, $2",
1846                                           "=v,v,v", false, false);
1847    return LLVMBuildCall2(ctx->builder, calltype, code, args, 2, "");
1848 }
1849 
1850 LLVMValueRef ac_build_cvt_pknorm_u16_f16(struct ac_llvm_context *ctx,
1851                                          LLVMValueRef args[2])
1852 {
1853    LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};
1854    LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false);
1855    LLVMValueRef code = LLVMConstInlineAsm(calltype,
1856                                           ctx->gfx_level >= GFX11 ?
1857                                              "v_cvt_pk_norm_u16_f16 $0, $1, $2" :
1858                                              "v_cvt_pknorm_u16_f16 $0, $1, $2",
1859                                           "=v,v,v", false, false);
1860    return LLVMBuildCall2(ctx->builder, calltype, code, args, 2, "");
1861 }
1862 
1863 /* The 8-bit and 10-bit clamping is for HW workarounds. */
1864 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
1865                                  bool hi)
1866 {
1867    assert(bits == 8 || bits == 10 || bits == 16);
1868 
1869    LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
1870    LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
1871    LLVMValueRef max_alpha = bits != 10 ? max_rgb : ctx->i32_1;
1872    LLVMValueRef min_alpha = bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
1873 
1874    /* Clamp. */
1875    if (bits != 16) {
1876       for (int i = 0; i < 2; i++) {
1877          bool alpha = hi && i == 1;
1878          args[i] = ac_build_imin(ctx, args[i], alpha ? max_alpha : max_rgb);
1879          args[i] = ac_build_imax(ctx, args[i], alpha ? min_alpha : min_rgb);
1880       }
1881    }
1882 
1883    LLVMValueRef res =
1884       ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", ctx->v2i16, args, 2, 0);
1885    return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
1886 }
1887 
1888 /* The 8-bit and 10-bit clamping is for HW workarounds. */
1889 LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
1890                                  bool hi)
1891 {
1892    assert(bits == 8 || bits == 10 || bits == 16);
1893 
1894    LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
1895    LLVMValueRef max_alpha = bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
1896 
1897    /* Clamp. */
1898    if (bits != 16) {
1899       for (int i = 0; i < 2; i++) {
1900          bool alpha = hi && i == 1;
1901          args[i] = ac_build_umin(ctx, args[i], alpha ? max_alpha : max_rgb);
1902       }
1903    }
1904 
1905    LLVMValueRef res =
1906       ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", ctx->v2i16, args, 2, 0);
1907    return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
1908 }
1909 
1910 LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
1911 {
1912    return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, &i1, 1, 0);
1913 }
1914 
1915 void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
1916 {
1917    ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, &i1, 1, 0);
1918 }
1919 
1920 LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset,
1921                           LLVMValueRef width, bool is_signed)
1922 {
1923    LLVMValueRef args[] = {
1924       input,
1925       offset,
1926       width,
1927    };
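   /* Worked example (illustrative): ubfe(input, offset=4, width=8) extracts
    * bits [4..11], i.e. it is equivalent to (input >> 4) & 0xff.
    */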
1928 
1929    return ac_build_intrinsic(ctx, is_signed ? "llvm.amdgcn.sbfe.i32" : "llvm.amdgcn.ubfe.i32",
1930                              ctx->i32, args, 3, 0);
1931 }
1932 
1933 LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
1934                            LLVMValueRef s2)
1935 {
1936    return LLVMBuildAdd(ctx->builder, LLVMBuildMul(ctx->builder, s0, s1, ""), s2, "");
1937 }
1938 
1939 LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
1940                            LLVMValueRef s2)
1941 {
1942    /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
1943    if (ctx->gfx_level >= GFX10)
1944       return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, (LLVMValueRef[]){s0, s1, s2}, 3, 0);
1945 
1946    return LLVMBuildFAdd(ctx->builder, LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
1947 }
1948 
1949 void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
1950 {
1951    if (!wait_flags)
1952       return;
1953 
1954    if (ctx->gfx_level >= GFX12) {
1955       if (wait_flags & AC_WAIT_DS)
1956          ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.dscnt", ctx->voidt, &ctx->i16_0, 1, 0);
1957       if (wait_flags & AC_WAIT_KM)
1958          ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.kmcnt", ctx->voidt, &ctx->i16_0, 1, 0);
1959       if (wait_flags & AC_WAIT_EXP)
1960          ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.expcnt", ctx->voidt, &ctx->i16_0, 1, 0);
1961       if (wait_flags & AC_WAIT_LOAD)
1962          ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.loadcnt", ctx->voidt, &ctx->i16_0, 1, 0);
1963       if (wait_flags & AC_WAIT_STORE)
1964          ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.storecnt", ctx->voidt, &ctx->i16_0, 1, 0);
1965       if (wait_flags & AC_WAIT_SAMPLE)
1966          ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.samplecnt", ctx->voidt, &ctx->i16_0, 1, 0);
1967       if (wait_flags & AC_WAIT_BVH)
1968          ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.bvhcnt", ctx->voidt, &ctx->i16_0, 1, 0);
1969    } else {
1970       unsigned expcnt = 7;
1971       unsigned lgkmcnt = 63;
1972       unsigned vmcnt = ctx->gfx_level >= GFX9 ? 63 : 15;
1973       unsigned vscnt = 63;
1974 
1975       if (wait_flags & AC_WAIT_EXP)
1976          expcnt = 0;
1977       if (wait_flags & (AC_WAIT_DS | AC_WAIT_KM))
1978          lgkmcnt = 0;
1979       if (wait_flags & (AC_WAIT_LOAD | AC_WAIT_SAMPLE | AC_WAIT_BVH))
1980          vmcnt = 0;
1981 
1982       if (wait_flags & AC_WAIT_STORE) {
1983          if (ctx->gfx_level >= GFX10)
1984             vscnt = 0;
1985          else
1986             vmcnt = 0;
1987       }
1988 
1989       /* There is no intrinsic for vscnt(0), so use a fence. It waits for everything except expcnt. */
1990       if (vscnt == 0) {
1991          assert(!(wait_flags & AC_WAIT_EXP));
1992          LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
1993          return;
1994       }
1995 
1996       unsigned simm16;
1997 
1998       if (ctx->gfx_level >= GFX11)
1999          simm16 = expcnt | (lgkmcnt << 4) | (vmcnt << 10);
2000       else
2001          simm16 = (lgkmcnt << 8) | (expcnt << 4) | (vmcnt & 0xf) | ((vmcnt >> 4) << 14);
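      /* Worked example (illustrative): waiting only for LDS (lgkmcnt = 0) on
       * GFX9 leaves expcnt = 7 and vmcnt = 63, so
       * simm16 = (0 << 8) | (7 << 4) | (63 & 0xf) | ((63 >> 4) << 14) = 0xc07f.
       */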
2002 
2003       LLVMValueRef args[1] = {
2004          LLVMConstInt(ctx->i32, simm16, false),
2005       };
2006       ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
2007    }
2008 }
2009 
2010 LLVMValueRef ac_build_fsat(struct ac_llvm_context *ctx, LLVMValueRef src,
2011                            LLVMTypeRef type)
2012 {
2013    unsigned bitsize = ac_get_elem_bits(ctx, type);
2014    LLVMValueRef zero = LLVMConstReal(type, 0.0);
2015    LLVMValueRef one = LLVMConstReal(type, 1.0);
2016    LLVMValueRef result;
2017 
2018    if (bitsize == 64 || (bitsize == 16 && ctx->gfx_level <= GFX8) || type == ctx->v2f16) {
2019       /* Use fmin/fmax for 64-bit fsat or 16-bit on GFX6-GFX8 because LLVM
2020        * doesn't expose an intrinsic.
2021        */
2022       result = ac_build_fmin(ctx, ac_build_fmax(ctx, src, zero), one);
2023    } else {
2024       LLVMTypeRef type;
2025       char *intr;
2026 
2027       if (bitsize == 16) {
2028          intr = "llvm.amdgcn.fmed3.f16";
2029          type = ctx->f16;
2030       } else {
2031          assert(bitsize == 32);
2032          intr = "llvm.amdgcn.fmed3.f32";
2033          type = ctx->f32;
2034       }
2035 
2036       LLVMValueRef params[] = {
2037          zero,
2038          one,
2039          src,
2040       };
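      /* fmed3(0.0, 1.0, src) returns the middle of the three values, i.e. src
       * clamped to [0, 1]; e.g. (illustrative) src = 1.7 -> 1.0, src = -0.2 -> 0.0.
       */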
2041 
2042       result = ac_build_intrinsic(ctx, intr, type, params, 3, 0);
2043    }
2044 
2045    if (ctx->gfx_level < GFX9 && bitsize == 32) {
2046       /* Only pre-GFX9 chips do not flush denorms. */
2047       result = ac_build_canonicalize(ctx, result, bitsize);
2048    }
2049 
2050    return result;
2051 }
2052 
2053 LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
2054 {
2055    LLVMTypeRef type;
2056    char *intr;
2057 
2058    if (bitsize == 16) {
2059       intr = "llvm.amdgcn.fract.f16";
2060       type = ctx->f16;
2061    } else if (bitsize == 32) {
2062       intr = "llvm.amdgcn.fract.f32";
2063       type = ctx->f32;
2064    } else {
2065       intr = "llvm.amdgcn.fract.f64";
2066       type = ctx->f64;
2067    }
2068 
2069    LLVMValueRef params[] = {
2070       src0,
2071    };
2072    return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
2073 }
2074 
2075 LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value)
2076 {
2077 
2078    if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
2079       LLVMValueRef scalar = LLVMConstInt(LLVMGetElementType(type), value, 0);
2080       unsigned vec_size = LLVMGetVectorSize(type);
2081       LLVMValueRef *scalars = alloca(vec_size * sizeof(LLVMValueRef));
2082 
2083       for (unsigned i = 0; i < vec_size; i++)
2084          scalars[i] = scalar;
2085       return LLVMConstVector(scalars, vec_size);
2086    }
2087    return LLVMConstInt(type, value, 0);
2088 }
2089 
2090 LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0)
2091 {
2092    LLVMTypeRef type = LLVMTypeOf(src0);
2093    LLVMValueRef val;
2094 
2095    /* v_med3 is selected only when max is first. (LLVM bug?) */
2096    val = ac_build_imax(ctx, src0, ac_const_uint_vec(ctx, type, -1));
2097    return ac_build_imin(ctx, val, ac_const_uint_vec(ctx, type, 1));
2098 }
2099 
2100 static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val)
2101 {
2102    ac_enable_signed_zeros(ctx);
2103    /* (val + 0) converts negative zero to positive zero. */
2104    val = LLVMBuildFAdd(ctx->builder, val, LLVMConstNull(LLVMTypeOf(val)), "");
2105    ac_disable_signed_zeros(ctx);
2106    return val;
2107 }
2108 
2109 LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src)
2110 {
2111    LLVMTypeRef type = LLVMTypeOf(src);
2112    LLVMValueRef pos, neg, dw[2], val;
2113    unsigned bitsize = ac_get_elem_bits(ctx, type);
2114 
2115    /* The standard version leads to this:
2116     *   v_cmp_ngt_f32_e64 s[0:1], s4, 0                       ; D40B0000 00010004
2117     *   v_cndmask_b32_e64 v4, 1.0, s4, s[0:1]                 ; D5010004 000008F2
2118     *   v_cmp_le_f32_e32 vcc, 0, v4                           ; 7C060880
2119     *   v_cndmask_b32_e32 v4, -1.0, v4, vcc                   ; 020808F3
2120     *
2121     * The isign version:
2122     *   v_add_f32_e64 v4, s4, 0                               ; D5030004 00010004
2123     *   v_med3_i32 v4, v4, -1, 1                              ; D5580004 02058304
2124     *   v_cvt_f32_i32_e32 v4, v4                              ; 7E080B04
2125     *
2126     * (src0 + 0) converts negative zero to positive zero.
2127     * After that, int(fsign(x)) == isign(floatBitsToInt(x)).
2128     *
2129     * For FP64, use the standard version, which doesn't suffer from the huge DP rate
2130     * reduction. (FP64 comparisons are as fast as int64 comparisons)
2131     */
2132    if (bitsize == 16 || bitsize == 32) {
2133       val = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src));
2134       val = ac_build_isign(ctx, val);
2135       return LLVMBuildSIToFP(ctx->builder, val, type, "");
2136    }
2137 
2138    assert(bitsize == 64);
2139    pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");
2140    neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");
2141    dw[0] = ctx->i32_0;
2142    dw[1] = LLVMBuildSelect(
2143       ctx->builder, pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0),
2144       LLVMBuildSelect(ctx->builder, neg, LLVMConstInt(ctx->i32, 0xBFF00000, 0), ctx->i32_0, ""),
2145       "");
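   /* Worked example (illustrative): for src = -5.0, pos is false and neg is
    * true, so dw = {0x00000000, 0xBFF00000}, the bit pattern of -1.0.
    */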
2146    return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dw, 2), ctx->f64, "");
2147 }
2148 
2149 LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
2150 {
2151    LLVMValueRef result;
2152    unsigned bitsize;
2153 
2154    bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2155 
2156    switch (bitsize) {
2157    case 128:
2158       result = ac_build_intrinsic(ctx, "llvm.ctpop.i128", ctx->i128, (LLVMValueRef[]){src0}, 1, 0);
2159       result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2160       break;
2161    case 64:
2162       result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64, (LLVMValueRef[]){src0}, 1, 0);
2163       result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2164       break;
2165    case 32:
2166       result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32, (LLVMValueRef[]){src0}, 1, 0);
2167       break;
2168    case 16:
2169       result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16, (LLVMValueRef[]){src0}, 1, 0);
2170       result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2171       break;
2172    case 8:
2173       result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8, (LLVMValueRef[]){src0}, 1, 0);
2174       result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2175       break;
2176    default:
2177       unreachable("invalid bitsize");
2178       break;
2179    }
2180 
2181    return result;
2182 }
2183 
2184 LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0)
2185 {
2186    LLVMValueRef result;
2187    unsigned bitsize;
2188 
2189    bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2190 
2191    switch (bitsize) {
2192    case 64:
2193       result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64, (LLVMValueRef[]){src0}, 1, 0);
2194       result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2195       break;
2196    case 32:
2197       result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32, (LLVMValueRef[]){src0}, 1, 0);
2198       break;
2199    case 16:
2200       result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16, (LLVMValueRef[]){src0}, 1, 0);
2201       result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2202       break;
2203    case 8:
2204       result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8, (LLVMValueRef[]){src0}, 1, 0);
2205       result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2206       break;
2207    default:
2208       unreachable("invalid bitsize");
2209       break;
2210    }
2211 
2212    return result;
2213 }
2214 
2215 LLVMValueRef ac_build_sudot_4x8(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
2216                                 LLVMValueRef s2, bool clamp, unsigned neg_lo)
2217 {
2218    const char *name = "llvm.amdgcn.sudot4";
2219    LLVMValueRef src[6];
2220 
2221    src[0] = LLVMConstInt(ctx->i1, !!(neg_lo & 0x1), false);
2222    src[1] = s0;
2223    src[2] = LLVMConstInt(ctx->i1, !!(neg_lo & 0x2), false);
2224    src[3] = s1;
2225    src[4] = s2;
2226    src[5] = LLVMConstInt(ctx->i1, clamp, false);
2227 
2228    return ac_build_intrinsic(ctx, name, ctx->i32, src, 6, 0);
2229 }
2230 
2231 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
2232 {
2233    LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
2234    ac_build_intrinsic(ctx, "llvm.amdgcn.init.exec", ctx->voidt, &full_mask, 1, 0);
2235 }
2236 
2237 void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
2238 {
2239    unsigned lds_size = ctx->gfx_level >= GFX7 ? 65536 : 32768;
2240    LLVMTypeRef type = LLVMArrayType(ctx->i32, lds_size / 4);
2241    ctx->lds = (struct ac_llvm_pointer) {
2242       .value = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
2243                   LLVMPointerType(type, AC_ADDR_SPACE_LDS), "lds"),
2244       .pointee_type = type
2245    };
2246 }
2247 
2248 LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0)
2249 {
2250    unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2251    const char *intrin_name;
2252    LLVMTypeRef type;
2253    LLVMValueRef zero;
2254 
2255    switch (src0_bitsize) {
2256    case 64:
2257       intrin_name = "llvm.cttz.i64";
2258       type = ctx->i64;
2259       zero = ctx->i64_0;
2260       break;
2261    case 32:
2262       intrin_name = "llvm.cttz.i32";
2263       type = ctx->i32;
2264       zero = ctx->i32_0;
2265       break;
2266    case 16:
2267       intrin_name = "llvm.cttz.i16";
2268       type = ctx->i16;
2269       zero = ctx->i16_0;
2270       break;
2271    case 8:
2272       intrin_name = "llvm.cttz.i8";
2273       type = ctx->i8;
2274       zero = ctx->i8_0;
2275       break;
2276    default:
2277       unreachable("invalid bitsize");
2278    }
2279 
2280    LLVMValueRef params[2] = {
2281       src0,
2282 
2283       /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
2284        * add special code to check for x=0. The reason is that
2285        * the LLVM behavior for x=0 is different from what we
2286        * need here. However, LLVM also assumes that ffs(x) is
2287        * in [0, 31], but GLSL expects that ffs(0) = -1, so
2288        * a conditional assignment to handle 0 is still required.
2289        *
2290        * The hardware already implements the correct behavior.
2291        */
2292       ctx->i1true,
2293    };
2294 
2295    LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, 0);
2296 
2297    if (src0_bitsize == 64) {
2298       lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
2299    } else if (src0_bitsize < 32) {
2300       lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");
2301    }
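   /* Worked example (illustrative): src0 = 0x8 gives cttz = 3; src0 = 0 falls
    * through to the select below and returns -1, matching GLSL findLSB().
    */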
2302 
2303    /* TODO: We need an intrinsic to skip this conditional. */
2304    /* Check for zero: */
2305    return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, src0, zero, ""),
2306                           LLVMConstInt(ctx->i32, -1, 0), lsb, "");
2307 }
2308 
2309 LLVMTypeRef ac_arg_type_to_pointee_type(struct ac_llvm_context *ctx, enum ac_arg_type type) {
2310    switch (type) {
2311    case AC_ARG_CONST_PTR:
2312       return ctx->i8;
2313       break;
2314    case AC_ARG_CONST_FLOAT_PTR:
2315       return ctx->f32;
2316       break;
2317    case AC_ARG_CONST_PTR_PTR:
2318       return ac_array_in_const32_addr_space(ctx->i8);
2319       break;
2320    case AC_ARG_CONST_DESC_PTR:
2321       return ctx->v4i32;
2322       break;
2323    case AC_ARG_CONST_IMAGE_PTR:
2324       return ctx->v8i32;
2325    default:
2326       /* Other ac_arg_type values aren't pointers. */
2327       assert(false);
2328       return NULL;
2329    }
2330 }
2331 
2332 LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
2333 {
2334    return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);
2335 }
2336 
2337 LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
2338 {
2339    return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);
2340 }
2341 
2342 static struct ac_llvm_flow *get_current_flow(struct ac_llvm_context *ctx)
2343 {
2344    if (ctx->flow->depth > 0)
2345       return &ctx->flow->stack[ctx->flow->depth - 1];
2346    return NULL;
2347 }
2348 
2349 static struct ac_llvm_flow *get_innermost_loop(struct ac_llvm_context *ctx)
2350 {
2351    for (unsigned i = ctx->flow->depth; i > 0; --i) {
2352       if (ctx->flow->stack[i - 1].loop_entry_block)
2353          return &ctx->flow->stack[i - 1];
2354    }
2355    return NULL;
2356 }
2357 
2358 static struct ac_llvm_flow *push_flow(struct ac_llvm_context *ctx)
2359 {
2360    struct ac_llvm_flow *flow;
2361 
2362    if (ctx->flow->depth >= ctx->flow->depth_max) {
2363       unsigned new_max = MAX2(ctx->flow->depth << 1, AC_LLVM_INITIAL_CF_DEPTH);
2364 
2365       ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack));
2366       ctx->flow->depth_max = new_max;
2367    }
2368 
2369    flow = &ctx->flow->stack[ctx->flow->depth];
2370    ctx->flow->depth++;
2371 
2372    flow->next_block = NULL;
2373    flow->loop_entry_block = NULL;
2374    return flow;
2375 }
2376 
2377 static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, int label_id)
2378 {
2379    char buf[32];
2380    snprintf(buf, sizeof(buf), "%s%d", base, label_id);
2381    LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf);
2382 }
2383 
2384 /* Append a basic block at the level of the parent flow.
2385  */
2386 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, const char *name)
2387 {
2388    assert(ctx->flow->depth >= 1);
2389 
2390    if (ctx->flow->depth >= 2) {
2391       struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2];
2392 
2393       return LLVMInsertBasicBlockInContext(ctx->context, flow->next_block, name);
2394    }
2395 
2396    LLVMValueRef main_fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
2397    return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
2398 }
2399 
2400 /* Emit a branch to the given default target for the current block if
2401  * applicable -- that is, if the current block does not already contain a
2402  * branch from a break or continue.
2403  */
2404 static void emit_default_branch(LLVMBuilderRef builder, LLVMBasicBlockRef target)
2405 {
2406    if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
2407       LLVMBuildBr(builder, target);
2408 }
2409 
2410 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
2411 {
2412    struct ac_llvm_flow *flow = push_flow(ctx);
2413    flow->loop_entry_block = append_basic_block(ctx, "LOOP");
2414    flow->next_block = append_basic_block(ctx, "ENDLOOP");
2415    set_basicblock_name(flow->loop_entry_block, "loop", label_id);
2416    LLVMBuildBr(ctx->builder, flow->loop_entry_block);
2417    LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
2418 }
2419 
2420 void ac_build_break(struct ac_llvm_context *ctx)
2421 {
2422    struct ac_llvm_flow *flow = get_innermost_loop(ctx);
2423    LLVMBuildBr(ctx->builder, flow->next_block);
2424 }
2425 
2426 void ac_build_continue(struct ac_llvm_context *ctx)
2427 {
2428    struct ac_llvm_flow *flow = get_innermost_loop(ctx);
2429    LLVMBuildBr(ctx->builder, flow->loop_entry_block);
2430 }
2431 
2432 void ac_build_else(struct ac_llvm_context *ctx, int label_id)
2433 {
2434    struct ac_llvm_flow *current_branch = get_current_flow(ctx);
2435    LLVMBasicBlockRef endif_block;
2436 
2437    assert(!current_branch->loop_entry_block);
2438 
2439    endif_block = append_basic_block(ctx, "ENDIF");
2440    emit_default_branch(ctx->builder, endif_block);
2441 
2442    LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
2443    set_basicblock_name(current_branch->next_block, "else", label_id);
2444 
2445    current_branch->next_block = endif_block;
2446 }
2447 
2448 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
2449 {
2450    struct ac_llvm_flow *current_branch = get_current_flow(ctx);
2451 
2452    assert(!current_branch->loop_entry_block);
2453 
2454    emit_default_branch(ctx->builder, current_branch->next_block);
2455    LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
2456    set_basicblock_name(current_branch->next_block, "endif", label_id);
2457 
2458    ctx->flow->depth--;
2459 }
2460 
2461 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
2462 {
2463    struct ac_llvm_flow *current_loop = get_current_flow(ctx);
2464 
2465    assert(current_loop->loop_entry_block);
2466 
2467    emit_default_branch(ctx->builder, current_loop->loop_entry_block);
2468 
2469    LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
2470    set_basicblock_name(current_loop->next_block, "endloop", label_id);
2471    ctx->flow->depth--;
2472 }
2473 
2474 void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
2475 {
2476    struct ac_llvm_flow *flow = push_flow(ctx);
2477    LLVMBasicBlockRef if_block;
2478 
2479    if_block = append_basic_block(ctx, "IF");
2480    flow->next_block = append_basic_block(ctx, "ELSE");
2481    set_basicblock_name(if_block, "if", label_id);
2482    LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
2483    LLVMPositionBuilderAtEnd(ctx->builder, if_block);
2484 }
2485 
2486 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
2487 {
2488    LLVMBuilderRef builder = ac->builder;
2489    LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
2490    LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
2491    LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
2492    LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
2493    LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
2494    LLVMValueRef res;
2495 
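   /* Presumably the alloca is emitted in the entry block so that LLVM treats
    * it as a static alloca (promotable by mem2reg/SROA) and it is not
    * re-executed when the current insert point sits inside a loop.
    */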
2496    if (first_instr) {
2497       LLVMPositionBuilderBefore(first_builder, first_instr);
2498    } else {
2499       LLVMPositionBuilderAtEnd(first_builder, first_block);
2500    }
2501 
2502    res = LLVMBuildAlloca(first_builder, type, name);
2503    LLVMDisposeBuilder(first_builder);
2504    return res;
2505 }
2506 
2507 LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count)
2508 {
2509    unsigned num_components = ac_get_llvm_num_components(value);
2510    if (count == num_components)
2511       return value;
2512 
2513    LLVMValueRef *const masks = alloca(MAX2(count, 2) * sizeof(LLVMValueRef));
2514    masks[0] = ctx->i32_0;
2515    masks[1] = ctx->i32_1;
2516    for (unsigned i = 2; i < count; i++)
2517       masks[i] = LLVMConstInt(ctx->i32, i, false);
2518 
2519    if (count == 1)
2520       return LLVMBuildExtractElement(ctx->builder, value, masks[0], "");
2521 
2522    LLVMValueRef swizzle = LLVMConstVector(masks, count);
2523    return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
2524 }
2525 
2526 /* If param is i64 and bitwidth <= 32, the return value will be i32. */
2527 LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift,
2528                              unsigned bitwidth)
2529 {
2530    LLVMValueRef value = param;
2531    if (rshift)
2532       value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), rshift, false), "");
2533 
2534    if (rshift + bitwidth < 32) {
2535       uint64_t mask = (1ull << bitwidth) - 1;
2536       value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), mask, false), "");
2537    }
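   /* Worked example (illustrative): ac_unpack_param(ctx, v, 8, 8) yields
    * (v >> 8) & 0xff; with a 64-bit v the result is truncated to i32 below.
    */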
2538 
2539    if (bitwidth <= 32 && LLVMTypeOf(param) == ctx->i64)
2540       value = LLVMBuildTrunc(ctx->builder, value, ctx->i32, "");
2541    return value;
2542 }
2543 
2544 static LLVMValueRef _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src,
2545                                        LLVMValueRef lane, bool with_opt_barrier)
2546 {
2547    LLVMTypeRef type = LLVMTypeOf(src);
2548    LLVMValueRef result;
2549 
2550    if (with_opt_barrier)
2551       ac_build_optimization_barrier(ctx, &src, false);
2552 
2553    src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
2554    if (lane)
2555       lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, "");
2556 
2557    result =
2558       ac_build_intrinsic(ctx, lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane",
2559                          ctx->i32, (LLVMValueRef[]){src, lane}, lane == NULL ? 1 : 2, 0);
2560 
2561    return LLVMBuildTrunc(ctx->builder, result, type, "");
2562 }
2563 
2564 static LLVMValueRef ac_build_readlane_common(struct ac_llvm_context *ctx, LLVMValueRef src,
2565                                              LLVMValueRef lane, bool with_opt_barrier)
2566 {
2567    LLVMTypeRef src_type = LLVMTypeOf(src);
2568    src = ac_to_integer(ctx, src);
2569    unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
2570    LLVMValueRef ret;
2571 
2572    if (bits > 32) {
2573       assert(bits % 32 == 0);
2574       LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
2575       LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
2576       ret = LLVMGetUndef(vec_type);
2577       for (unsigned i = 0; i < bits / 32; i++) {
2578          LLVMValueRef ret_comp;
2579 
2580          src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
2581 
2582          ret_comp = _ac_build_readlane(ctx, src, lane, with_opt_barrier);
2583 
2584          ret =
2585             LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
2586       }
2587    } else {
2588       ret = _ac_build_readlane(ctx, src, lane, with_opt_barrier);
2589    }
2590 
2591    if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
2592       return LLVMBuildIntToPtr(ctx->builder, ret, src_type, "");
2593    return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
2594 }
2595 
2596 LLVMValueRef ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
2597 {
2598    return ac_build_readlane_common(ctx, src, lane, true);
2599 }
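
/* Illustrative sketch (assumed example, not part of the original file):
 * broadcasting the value held by lane 0 to the whole wave. Passing a NULL
 * lane instead selects llvm.amdgcn.readfirstlane, as handled above.
 */
static LLVMValueRef example_broadcast_lane0(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   return ac_build_readlane(ctx, value, ctx->i32_0);
}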
2600 
2601 LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value,
2602                                 LLVMValueRef lane)
2603 {
2604    return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32,
2605                              (LLVMValueRef[]){value, lane, src}, 3, 0);
2606 }
2607 
2608 LLVMValueRef ac_build_mbcnt_add(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef add_src)
2609 {
2610    LLVMValueRef add = LLVM_VERSION_MAJOR >= 16 ? add_src : ctx->i32_0;
2611    LLVMValueRef val;
2612 
2613    if (ctx->wave_size == 32) {
2614       if (LLVMTypeOf(mask) == ctx->i64)
2615          mask = LLVMBuildTrunc(ctx->builder, mask, ctx->i32, "");
2616 
2617       val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
2618                                (LLVMValueRef[]){mask, add}, 2, 0);
2619    } else {
2620       LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, "");
2621       LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_0, "");
2622       LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_1, "");
2623       val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
2624                                (LLVMValueRef[]){mask_lo, add}, 2, 0);
2625       val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, (LLVMValueRef[]){mask_hi, val},
2626                                2, 0);
2627    }
2628 
2629    if (add == ctx->i32_0)
2630       ac_set_range_metadata(ctx, val, 0, ctx->wave_size);
2631 
2632    if (LLVM_VERSION_MAJOR < 16) {
2633       /* Bug workaround. LLVM always believes the upper bound of mbcnt to be the wave size,
2634        * regardless of ac_set_range_metadata. Use an extra add instruction to work around it.
2635        */
2636       ac_set_range_metadata(ctx, val, 0, ctx->wave_size);
2637       val = LLVMBuildAdd(ctx->builder, val, add_src, "");
2638    }
2639 
2640    return val;
2641 }
2642 
2643 LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
2644 {
2645    return ac_build_mbcnt_add(ctx, mask, ctx->i32_0);
2646 }
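
/* Illustrative sketch (assumed example): the ballot + mbcnt idiom, which gives
 * each lane the number of lower-numbered lanes for which "cond" (an i32 0/1
 * value) is set. The i1 fast path of ac_build_inclusive_scan further below
 * uses the same pattern.
 */
static LLVMValueRef example_count_lower_set_lanes(struct ac_llvm_context *ctx, LLVMValueRef cond)
{
   LLVMValueRef ballot = ac_build_ballot(ctx, cond);
   return ac_build_mbcnt(ctx, ballot);
}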
2647 
2648 enum dpp_ctrl
2649 {
2650    _dpp_quad_perm = 0x000,
2651    _dpp_row_sl = 0x100,
2652    _dpp_row_sr = 0x110,
2653    _dpp_row_rr = 0x120,
2654    dpp_wf_sl1 = 0x130,
2655    dpp_wf_rl1 = 0x134,
2656    dpp_wf_sr1 = 0x138,
2657    dpp_wf_rr1 = 0x13C,
2658    dpp_row_mirror = 0x140,
2659    dpp_row_half_mirror = 0x141,
2660    dpp_row_bcast15 = 0x142,
2661    dpp_row_bcast31 = 0x143
2662 };
2663 
2664 static inline enum dpp_ctrl dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2,
2665                                           unsigned lane3)
2666 {
2667    assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
2668    return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
2669 }
2670 
2671 static inline enum dpp_ctrl dpp_row_sr(unsigned amount)
2672 {
2673    assert(amount > 0 && amount < 16);
2674    return _dpp_row_sr | amount;
2675 }
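
/* Worked examples of the encodings above, following directly from the packing:
 * dpp_quad_perm(0, 0, 1, 2) = 0x000 | 0 | (0 << 2) | (1 << 4) | (2 << 6) = 0x90,
 * and dpp_row_sr(1) = 0x110 | 1 = 0x111.
 */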
2676 
2677 static LLVMValueRef _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
2678                                   enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
2679                                   bool bound_ctrl, bool use_wqm)
2680 {
2681    LLVMTypeRef type = LLVMTypeOf(src);
2682    LLVMValueRef res;
2683 
2684    old = LLVMBuildZExt(ctx->builder, old, ctx->i32, "");
2685    if (use_wqm)
2686       old = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.i32", ctx->i32, &old, 1, 0);
2687    src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
2688    if (use_wqm)
2689       src = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.i32", ctx->i32, &src, 1, 0);
2690 
2691    res = ac_build_intrinsic(
2692       ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32,
2693       (LLVMValueRef[]){old, src, LLVMConstInt(ctx->i32, dpp_ctrl, 0),
2694                        LLVMConstInt(ctx->i32, row_mask, 0), LLVMConstInt(ctx->i32, bank_mask, 0),
2695                        LLVMConstInt(ctx->i1, bound_ctrl, 0)},
2696       6, 0);
2697 
2698    if (use_wqm)
2699       res = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.i32", ctx->i32, &res, 1, 0);
2700 
2701    return LLVMBuildTrunc(ctx->builder, res, type, "");
2702 }
2703 
2704 static LLVMValueRef ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
2705                                  enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
2706                                  bool bound_ctrl, bool use_wqm)
2707 {
2708    LLVMTypeRef src_type = LLVMTypeOf(src);
2709    src = ac_to_integer(ctx, src);
2710    if (use_wqm)
2711       src = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.i32", ctx->i32, &src, 1, 0);
2712    old = ac_to_integer(ctx, old);
2713    if (use_wqm)
2714       old = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.i32", ctx->i32, &old, 1, 0);
2715    unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
2716    LLVMValueRef ret;
2717    if (bits > 32) {
2718       assert(bits % 32 == 0);
2719       LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
2720       LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
2721       LLVMValueRef old_vector = LLVMBuildBitCast(ctx->builder, old, vec_type, "");
2722       ret = LLVMGetUndef(vec_type);
2723       for (unsigned i = 0; i < bits / 32; i++) {
2724          src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
2725          old = LLVMBuildExtractElement(ctx->builder, old_vector, LLVMConstInt(ctx->i32, i, 0), "");
2726          LLVMValueRef ret_comp =
2727             _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl, use_wqm);
2728          ret =
2729             LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
2730       }
2731    } else {
2732       ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl, use_wqm);
2733    }
2734    return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
2735 }
2736 
2737 static LLVMValueRef _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src,
2738                                          uint64_t sel, bool exchange_rows, bool bound_ctrl)
2739 {
2740    LLVMTypeRef type = LLVMTypeOf(src);
2741    LLVMValueRef result;
2742 
2743    src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
2744 
2745    LLVMValueRef args[6] = {
2746       src,
2747       src,
2748       LLVMConstInt(ctx->i32, sel, false),
2749       LLVMConstInt(ctx->i32, sel >> 32, false),
2750       ctx->i1true, /* fi */
2751       bound_ctrl ? ctx->i1true : ctx->i1false,
2752    };
2753 
2754    result =
2755       ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16" : "llvm.amdgcn.permlane16",
2756                          ctx->i32, args, 6, 0);
2757 
2758    return LLVMBuildTrunc(ctx->builder, result, type, "");
2759 }
2760 
2761 static LLVMValueRef ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
2762                                         bool exchange_rows, bool bound_ctrl)
2763 {
2764    LLVMTypeRef src_type = LLVMTypeOf(src);
2765    src = ac_to_integer(ctx, src);
2766    unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
2767    LLVMValueRef ret;
2768    if (bits > 32) {
2769       assert(bits % 32 == 0);
2770       LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
2771       LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
2772       ret = LLVMGetUndef(vec_type);
2773       for (unsigned i = 0; i < bits / 32; i++) {
2774          src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
2775          LLVMValueRef ret_comp = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
2776          ret =
2777             LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
2778       }
2779    } else {
2780       ret = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
2781    }
2782    return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
2783 }
2784 
2785 static inline unsigned ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
2786 {
2787    assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);
2788    return and_mask | (or_mask << 5) | (xor_mask << 10);
2789 }
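
/* Worked example of the packing above: ds_pattern_bitmode(0x18, 0x03, 0x00)
 * = 0x18 | (0x03 << 5) | (0x00 << 10) = 0x78, the pattern used by the SI/CI
 * path of ac_wavefront_shift_right_1 further below.
 */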
2790 
2791 static LLVMValueRef _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
2792                                          unsigned mask)
2793 {
2794    LLVMTypeRef src_type = LLVMTypeOf(src);
2795    LLVMValueRef ret;
2796 
2797    src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
2798 
2799    ret = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32,
2800                             (LLVMValueRef[]){src, LLVMConstInt(ctx->i32, mask, 0)}, 2,
2801                             0);
2802 
2803    return LLVMBuildTrunc(ctx->builder, ret, src_type, "");
2804 }
2805 
2806 LLVMValueRef ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
2807 {
2808    LLVMTypeRef src_type = LLVMTypeOf(src);
2809    src = ac_to_integer(ctx, src);
2810    unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
2811    LLVMValueRef ret;
2812    if (bits > 32) {
2813       assert(bits % 32 == 0);
2814       LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
2815       LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
2816       ret = LLVMGetUndef(vec_type);
2817       for (unsigned i = 0; i < bits / 32; i++) {
2818          src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
2819          LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src, mask);
2820          ret =
2821             LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
2822       }
2823    } else {
2824       ret = _ac_build_ds_swizzle(ctx, src, mask);
2825    }
2826    return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
2827 }
2828 
2829 static LLVMValueRef ac_build_mode(struct ac_llvm_context *ctx, LLVMValueRef src, const char *mode)
2830 {
2831    LLVMTypeRef src_type = LLVMTypeOf(src);
2832    unsigned bitsize = ac_get_elem_bits(ctx, src_type);
2833    char name[32], type[8];
2834    LLVMValueRef ret;
2835 
2836    src = ac_to_integer(ctx, src);
2837 
2838    if (bitsize < 32)
2839       src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
2840 
2841    ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
2842    snprintf(name, sizeof(name), "llvm.amdgcn.%s.%s", mode, type);
2843    ret = ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src}, 1, 0);
2844 
2845    if (bitsize < 32)
2846       ret = LLVMBuildTrunc(ctx->builder, ret, ac_to_integer_type(ctx, src_type), "");
2847 
2848    return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
2849 }
2850 
2851 static LLVMValueRef ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
2852 {
2853    return ac_build_mode(ctx, src, "wwm");
2854 }
2855 
2856 LLVMValueRef ac_build_wqm(struct ac_llvm_context *ctx, LLVMValueRef src)
2857 {
2858    return ac_build_mode(ctx, src, "wqm");
2859 }
2860 
2861 static LLVMValueRef ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
2862                                           LLVMValueRef inactive)
2863 {
2864    char name[33], type[8];
2865    LLVMTypeRef src_type = LLVMTypeOf(src);
2866    unsigned bitsize = ac_get_elem_bits(ctx, src_type);
2867    src = ac_to_integer(ctx, src);
2868    inactive = ac_to_integer(ctx, inactive);
2869 
2870    if (bitsize < 32) {
2871       src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
2872       inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, "");
2873    }
2874 
2875    ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
2876    snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
2877    LLVMValueRef ret =
2878       ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src, inactive}, 2, 0);
2879    if (bitsize < 32)
2880       ret = LLVMBuildTrunc(ctx->builder, ret, src_type, "");
2881 
2882    return ret;
2883 }
2884 
2885 static LLVMValueRef get_reduction_identity(struct ac_llvm_context *ctx, nir_op op,
2886                                            unsigned type_size)
2887 {
2888 
2889    if (type_size == 0) {
2890       switch (op) {
2891       case nir_op_ior:
2892       case nir_op_ixor:
2893          return ctx->i1false;
2894       case nir_op_iand:
2895          return ctx->i1true;
2896       default:
2897          unreachable("bad reduction intrinsic");
2898       }
2899    } else if (type_size == 1) {
2900       switch (op) {
2901       case nir_op_iadd:
2902          return ctx->i8_0;
2903       case nir_op_imul:
2904          return ctx->i8_1;
2905       case nir_op_imin:
2906          return LLVMConstInt(ctx->i8, INT8_MAX, 0);
2907       case nir_op_umin:
2908          return LLVMConstInt(ctx->i8, UINT8_MAX, 0);
2909       case nir_op_imax:
2910          return LLVMConstInt(ctx->i8, INT8_MIN, 0);
2911       case nir_op_umax:
2912          return ctx->i8_0;
2913       case nir_op_iand:
2914          return LLVMConstInt(ctx->i8, -1, 0);
2915       case nir_op_ior:
2916          return ctx->i8_0;
2917       case nir_op_ixor:
2918          return ctx->i8_0;
2919       default:
2920          unreachable("bad reduction intrinsic");
2921       }
2922    } else if (type_size == 2) {
2923       switch (op) {
2924       case nir_op_iadd:
2925          return ctx->i16_0;
2926       case nir_op_fadd:
2927          return ctx->f16_0;
2928       case nir_op_imul:
2929          return ctx->i16_1;
2930       case nir_op_fmul:
2931          return ctx->f16_1;
2932       case nir_op_imin:
2933          return LLVMConstInt(ctx->i16, INT16_MAX, 0);
2934       case nir_op_umin:
2935          return LLVMConstInt(ctx->i16, UINT16_MAX, 0);
2936       case nir_op_fmin:
2937          return LLVMConstReal(ctx->f16, INFINITY);
2938       case nir_op_imax:
2939          return LLVMConstInt(ctx->i16, INT16_MIN, 0);
2940       case nir_op_umax:
2941          return ctx->i16_0;
2942       case nir_op_fmax:
2943          return LLVMConstReal(ctx->f16, -INFINITY);
2944       case nir_op_iand:
2945          return LLVMConstInt(ctx->i16, -1, 0);
2946       case nir_op_ior:
2947          return ctx->i16_0;
2948       case nir_op_ixor:
2949          return ctx->i16_0;
2950       default:
2951          unreachable("bad reduction intrinsic");
2952       }
2953    } else if (type_size == 4) {
2954       switch (op) {
2955       case nir_op_iadd:
2956          return ctx->i32_0;
2957       case nir_op_fadd:
2958          return ctx->f32_0;
2959       case nir_op_imul:
2960          return ctx->i32_1;
2961       case nir_op_fmul:
2962          return ctx->f32_1;
2963       case nir_op_imin:
2964          return LLVMConstInt(ctx->i32, INT32_MAX, 0);
2965       case nir_op_umin:
2966          return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
2967       case nir_op_fmin:
2968          return LLVMConstReal(ctx->f32, INFINITY);
2969       case nir_op_imax:
2970          return LLVMConstInt(ctx->i32, INT32_MIN, 0);
2971       case nir_op_umax:
2972          return ctx->i32_0;
2973       case nir_op_fmax:
2974          return LLVMConstReal(ctx->f32, -INFINITY);
2975       case nir_op_iand:
2976          return LLVMConstInt(ctx->i32, -1, 0);
2977       case nir_op_ior:
2978          return ctx->i32_0;
2979       case nir_op_ixor:
2980          return ctx->i32_0;
2981       default:
2982          unreachable("bad reduction intrinsic");
2983       }
2984    } else { /* type_size == 8 (64-bit) */
2985       switch (op) {
2986       case nir_op_iadd:
2987          return ctx->i64_0;
2988       case nir_op_fadd:
2989          return ctx->f64_0;
2990       case nir_op_imul:
2991          return ctx->i64_1;
2992       case nir_op_fmul:
2993          return ctx->f64_1;
2994       case nir_op_imin:
2995          return LLVMConstInt(ctx->i64, INT64_MAX, 0);
2996       case nir_op_umin:
2997          return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
2998       case nir_op_fmin:
2999          return LLVMConstReal(ctx->f64, INFINITY);
3000       case nir_op_imax:
3001          return LLVMConstInt(ctx->i64, INT64_MIN, 0);
3002       case nir_op_umax:
3003          return ctx->i64_0;
3004       case nir_op_fmax:
3005          return LLVMConstReal(ctx->f64, -INFINITY);
3006       case nir_op_iand:
3007          return LLVMConstInt(ctx->i64, -1, 0);
3008       case nir_op_ior:
3009          return ctx->i64_0;
3010       case nir_op_ixor:
3011          return ctx->i64_0;
3012       default:
3013          unreachable("bad reduction intrinsic");
3014       }
3015    }
3016 }
3017 
3018 static LLVMValueRef ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
3019                                     nir_op op)
3020 {
3021    bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
3022    bool _32bit = ac_get_type_size(LLVMTypeOf(lhs)) == 4;
3023    switch (op) {
3024    case nir_op_iadd:
3025       return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
3026    case nir_op_fadd:
3027       return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
3028    case nir_op_imul:
3029       return LLVMBuildMul(ctx->builder, lhs, rhs, "");
3030    case nir_op_fmul:
3031       return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
3032    case nir_op_imin:
3033       return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
3034                              lhs, rhs, "");
3035    case nir_op_umin:
3036       return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
3037                              lhs, rhs, "");
3038    case nir_op_fmin:
3039       return ac_build_intrinsic(
3040          ctx, _64bit ? "llvm.minnum.f64" : _32bit ? "llvm.minnum.f32" : "llvm.minnum.f16",
3041          _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2, 0);
3042    case nir_op_imax:
3043       return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
3044                              lhs, rhs, "");
3045    case nir_op_umax:
3046       return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
3047                              lhs, rhs, "");
3048    case nir_op_fmax:
3049       return ac_build_intrinsic(
3050          ctx, _64bit ? "llvm.maxnum.f64" : _32bit ? "llvm.maxnum.f32" : "llvm.maxnum.f16",
3051          _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2, 0);
3052    case nir_op_iand:
3053       return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
3054    case nir_op_ior:
3055       return LLVMBuildOr(ctx->builder, lhs, rhs, "");
3056    case nir_op_ixor:
3057       return LLVMBuildXor(ctx->builder, lhs, rhs, "");
3058    default:
3059       unreachable("bad reduction intrinsic");
3060    }
3061 }
3062 
3063 /**
3064  * \param src The value to shift.
3065  * \param identity The value to use for the first lane.
3066  * \param maxprefix specifies that the result only needs to be correct for a
3067  *     prefix of this many threads
3068  * \return src, shifted 1 lane up, and identity shifted into lane 0.
3069  */
3070 static LLVMValueRef ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src,
3071                                                LLVMValueRef identity, unsigned maxprefix)
3072 {
3073    if (ctx->gfx_level >= GFX10) {
3074       /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
3075       LLVMValueRef active, tmp1, tmp2;
3076       LLVMValueRef tid = ac_get_thread_id(ctx);
3077 
3078       tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false, false);
3079 
3080       tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);
3081 
3082       if (maxprefix > 32) {
3083          active =
3084             LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, false), "");
3085 
3086          tmp2 = LLVMBuildSelect(ctx->builder, active,
3087                                 ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, false)),
3088                                 tmp2, "");
3089 
3090          active = LLVMBuildOr(
3091             ctx->builder, active,
3092             LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3093                           LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, false), ""),
3094                           LLVMConstInt(ctx->i32, 0x10, false), ""),
3095             "");
3096          return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3097       } else if (maxprefix > 16) {
3098          active =
3099             LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 16, false), "");
3100 
3101          return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3102       }
3103    } else if (ctx->gfx_level >= GFX8) {
3104       return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false, false);
3105    }
3106 
3107    /* wavefront shift_right by 1 on SI/CI */
3108    LLVMValueRef active, tmp1, tmp2;
3109    LLVMValueRef tid = ac_get_thread_id(ctx);
3110    tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
3111    tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00));
3112    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3113                           LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),
3114                           LLVMConstInt(ctx->i32, 0x4, 0), "");
3115    tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3116    tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00));
3117    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3118                           LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),
3119                           LLVMConstInt(ctx->i32, 0x8, 0), "");
3120    tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3121    tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00));
3122    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3123                           LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),
3124                           LLVMConstInt(ctx->i32, 0x10, 0), "");
3125    tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3126    tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0));
3127    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");
3128    tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3129    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, ctx->i32_0, "");
3130    return LLVMBuildSelect(ctx->builder, active, identity, tmp1, "");
3131 }
3132 
3133 /**
3134  * \param maxprefix specifies that the result only needs to be correct for a
3135  *     prefix of this many threads
3136  */
3137 static LLVMValueRef ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src,
3138                                   LLVMValueRef identity, unsigned maxprefix, bool inclusive)
3139 {
3140    LLVMValueRef result, tmp;
3141 
3142    if (!inclusive)
3143       src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix);
3144 
3145    result = src;
3146 
3147    if (ctx->gfx_level <= GFX7) {
3148       assert(maxprefix == 64);
3149       LLVMValueRef tid = ac_get_thread_id(ctx);
3150       LLVMValueRef active;
3151       tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00));
3152       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3153                              LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), ctx->i32_0, "");
3154       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3155       result = ac_build_alu_op(ctx, result, tmp, op);
3156       tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00));
3157       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3158                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""),
3159                              ctx->i32_0, "");
3160       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3161       result = ac_build_alu_op(ctx, result, tmp, op);
3162       tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));
3163       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3164                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""),
3165                              ctx->i32_0, "");
3166       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3167       result = ac_build_alu_op(ctx, result, tmp, op);
3168       tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));
3169       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3170                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""),
3171                              ctx->i32_0, "");
3172       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3173       result = ac_build_alu_op(ctx, result, tmp, op);
3174       tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));
3175       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3176                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""),
3177                              ctx->i32_0, "");
3178       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3179       result = ac_build_alu_op(ctx, result, tmp, op);
3180       tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));
3181       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3182                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""),
3183                              ctx->i32_0, "");
3184       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3185       result = ac_build_alu_op(ctx, result, tmp, op);
3186       return result;
3187    }
3188 
3189    if (maxprefix <= 1)
3190       return result;
3191    tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false, false);
3192    result = ac_build_alu_op(ctx, result, tmp, op);
3193    if (maxprefix <= 2)
3194       return result;
3195    tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false, false);
3196    result = ac_build_alu_op(ctx, result, tmp, op);
3197    if (maxprefix <= 3)
3198       return result;
3199    tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false, false);
3200    result = ac_build_alu_op(ctx, result, tmp, op);
3201    if (maxprefix <= 4)
3202       return result;
3203    tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false, false);
3204    result = ac_build_alu_op(ctx, result, tmp, op);
3205    if (maxprefix <= 8)
3206       return result;
3207    tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false, false);
3208    result = ac_build_alu_op(ctx, result, tmp, op);
3209    if (maxprefix <= 16)
3210       return result;
3211 
3212    if (ctx->gfx_level >= GFX10) {
3213       LLVMValueRef tid = ac_get_thread_id(ctx);
3214       LLVMValueRef active;
3215 
3216       tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);
3217 
3218       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3219                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, false), ""),
3220                              ctx->i32_0, "");
3221 
3222       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3223 
3224       result = ac_build_alu_op(ctx, result, tmp, op);
3225 
3226       if (maxprefix <= 32)
3227          return result;
3228 
3229       tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
3230 
3231       active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, LLVMConstInt(ctx->i32, 32, false), "");
3232 
3233       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3234 
3235       result = ac_build_alu_op(ctx, result, tmp, op);
3236       return result;
3237    }
3238 
3239    tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false, false);
3240    result = ac_build_alu_op(ctx, result, tmp, op);
3241    if (maxprefix <= 32)
3242       return result;
3243    tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false, false);
3244    result = ac_build_alu_op(ctx, result, tmp, op);
3245    return result;
3246 }
3247 
3248 LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
3249 {
3250    LLVMValueRef result;
3251 
3252    if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
3253       LLVMBuilderRef builder = ctx->builder;
3254       src = LLVMBuildZExt(builder, src, ctx->i32, "");
3255       result = ac_build_ballot(ctx, src);
3256       result = ac_build_mbcnt(ctx, result);
3257       result = LLVMBuildAdd(builder, result, src, "");
3258       return result;
3259    }
3260 
3261    ac_build_optimization_barrier(ctx, &src, false);
3262 
3263    LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
3264    result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
3265                              LLVMTypeOf(identity), "");
3266    result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true);
3267 
3268    return ac_build_wwm(ctx, result);
3269 }
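
/* Illustrative sketch (assumed example, not part of the original file): an
 * inclusive per-wave prefix sum, where lane i receives the sum of the values
 * of lanes 0..i.
 */
static LLVMValueRef example_wave_prefix_sum(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   return ac_build_inclusive_scan(ctx, value, nir_op_iadd);
}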
3270 
3271 LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
3272 {
3273    LLVMValueRef result;
3274 
3275    if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
3276       LLVMBuilderRef builder = ctx->builder;
3277       src = LLVMBuildZExt(builder, src, ctx->i32, "");
3278       result = ac_build_ballot(ctx, src);
3279       result = ac_build_mbcnt(ctx, result);
3280       return result;
3281    }
3282 
3283    ac_build_optimization_barrier(ctx, &src, false);
3284 
3285    LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
3286    result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
3287                              LLVMTypeOf(identity), "");
3288    result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false);
3289 
3290    return ac_build_wwm(ctx, result);
3291 }
3292 
3293 LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op,
3294                              unsigned cluster_size)
3295 {
3296    if (cluster_size == 1)
3297       return src;
3298    ac_build_optimization_barrier(ctx, &src, false);
3299    LLVMValueRef result, swap;
3300    LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
3301    result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
3302                              LLVMTypeOf(identity), "");
3303    swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2, false);
3304    result = ac_build_alu_op(ctx, result, swap, op);
3305    if (cluster_size == 2)
3306       return ac_build_wwm(ctx, result);
3307 
3308    swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1, false);
3309    result = ac_build_alu_op(ctx, result, swap, op);
3310    if (cluster_size == 4)
3311       return ac_build_wwm(ctx, result);
3312 
3313    if (ctx->gfx_level >= GFX8)
3314       swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false, false);
3315    else
3316       swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
3317    result = ac_build_alu_op(ctx, result, swap, op);
3318    if (cluster_size == 8)
3319       return ac_build_wwm(ctx, result);
3320 
3321    if (ctx->gfx_level >= GFX8)
3322       swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false, false);
3323    else
3324       swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
3325    result = ac_build_alu_op(ctx, result, swap, op);
3326    if (cluster_size == 16)
3327       return ac_build_wwm(ctx, result);
3328 
3329    if (ctx->gfx_level >= GFX10)
3330       swap = ac_build_permlane16(ctx, result, 0, true, false);
3331    else if (ctx->gfx_level >= GFX8 && cluster_size != 32)
3332       swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false, false);
3333    else
3334       swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
3335    result = ac_build_alu_op(ctx, result, swap, op);
3336    if (cluster_size == 32)
3337       return ac_build_wwm(ctx, result);
3338 
3339    if (ctx->gfx_level >= GFX8) {
3340       if (ctx->wave_size == 64) {
3341          if (ctx->gfx_level >= GFX10)
3342             swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
3343          else
3344             swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false, false);
3345          result = ac_build_alu_op(ctx, result, swap, op);
3346          result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
3347       }
3348 
3349       return ac_build_wwm(ctx, result);
3350    } else {
3351       swap = ac_build_readlane(ctx, result, ctx->i32_0);
3352       result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
3353       result = ac_build_alu_op(ctx, result, swap, op);
3354       return ac_build_wwm(ctx, result);
3355    }
3356 }
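
/* Illustrative sketch (assumed example): a wave-wide unsigned-maximum
 * reduction, using the full wave as a single cluster.
 */
static LLVMValueRef example_wave_umax(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   return ac_build_reduce(ctx, value, nir_op_umax, ctx->wave_size);
}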
3357 
3358 static void _ac_build_dual_src_blend_swizzle(struct ac_llvm_context *ctx,
3359                                              LLVMValueRef *arg0, LLVMValueRef *arg1)
3360 {
3361    LLVMValueRef tid;
3362    LLVMValueRef src0, src1;
3363    LLVMValueRef tmp0;
3364    LLVMValueRef params[2];
3365    LLVMValueRef is_even;
3366 
3367    src0 = LLVMBuildBitCast(ctx->builder, *arg0, ctx->i32, "");
3368    src1 = LLVMBuildBitCast(ctx->builder, *arg1, ctx->i32, "");
3369 
3370    /* swap odd/even lanes of arg_0 */
3371    params[0] = src0;
3372    params[1] = LLVMConstInt(ctx->i32, 0xde54c1, 0);
3373    src0 = ac_build_intrinsic(ctx, "llvm.amdgcn.mov.dpp8.i32",
3374                              ctx->i32, params, 2, 0);
3375 
3376    /* swap even lanes between arg_0 and arg_1 */
3377    tid = ac_get_thread_id(ctx);
3378    is_even = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3379                            LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""),
3380                            ctx->i32_0, "");
3381    tmp0 = src0;
3382    src0 = LLVMBuildSelect(ctx->builder, is_even, src1, src0, "");
3383    src1 = LLVMBuildSelect(ctx->builder, is_even, tmp0, src1, "");
3384 
3385    /* swap odd/even lanes again for arg_0 */
3386    params[0] = src0;
3387    params[1] = LLVMConstInt(ctx->i32, 0xde54c1, 0);
3388    src0 = ac_build_intrinsic(ctx, "llvm.amdgcn.mov.dpp8.i32",
3389                              ctx->i32, params, 2, 0);
3390 
3391    *arg0 = src0;
3392    *arg1 = src1;
3393 }
3394 
3395 void ac_build_dual_src_blend_swizzle(struct ac_llvm_context *ctx,
3396                                      struct ac_export_args *mrt0,
3397                                      struct ac_export_args *mrt1)
3398 {
3399    assert(ctx->gfx_level >= GFX11);
3400    assert(mrt0->enabled_channels == mrt1->enabled_channels);
3401 
3402    for (int i = 0; i < 4; i++) {
3403       if (mrt0->enabled_channels & (1 << i) && mrt1->enabled_channels & (1 << i))
3404          _ac_build_dual_src_blend_swizzle(ctx, &mrt0->out[i], &mrt1->out[i]);
3405    }
3406 }
3407 
3408 LLVMValueRef ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned lane0,
3409                                    unsigned lane1, unsigned lane2, unsigned lane3,
3410                                    bool use_wqm)
3411 {
3412    unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
3413    if (ctx->gfx_level >= GFX8) {
3414       return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false, use_wqm);
3415    } else {
3416       return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
3417    }
3418 }
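
/* Illustrative sketch (assumed example): broadcasting the first lane of each
 * pixel quad to the other three lanes of that quad.
 */
static LLVMValueRef example_quad_broadcast_first(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   return ac_build_quad_swizzle(ctx, value, 0, 0, 0, 0, false);
}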
3419 
3420 LLVMValueRef ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
3421 {
3422    LLVMTypeRef type = LLVMTypeOf(src);
3423    LLVMValueRef result;
3424 
3425    index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
3426    src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3427 
3428    result =
3429       ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32, (LLVMValueRef[]){index, src}, 2, 0);
3430    return LLVMBuildTrunc(ctx->builder, result, type, "");
3431 }
3432 
3433 LLVMValueRef ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
3434 {
3435    LLVMTypeRef type;
3436    char *intr;
3437 
3438    if (bitsize == 16) {
3439       intr = "llvm.amdgcn.frexp.exp.i16.f16";
3440       type = ctx->i16;
3441    } else if (bitsize == 32) {
3442       intr = "llvm.amdgcn.frexp.exp.i32.f32";
3443       type = ctx->i32;
3444    } else {
3445       intr = "llvm.amdgcn.frexp.exp.i32.f64";
3446       type = ctx->i32;
3447    }
3448 
3449    LLVMValueRef params[] = {
3450       src0,
3451    };
3452    return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
3453 }
3454 LLVMValueRef ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
3455 {
3456    LLVMTypeRef type;
3457    char *intr;
3458 
3459    if (bitsize == 16) {
3460       intr = "llvm.amdgcn.frexp.mant.f16";
3461       type = ctx->f16;
3462    } else if (bitsize == 32) {
3463       intr = "llvm.amdgcn.frexp.mant.f32";
3464       type = ctx->f32;
3465    } else {
3466       intr = "llvm.amdgcn.frexp.mant.f64";
3467       type = ctx->f64;
3468    }
3469 
3470    LLVMValueRef params[] = {
3471       src0,
3472    };
3473    return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
3474 }
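
/* Worked example of the frexp split (standard frexp semantics, stated here as
 * an aid rather than taken from this file): for src0 = 8.0, ac_build_frexp_mant
 * returns 0.5 and ac_build_frexp_exp returns 4, since 8.0 = 0.5 * 2^4.
 */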
3475 
3476 LLVMValueRef ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
3477 {
3478    LLVMTypeRef type;
3479    char *intr;
3480 
3481    if (bitsize == 16) {
3482       intr = "llvm.canonicalize.f16";
3483       type = ctx->f16;
3484    } else if (bitsize == 32) {
3485       intr = "llvm.canonicalize.f32";
3486       type = ctx->f32;
3487    } else {
3488       intr = "llvm.canonicalize.f64";
3489       type = ctx->f64;
3490    }
3491 
3492    LLVMValueRef params[] = {
3493       src0,
3494    };
3495    return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
3496 }
3497 
3498 LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
3499 {
3500    LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.live.mask", ctx->i1, NULL, 0, 0);
3501 
3502    return LLVMBuildNot(ctx->builder, result, "");
3503 }
3504 
3505 LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMTypeRef fn_type, LLVMValueRef func, LLVMValueRef *args,
3506                            unsigned num_args)
3507 {
3508    LLVMValueRef ret = LLVMBuildCall2(ctx->builder, fn_type, func, args, num_args, "");
3509    LLVMSetInstructionCallConv(ret, LLVMGetFunctionCallConv(func));
3510    return ret;
3511 }
3512 
3513 void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
3514                      LLVMValueRef samplemask, LLVMValueRef mrt0_alpha, bool is_last,
3515                      struct ac_export_args *args)
3516 {
3517    unsigned mask = 0;
3518    unsigned format = ac_get_spi_shader_z_format(depth != NULL, stencil != NULL, samplemask != NULL,
3519                                                 mrt0_alpha != NULL);
3520 
3521    assert(depth || stencil || samplemask || mrt0_alpha);
3522 
3523    memset(args, 0, sizeof(*args));
3524 
3525    if (is_last) {
3526       args->valid_mask = 1; /* whether the EXEC mask is valid */
3527       args->done = 1;       /* DONE bit */
3528    }
3529 
3530    /* Specify the target we are exporting */
3531    args->target = V_008DFC_SQ_EXP_MRTZ;
3532 
3533    args->compr = 0;                       /* COMP flag */
3534    args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */
3535    args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */
3536    args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */
3537    args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */
3538 
3539    if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
3540       assert(!depth);
3541       args->compr = ctx->gfx_level < GFX11; /* COMPR flag */
3542 
3543       if (stencil) {
3544          /* Stencil should be in X[23:16]. */
3545          stencil = ac_to_integer(ctx, stencil);
3546          stencil = LLVMBuildShl(ctx->builder, stencil, LLVMConstInt(ctx->i32, 16, 0), "");
3547          args->out[0] = ac_to_float(ctx, stencil);
3548          mask |= ctx->gfx_level >= GFX11 ? 0x1 : 0x3;
3549       }
3550       if (samplemask) {
3551          /* SampleMask should be in Y[15:0]. */
3552          args->out[1] = samplemask;
3553          mask |= ctx->gfx_level >= GFX11 ? 0x2 : 0xc;
3554       }
3555    } else {
3556       if (depth) {
3557          args->out[0] = depth;
3558          mask |= 0x1;
3559       }
3560       if (stencil) {
3561          assert(format == V_028710_SPI_SHADER_32_GR ||
3562                 format == V_028710_SPI_SHADER_32_ABGR);
3563          args->out[1] = stencil;
3564          mask |= 0x2;
3565       }
3566       if (samplemask) {
3567          assert(format == V_028710_SPI_SHADER_32_ABGR);
3568          args->out[2] = samplemask;
3569          mask |= 0x4;
3570       }
3571       if (mrt0_alpha) {
3572          assert(format == V_028710_SPI_SHADER_32_AR ||
3573                 format == V_028710_SPI_SHADER_32_ABGR);
3574          if (format == V_028710_SPI_SHADER_32_AR && ctx->gfx_level >= GFX10) {
3575             args->out[1] = mrt0_alpha;
3576             mask |= 0x2;
3577          } else {
3578             args->out[3] = mrt0_alpha;
3579             mask |= 0x8;
3580          }
3581       }
3582    }
3583 
3584    /* GFX6 (except OLAND and HAINAN) has a bug where it only looks
3585     * at the X writemask component. */
3586    if (ctx->gfx_level == GFX6 &&
3587        ctx->info->family != CHIP_OLAND &&
3588        ctx->info->family != CHIP_HAINAN)
3589       mask |= 0x1;
3590 
3591    /* Specify which components to enable */
3592    args->enabled_channels = mask;
3593 }
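
/* Illustrative sketch (assumed example, not part of the original file):
 * filling ac_export_args for a depth-only MRTZ export; stencil, sample mask
 * and mrt0 alpha are omitted by passing NULL.
 */
static void example_export_depth_only(struct ac_llvm_context *ctx, LLVMValueRef depth,
                                      struct ac_export_args *args)
{
   ac_export_mrt_z(ctx, depth, NULL, NULL, NULL, true, args);
}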
3594 
3595 static LLVMTypeRef arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx)
3596 {
3597    LLVMTypeRef base;
3598    switch (type) {
3599       case AC_ARG_FLOAT:
3600          return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size);
3601       case AC_ARG_INT:
3602          return size == 1 ? ctx->i32 : LLVMVectorType(ctx->i32, size);
3603       case AC_ARG_CONST_PTR:
3604          base = ctx->i8;
3605          break;
3606       case AC_ARG_CONST_FLOAT_PTR:
3607          base = ctx->f32;
3608          break;
3609       case AC_ARG_CONST_PTR_PTR:
3610          base = ac_array_in_const32_addr_space(ctx->i8);
3611          break;
3612       case AC_ARG_CONST_DESC_PTR:
3613          base = ctx->v4i32;
3614          break;
3615       case AC_ARG_CONST_IMAGE_PTR:
3616          base = ctx->v8i32;
3617          break;
3618       default:
3619          assert(false);
3620          return NULL;
3621    }
3622 
3623    assert(base);
3624    if (size == 1) {
3625       return ac_array_in_const32_addr_space(base);
3626    } else {
3627       assert(size == 2);
3628       return ac_array_in_const_addr_space(base);
3629    }
3630 }
3631 
3632 struct ac_llvm_pointer ac_build_main(const struct ac_shader_args *args, struct ac_llvm_context *ctx,
3633                            enum ac_llvm_calling_convention convention, const char *name,
3634                            LLVMTypeRef ret_type, LLVMModuleRef module)
3635 {
3636    LLVMTypeRef arg_types[AC_MAX_ARGS];
3637    enum ac_arg_regfile arg_regfiles[AC_MAX_ARGS];
3638 
3639    /* ring_offsets doesn't have a corresponding function parameter because LLVM can allocate it
3640     * itself for scratch memory purposes and gives us access through llvm.amdgcn.implicit.buffer.ptr
3641     */
3642    unsigned arg_count = 0;
3643    for (unsigned i = 0; i < args->arg_count; i++) {
3644       if (args->ring_offsets.used && i == args->ring_offsets.arg_index) {
3645          ctx->ring_offsets_index = i;
3646          continue;
3647       }
3648       arg_regfiles[arg_count] = args->args[i].file;
3649       arg_types[arg_count++] = arg_llvm_type(args->args[i].type, args->args[i].size, ctx);
3650    }
3651 
3652    LLVMTypeRef main_function_type = LLVMFunctionType(ret_type, arg_types, arg_count, 0);
3653 
3654    LLVMValueRef main_function = LLVMAddFunction(module, name, main_function_type);
3655    LLVMBasicBlockRef main_function_body =
3656       LLVMAppendBasicBlockInContext(ctx->context, main_function, "main_body");
3657    LLVMPositionBuilderAtEnd(ctx->builder, main_function_body);
3658 
3659    LLVMSetFunctionCallConv(main_function, convention);
3660    for (unsigned i = 0; i < arg_count; ++i) {
3661       LLVMValueRef P = LLVMGetParam(main_function, i);
3662 
3663       if (arg_regfiles[i] != AC_ARG_SGPR)
3664          continue;
3665 
3666       ac_add_function_attr(ctx->context, main_function, i + 1, "inreg");
3667 
3668       if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
3669          ac_add_function_attr(ctx->context, main_function, i + 1, "noalias");
3670          ac_add_attr_dereferenceable(P, UINT64_MAX);
3671          ac_add_attr_alignment(P, 4);
3672       }
3673    }
3674 
3675    if (args->ring_offsets.used) {
3676       ctx->ring_offsets =
3677          ac_build_intrinsic(ctx, "llvm.amdgcn.implicit.buffer.ptr",
3678                             LLVMPointerType(ctx->i8, AC_ADDR_SPACE_CONST), NULL, 0, 0);
3679       ctx->ring_offsets = LLVMBuildBitCast(ctx->builder, ctx->ring_offsets,
3680                                            ac_array_in_const_addr_space(ctx->v4i32), "");
3681    }
3682 
3683    ctx->main_function = (struct ac_llvm_pointer) {
3684       .value = main_function,
3685       .pointee_type = main_function_type
3686    };
3687 
3688    /* Enable denormals for FP16 and FP64: */
3689    LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math", "ieee,ieee");
3690    /* Disable denormals for FP32: */
3691    LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math-f32",
3692                                       "preserve-sign,preserve-sign");
3693 
3694    if (convention == AC_LLVM_AMDGPU_PS) {
3695       LLVMAddTargetDependentFunctionAttr(main_function, "amdgpu-depth-export",
3696                                          ctx->exports_mrtz ? "1" : "0");
3697       LLVMAddTargetDependentFunctionAttr(main_function, "amdgpu-color-export",
3698                                          ctx->exports_color_null ? "1" : "0");
3699    }
3700 
3701    return ctx->main_function;
3702 }
3703 
3704 LLVMValueRef ac_build_is_inf_or_nan(struct ac_llvm_context *ctx, LLVMValueRef a)
3705 {
3706    LLVMValueRef args[2] = {
3707       a,
3708       LLVMConstInt(ctx->i32, S_NAN | Q_NAN | N_INFINITY | P_INFINITY, 0),
3709    };
3710    return ac_build_intrinsic(ctx, "llvm.amdgcn.class.f32", ctx->i1, args, 2, 0);
3711 }
3712